aboutsummaryrefslogtreecommitdiff
path: root/bridges/FolhaDeSaoPauloBridge.php
blob: 27713a41b45bea4f53111df627e008af39457367 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
<?php

class FolhaDeSaoPauloBridge extends FeedExpander
{
    const MAINTAINER = 'somini';
    const NAME = 'Folha de São Paulo';
    const URI = 'https://www1.folha.uol.com.br';
    const DESCRIPTION = 'Returns the newest posts from Folha de São Paulo (full text)';
    const PARAMETERS = [
        [
            'feed' => [
                'name' => 'Feed sub-URL',
                'type' => 'text',
                'required' => true,
                'title' => 'Select the sub-feed (see https://www1.folha.uol.com.br/feed/)',
                'exampleValue' => 'emcimadahora/rss091.xml',
            ],
            'amount' => [
                'name' => 'Amount of items to fetch',
                'type' => 'number',
                'defaultValue' => 15,
            ],
            'deep_crawl' => [
                'name' => 'Deep Crawl',
                'description' => 'Crawl each item "deeply", that is, return the article contents',
                'type' => 'checkbox',
                'defaultValue' => true,
            ],
        ]
    ];

    protected function parseItem(array $item)
    {
        if ($this->getInput('deep_crawl')) {
            $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
            if ($articleHTMLContent) {
                foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) {
                    $toRemove->innertext = '';
                }
                $item_content = $articleHTMLContent->find('div.c-news__body', 0);
                if ($item_content) {
                    $text = $item_content->innertext;
                    $text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em><ul><li>');
                    $item['content'] = $text;
                    $item['uri'] = explode('*', $item['uri'])[1];
                }
            }
        } else {
            $item['uri'] = explode('*', $item['uri'])[1];
        }

        return $item;
    }

    public function collectData()
    {
        $feed_input = $this->getInput('feed');
        if (substr($feed_input, 0, strlen(self::URI)) === self::URI) {
            $feed_url = $feed_input;
        } else {
            /* TODO: prepend `/` if missing */
            $feed_url = self::URI . '/' . $this->getInput('feed');
        }
        $limit = $this->getInput('amount');
        $this->collectExpandableDatas($feed_url, $limit);
    }
}