aboutsummaryrefslogtreecommitdiff
path: root/bridges/TheHackerNewsBridge.php
blob: dfe07543fa54dad1da97a18a6b6f249898568f12 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
<?php

/**
 * Bridge for The Hacker News (https://thehackernews.com/).
 *
 * Reads the post listing at the bridge URI and, for each of the first few
 * posts, requests the article page through getSimpleHTMLDOMCached() to obtain
 * the article body and, when available, a timestamp that includes the time.
 */
class TheHackerNewsBridge extends BridgeAbstract
{
    const MAINTAINER = 'ORelio';
    const NAME = 'The Hacker News Bridge';
    const URI = 'https://thehackernews.com/';
    const DESCRIPTION = 'Cyber Security, Hacking, Technology News.';

    /**
     * Collects at most five items from the listing page into $this->items.
     *
     * Each item carries: uri, title, author, enclosures (zero or one
     * thumbnail URL), timestamp and content. Posts without a story link are
     * skipped because there is nothing to point the item at.
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());
        $max_items = 5;
        $count = 0;

        foreach ($html->find('div.body-post') as $element) {
            // Stop as soon as enough items were collected instead of
            // walking through the remaining posts for nothing.
            if ($count >= $max_items) {
                break;
            }

            $link = $element->find('a.story-link', 0);
            if (!is_object($link)) {
                continue;
            }
            $article_url = $link->href;

            $title_node = $element->find('h2.home-title', 0);
            $article_title = is_object($title_node) ? $title_node->plaintext : '';

            // The author name sits next to the user icon; the icon itself
            // shows up in the plain text as the '&#59396;' entity
            // (presumably an icon-font glyph), so strip it before trimming.
            $article_author = '';
            $author_icon = $element->find('i.icon-user', 0);
            if (is_object($author_icon)) {
                $article_author = trim(
                    str_replace('&#59396;', '', $author_icon->parent()->plaintext)
                );
            }

            //Date without time
            // false mirrors what strtotime() yields when nothing parseable is found.
            $article_timestamp = false;
            $calendar_icon = $element->find('i.icon-calendar', 0);
            if (is_object($calendar_icon)) {
                $date_text = extractFromDelimiters(
                    $calendar_icon->parent()->outertext,
                    '</i>',
                    '<span>'
                );
                if (is_string($date_text)) {
                    $article_timestamp = strtotime($date_text);
                }
            }

            //Article thumbnail in lazy-loading image
            // Read the parsed attribute rather than slicing the raw markup,
            // so the quoting style used by the page does not matter.
            $article_thumbnail = [];
            $image = $element->find('img[data-echo]', 0);
            if (is_object($image)) {
                $thumbnail_url = $image->getAttribute('data-echo');
                if (is_string($thumbnail_url) && $thumbnail_url !== '') {
                    $article_thumbnail = [$thumbnail_url];
                }
            }

            if ($article = getSimpleHTMLDOMCached($article_url)) {
                //Article body
                $body = $article->find('div.articlebody', 0);
                if (is_object($body)) {
                    $contents = $body->innertext;
                    $contents = stripRecursiveHtmlSection($contents, 'div', '<div class="ad_');
                    $contents = stripWithDelimiters($contents, 'id="google_ads', '</iframe>');
                    $contents = stripWithDelimiters($contents, '<script', '</script>');
                } else {
                    $contents = 'Could not find article body on TheHackerNews: ' . $article_url;
                }

                //Date with time
                // Only override the listing date when the article page
                // yields something strtotime() can actually parse.
                $modified = $article->find('meta[itemprop=dateModified]', 0);
                if (is_object($modified)) {
                    $modified_value = $modified->getAttribute('content');
                    if (is_string($modified_value) && $modified_value !== '') {
                        $precise_timestamp = strtotime($modified_value);
                        if ($precise_timestamp !== false) {
                            $article_timestamp = $precise_timestamp;
                        }
                    }
                }
            } else {
                $contents = 'Could not request TheHackerNews: ' . $article_url;
            }

            $this->items[] = [
                'uri' => $article_url,
                'title' => $article_title,
                'author' => $article_author,
                'enclosures' => $article_thumbnail,
                'timestamp' => $article_timestamp,
                'content' => trim($contents),
            ];
            $count++;
        }
    }
}