aboutsummaryrefslogtreecommitdiff
path: root/bridges/TheHackerNewsBridge.php
blob: 1f9c34c059b35d8ef68530cc905566648794bd24 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
<?php

class TheHackerNewsBridge extends BridgeAbstract
{
    const MAINTAINER = 'ORelio';
    const NAME = 'The Hacker News Bridge';
    const URI = 'https://thehackernews.com/';
    const DESCRIPTION = 'Cyber Security, Hacking, Technology News.';

    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());
        $html = convertLazyLoading($html);
        $html = defaultLinkTo($html, $this->getURI());
        $limit = 0;

        foreach ($html->find('div.body-post') as $element) {
            if ($limit >= 5) {
                break;
            }

            // Author (not present on home page)
            $article_author = null;

            // Title
            $article_title = $element->find('h2.home-title', 0)->plaintext;

            // Date
            $article_timestamp = time();
            $calendar = $element->find('i.icon-calendar', 0);
            if ($calendar) {
                $article_timestamp = strtotime(
                    extractFromDelimiters(
                        $calendar->parent()->outertext,
                        '</i>',
                        '</span>'
                    )
                );
            }

            // Thumbnail
            $article_thumbnail = [];
            if (is_object($element->find('img', 0))) {
                $article_thumbnail = [ $element->find('img', 0)->src ];
            }

            // Content (truncated)
            $article_content = $element->find('div.home-desc', 0)->plaintext;

            // Now try expanding article
            $article_url = $element->find('a.story-link', 0)->href;
            $article_html = getSimpleHTMLDOMCached($article_url);
            if ($article_html) {
                // Content (expanded and cleaned)
                $article_body = $article_html->find('div.articlebody', 0);
                if ($article_body) {
                    $article_body = convertLazyLoading($article_body);
                    $article_body = defaultLinkTo($article_body, $article_url);
                    $header_img = $article_body->find('img', 0);
                    if ($header_img) {
                        $header_img->parent->style = '';
                    }
                    foreach ($article_body->find('center.cf') as $center_ad) {
                        $center_ad->outertext = '';
                    }
                    $article_content = $article_body->innertext;
                }
                // Author
                $spans_author = $article_html->find('span.author');
                if (count($spans_author) > 0) {
                    $article_author = $spans_author[array_key_last($spans_author)]->plaintext;
                }
            }

            $item = [];
            $item['uri'] = $article_url;
            $item['title'] = $article_title;
            if (!empty($article_author)) {
                $item['author'] = $article_author;
            }
            $item['enclosures'] = $article_thumbnail;
            $item['timestamp'] = $article_timestamp;
            $item['content'] = trim($article_content);
            $this->items[] = $item;
            $limit++;
        }
    }
}