aboutsummaryrefslogtreecommitdiff
path: root/bridges/RobinhoodSnacksBridge.php
blob: 1c8f729152561a957f774d63e87f80d1d2fa84d2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
<?php

/**
 * Bridge that turns the Robinhood Snacks newsletter index into feed items.
 *
 * The index page is scraped for article links; each linked article is then
 * fetched (through the cached loader) and reduced to its body markup.
 */
class RobinhoodSnacksBridge extends BridgeAbstract
{
    const MAINTAINER = 'johnpc';
    const NAME = 'Robinhood Snacks Newsletter';
    const URI = 'https://snacks.robinhood.com/newsletters/';
    const CACHE_TIMEOUT = 86400; // 24h
    const DESCRIPTION = 'Returns newsletters from Robinhood Snacks';

    // Work around 403 by pretending to be a legit browser
    const FAKE_HEADERS = [
        'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language: es-ES,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding: gzip, deflate, br',
        'Connection: keep-alive',
        'Upgrade-Insecure-Requests: 1',
        'Sec-Fetch-Dest: document',
        'Sec-Fetch-Mode: navigate',
        'Sec-Fetch-Site: none',
        'Sec-Fetch-User: ?1',
        'Pragma: no-cache',
        'Cache-Control: no-cache',
        'TE: trailers'
    ];

    /**
     * Scrape the newsletter index and append one item per article link.
     *
     * Links whose markup does not match the expected card layout are skipped
     * instead of aborting the whole feed.
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM(self::URI, self::FAKE_HEADERS);
        $html = defaultLinkTo($html, $this->getURI());

        $elements = $html->find('#__next > div > div > div > div > a');

        foreach ($elements as $element) {
            // This anchor is the pagination link, not an article card
            if ($element->href === 'https://snacks.robinhood.com/newsletters/page/2/') {
                continue;
            }

            $content = $element->find('div > div', 2);
            if (!$content) {
                // Card layout not recognised: nothing to extract from this link
                continue;
            }

            // Remove element that is not parsed (span with weekly tag)
            foreach ($content->find('span') as $found) {
                $found->outertext = '';
            }

            $title_node = $content->find('div', 0);
            if (!$title_node) {
                continue;
            }
            $date_node = $content->find('div', 1);

            $uri = $element->href;

            $item = [
                'uri' => $uri,
                'title' => $title_node->innertext,
                'content' => $this->getArticleContent($uri)
            ];

            // Only set a timestamp when the date text was found and parsed
            $timestamp = $date_node ? strtotime($date_node->innertext) : false;
            if ($timestamp !== false) {
                $item['timestamp'] = $timestamp;
            }

            $this->items[] = $item;
        }
    }

    /**
     * Fetch an article page and return its cleaned-up body markup.
     *
     * @param string $uri Absolute URL of the article
     * @return string Inner HTML of the article body, or an empty string when
     *                the page could not be loaded or its layout is not recognised
     */
    private function getArticleContent($uri)
    {
        $article_html = getSimpleHTMLDOMCached($uri, self::CACHE_TIMEOUT, self::FAKE_HEADERS);
        if (!$article_html) {
            return '';
        }

        $content = $article_html->find('#__next > div > div > div > span', 0);
        if (!$content) {
            return '';
        }

        // Drop the first div, the first h1 and the second img of the article.
        // The lookups are done one after another (not up front) so each one
        // sees the tree as left by the previous removal.
        foreach ([['div', 0], ['h1', 0], ['img', 1]] as $target) {
            $node = $content->find($target[0], $target[1]);
            if ($node) {
                $content->removeChild($node);
            }
        }

        // Remove elements that are not part of article content
        foreach ($content->find('style') as $found) {
            $found->outertext = '';
        }

        // Images cleanup
        $already_displayed_pictures = [];
        foreach ($content->find('img') as $found) {
            $src = (string) $found->src;

            // Skip loader images
            if (str_contains($src, 'data:image/gif;base64')) {
                $found->outertext = '';
                continue;
            }

            // If relative img, fix path. This happens before the duplicate
            // check so the value compared is the same one that gets recorded.
            $is_relative = str_starts_with($src, '/_next');
            if ($is_relative) {
                $src = 'https://snacks.robinhood.com' . $src;
            }

            // Skip multiple images with same src
            // and remove duplicated image description
            if (in_array($src, $already_displayed_pictures, true)) {
                $this->removeDuplicatedPicture($found);
                continue;
            }

            // Remove srcset attribute
            $found->removeAttribute('srcset');

            if ($is_relative) {
                $found->setAttribute('src', $src);
            }

            $already_displayed_pictures[] = $src;
        }

        $content_text = $content->innertext;

        // Remove noscript tags (opening and closing) to display images
        $content_text = str_replace(['<noscript>', '</noscript>'], '', $content_text);

        return $content_text;
    }

    /**
     * Blank out a repeated picture together with its wrapper and description.
     *
     * The wrapper is the image's third ancestor and the description is the
     * second sibling after that wrapper. When the image is not nested that
     * deeply, only the image itself is blanked.
     *
     * @param object $image The duplicated img node
     */
    private function removeDuplicatedPicture($image)
    {
        $wrapper = $image;
        for ($level = 0; $level < 3 && $wrapper; $level++) {
            $wrapper = $wrapper->parent;
        }

        if (!$wrapper) {
            $image->outertext = '';
            return;
        }

        $wrapper->outertext = '';

        $next = $wrapper->nextSibling();
        $description = $next ? $next->nextSibling() : null;
        if ($description) {
            $description->outertext = '';
        }
    }
}