aboutsummaryrefslogtreecommitdiff
path: root/bridges/ArsTechnicaBridge.php
blob: ac722dc91f06a4cb212190c3f97aa5493a2c92ef (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
<?php

/**
 * Bridge for Ars Technica.
 *
 * Reads one of the site's section feeds and replaces each feed item's content
 * with markup scraped from the article page itself (lead paragraph, intro
 * image, post body), after stripping interludes, tables of contents, iframes
 * and fixed aspect-ratio styling.
 */
class ArsTechnicaBridge extends FeedExpander
{
    const MAINTAINER = 'phantop';
    const NAME = 'Ars Technica';
    const URI = 'https://arstechnica.com/';
    const DESCRIPTION = 'Returns the latest articles from Ars Technica';
    const PARAMETERS = [[
        'section' => [
            'name' => 'Site section',
            'type' => 'list',
            'defaultValue' => 'index',
            'values' => [
                'All' => 'index',
                'Apple' => 'apple',
                'Board Games' => 'cardboard',
                'Cars' => 'cars',
                'Features' => 'features',
                'Gaming' => 'gaming',
                'Information Technology' => 'technology-lab',
                'Science' => 'science',
                'Staff Blogs' => 'staff-blogs',
                'Tech Policy' => 'tech-policy',
                'Tech' => 'gadgets',
            ],
        ],
    ]];

    /**
     * Builds the feed URL for the selected section and passes it to the
     * feed expander.
     *
     * The second argument (10) is presumably the maximum number of items to
     * expand -- confirm against FeedExpander::collectExpandableDatas().
     */
    public function collectData()
    {
        $url = 'https://feeds.arstechnica.com/arstechnica/' . $this->getInput('section');
        $this->collectExpandableDatas($url, 10);
    }

    /**
     * Enriches a feed item with content and metadata scraped from the
     * article page at $item['uri'].
     *
     * Whatever cannot be extracted from the page is left as it was in the
     * incoming item, so a missing header, an empty body or absent metadata
     * no longer aborts processing of the item.
     *
     * @param array $item Feed item; must contain the article URL under 'uri'.
     * @return array The item with 'content', 'categories' and 'uid'
     *               overridden where the page provided them.
     */
    protected function parseItem(array $item)
    {
        $item_html = getSimpleHTMLDOMCached($item['uri']);
        if (!is_object($item_html)) {
            // No parsed document to work with: keep the item as received.
            return $item;
        }
        $item_html = defaultLinkTo($item_html, self::URI);

        // str_get_html() does not return a DOM object for input it rejects
        // (simple_html_dom documents this for an empty string), so the result
        // is only used when it really is a document.
        $article = str_get_html($this->buildArticleMarkup($item_html));
        if (is_object($article)) {
            $this->cleanArticle($article);
            $item['content'] = backgroundToImg($article);
        }

        $metadata = $this->readParselyMetadata($item_html);
        if (isset($metadata['tags']) && is_array($metadata['tags'])) {
            $item['categories'] = $metadata['tags'];
        }
        if (isset($metadata['post_id'])) {
            // Only replace the uid when the page supplies one; an empty
            // string would otherwise clobber the existing identifier.
            $item['uid'] = strval($metadata['post_id']);
        }

        return $item;
    }

    /**
     * Concatenates the lead paragraph, intro image (with caption) and all
     * '.post-content' fragments of the article page into one HTML string.
     *
     * @param object $item_html Parsed article page.
     * @return string Raw article markup; empty when nothing matched.
     */
    private function buildArticleMarkup($item_html)
    {
        $markup = '';

        $header = $item_html->find('article header', 0);
        if ($header !== null) {
            $leading = $header->find('p[class*=leading]', 0);
            if ($leading !== null) {
                $markup .= '<p>' . $leading->innertext . '</p>';
            }

            $intro_image = $header->find('img.intro-image', 0);
            if ($intro_image !== null) {
                $markup .= '<figure>' . $intro_image;

                $image_caption = $header->find('.caption .caption-content', 0);
                if ($image_caption !== null) {
                    $markup .= '<figcaption>' . $image_caption->innertext . '</figcaption>';
                }
                $markup .= '</figure>';
            }
        }

        foreach ($item_html->find('.post-content') as $content_tag) {
            $markup .= $content_tag->innertext;
        }

        return $markup;
    }

    /**
     * Decodes the JSON stored in the page's 'parsely-page' meta element.
     *
     * @param object $item_html Parsed article page.
     * @return array Decoded metadata, or an empty array when the element is
     *               missing or its content is not a JSON object/array.
     */
    private function readParselyMetadata($item_html)
    {
        $parsely = $item_html->find('[name="parsely-page"]', 0);
        if ($parsely === null) {
            return [];
        }

        $decoded = json_decode(html_entity_decode((string) $parsely->content), true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Normalises the assembled article document in place: flattens and
     * rebuilds lightbox galleries, removes interludes and tables of
     * contents, turns iframes into links and drops aspect-ratio styling.
     *
     * @param object $article Document produced from the article markup.
     * @return void
     */
    private function cleanArticle($article)
    {
        // Some lightboxes are nested in figures. I'd guess that's a
        // bug in the website
        foreach ($article->find('figure div div.ars-lightbox') as $weird_lightbox) {
            // NOTE(review): the node itself (not a rendered string) is
            // assigned on purpose -- presumably it is serialised only when
            // the document is output, so it still reflects the lightbox
            // rewrite below. Confirm before changing.
            $weird_lightbox->parent->parent->outertext = $weird_lightbox;
        }

        // It's easier to reconstruct the whole thing than remove
        // duplicate reactive tags
        foreach ($article->find('.ars-lightbox') as $lightbox) {
            $lightbox->innertext = $this->rebuildLightbox($lightbox);
        }

        // remove various ars advertising
        foreach ($article->find('.ars-interlude-container') as $ad) {
            $ad->remove();
        }
        foreach ($article->find('.toc-container') as $toc) {
            $toc->remove();
        }

        // Mostly YouTube videos
        foreach ($article->find('iframe') as $iframe) {
            $iframe->outertext = '<a href="' . $iframe->src . '">' . $iframe->src . '</a>';
        }
        // This fixed padding around the former iframes and actual inline videos
        foreach ($article->find('div[style*=aspect-ratio]') as $styled) {
            $styled->removeAttribute('style');
        }
    }

    /**
     * Renders a lightbox gallery as a plain sequence of <figure> elements,
     * one per gallery item that contains an image.
     *
     * @param object $lightbox The '.ars-lightbox' node.
     * @return string Replacement inner HTML for the lightbox.
     */
    private function rebuildLightbox($lightbox)
    {
        $figures = '';

        foreach ($lightbox->find('.ars-lightbox-item') as $lightbox_item) {
            $img = $lightbox_item->find('img', 0);
            if ($img === null) {
                continue;
            }

            $figures .= '<figure>' . $img;

            $caption = $lightbox_item->find('div.pswp-caption-content', 0);
            if ($caption !== null) {
                // Prefix the credit before the caption is read, so the label
                // shows up in the caption text when the credit sits inside it.
                $credit = $lightbox_item->find('div.ars-gallery-caption-credit', 0);
                if ($credit !== null) {
                    $credit->innertext = 'Credit: ' . $credit->innertext;
                }
                $figures .= '<figcaption>' . $caption->innertext . '</figcaption>';
            }

            $figures .= '</figure>';
        }

        return $figures;
    }
}