aboutsummaryrefslogtreecommitdiff
path: root/bridges/ZeitBridge.php
blob: d4d66a1cb43d2692faebd94ae3008676df39af50 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
<?php

/**
 * Bridge for the Zeit Online news feeds.
 *
 * Takes one of the newsfeed.zeit.de feeds and, for a limited number of items,
 * downloads the linked article page to replace the feed item's content with
 * the article text, images and author names found on that page.
 */
class ZeitBridge extends FeedExpander
{
    const MAINTAINER = 'Mynacol';
    const NAME = 'Zeit Online Bridge';
    const URI = 'https://www.zeit.de/';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = [[
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Startseite'
                => 'https://newsfeed.zeit.de/index',
                'Politik'
                => 'https://newsfeed.zeit.de/politik/index',
                'Wirtschaft'
                => 'https://newsfeed.zeit.de/wirtschaft/index',
                'Gesellschaft'
                => 'https://newsfeed.zeit.de/gesellschaft/index',
                'Kultur'
                => 'https://newsfeed.zeit.de/kultur/index',
                'Wissen'
                => 'https://newsfeed.zeit.de/wissen/index',
                'Digital'
                => 'https://newsfeed.zeit.de/digital/index',
                'ZEIT Campus ONLINE'
                => 'https://newsfeed.zeit.de/campus/index',
                'ZEIT ONLINE Arbeit'
                => 'https://newsfeed.zeit.de/arbeit/index',
                'ZEIT Magazin ONLINE'
                => 'https://newsfeed.zeit.de/zeit-magazin/index',
                'Entdecken'
                => 'https://newsfeed.zeit.de/entdecken/index',
                'Mobilität'
                => 'https://newsfeed.zeit.de/mobilitaet/index',
                'Sport'
                => 'https://newsfeed.zeit.de/sport/index',
                'Alle Inhalte'
                => 'https://newsfeed.zeit.de/all'
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    /**
     * Reads the selected feed URL and the item limit from the bridge inputs
     * and passes both to collectExpandableDatas().
     *
     * A missing or zero limit falls back to 5, matching the declared default.
     */
    public function collectData()
    {
        $url = $this->getInput('category');
        $limit = $this->getInput('limit') ?: 5;

        $this->collectExpandableDatas($url, $limit);
    }

    /**
     * Enriches a single feed item with data scraped from its article page.
     *
     * If the page links to a "/komplettansicht" variant of itself, that
     * variant is fetched instead and becomes the item's URI.
     *
     * @param array $item Feed item; its 'uri' entry is the article URL to fetch.
     * @return array The item with 'enclosures' reset and, when the page could be
     *               parsed, 'author' and 'content' taken from the article.
     */
    protected function parseItem(array $item)
    {
        $item['enclosures'] = [];

        // The timestamp ends in a literal "Z" (UTC designator), so it has to be
        // formatted in UTC; date() would use the configured local timezone.
        // NOTE(review): the cookie presumably marks the consent dialog as answered - confirm.
        $headers = [
            'Cookie: zonconsent=' . gmdate('Y-m-d\TH:i:s.v\Z'),
        ];

        $article = getSimpleHTMLDOM($item['uri'], $headers);
        if (!$article) {
            // Nothing parseable came back; keep the item as delivered by the feed.
            return $item;
        }

        // one-page article
        if ($article->find('a[href="' . $item['uri'] . '/komplettansicht"]', 0)) {
            $item['uri'] .= '/komplettansicht';
            $article = getSimpleHTMLDOM($item['uri'], $headers);
            if (!$article) {
                return $item;
            }
        }

        $article = defaultLinkTo($article, $item['uri']);

        return $this->parseArticle($item, $article);
    }

    /**
     * Extracts enclosures, authors and the HTML content from an article DOM.
     *
     * @param array $item    Feed item to enrich; must already contain an 'enclosures' list.
     * @param mixed $article Parsed article page (object offering find()).
     * @return array The enriched item. It is returned unchanged when the page has
     *               no <main> element or the cleaned markup cannot be re-parsed.
     */
    private function parseArticle($item, $article)
    {
        $main = $article->find('main', 0);
        if (!$main) {
            // find() with an index yields null when nothing matches.
            return $item;
        }

        // remove known bad elements
        foreach (
            $main->find(
                'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge,
                .article-heading__container--podcast, .podcast-player__image, div[data-paywall],
                .js-embed-consent, script, nav, .article-flexible-toc__subheading-link, .faq-link'
            ) as $bad
        ) {
            $bad->remove();
        }
        // reload html, as remove() is buggy
        $article = str_get_html($main->outertext);
        if (!$article) {
            // str_get_html() signals failure with false.
            return $item;
        }

        // podcast audio, if available
        $podcast_src = $article->find('.article-heading__podcast audio[src]', 0);
        if ($podcast_src) {
            $item['enclosures'][] = $podcast_src->src;
        }

        // full res images: promote the lazy-load URL to src and expose it as enclosure
        foreach ($article->find('img[data-src]') as $img) {
            $img->src = $img->getAttribute('data-src');
            $item['enclosures'][] = $img->src;
        }

        // authors
        // find() without an index returns an array (empty when nothing matches),
        // never null, so the fallback needs ?: rather than ??.
        $authors = $article->find('*[itemtype*="schema.org/Person"]') ?: $article->find('.metadata__source');
        if ($authors) {
            $item['author'] = implode(', ', array_map(function ($e) {
                return trim($e->plaintext);
            }, $authors));
        }

        $item['content'] = '';

        // summary
        $summary = $article->find('.summary');
        if ($summary) {
            $item['content'] .= implode('', $summary);
        }

        // header image: first matching container wins (indexed find() returns null on no match)
        $headerimg = $article->find('*[data-ct-row="headerimage"]', 0)
            ?? $article->find('.article-header', 0)
            ?? $article->find('header', 0);
        if ($headerimg) {
            $item['content'] .= implode('', $headerimg->find('img[src], figcaption'));
        }

        // article content: concatenate the relevant elements of every page in document order
        foreach ($article->find('.article-page') as $page) {
            $elements = $page->find('p, ul, ol, h2, figure.article__media img[src], figure.article__media figcaption, figure.quote');
            $item['content'] .= implode('', $elements);
        }

        return $item;
    }
}