aboutsummaryrefslogtreecommitdiff
path: root/bridges/SeznamZpravyBridge.php
blob: f052ed1caf0ede764bc20790240b4897edf8389b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
<?php

/**
 * Bridge that builds a feed of the newest Seznam Zprávy articles written by
 * a single author.
 *
 * The author's listing page is scraped for article links and publication
 * times; every linked article page is then fetched to obtain the body text,
 * the header image and the breadcrumb categories.
 */
class SeznamZpravyBridge extends BridgeAbstract
{
    const NAME = 'Seznam Zprávy Bridge';
    const URI = 'https://seznamzpravy.cz';
    const DESCRIPTION = 'Returns newest stories from Seznam Zprávy';
    const MAINTAINER = 'thezeroalpha';
    const PARAMETERS = [
        'By Author' => [
            'author' => [
                'name' => 'Author String',
                'type' => 'text',
                'required' => true,
                'title' => 'The dash-separated author string, as shown in the URL bar.',
                'pattern' => '[a-z]+-[a-z]+-[0-9]+',
                'exampleValue' => 'radek-nohl-1'
            ],
        ]
    ];

    /**
     * Feed title built from the author name found on the listing page.
     * Stays unset until collectData() has resolved the author.
     *
     * @var string|null
     */
    private $feedName;

    /**
     * Returns the author-specific feed name once it is known, otherwise the
     * name provided by the parent class.
     *
     * @return string
     */
    public function getName()
    {
        if (isset($this->feedName)) {
            return $this->feedName;
        }
        return parent::getName();
    }

    /**
     * Scrapes the author's listing page and each listed article, appending
     * one feed item per article to $this->items.
     *
     * Any required element that cannot be found aborts the whole run through
     * returnServerError().
     */
    public function collectData()
    {
        // Number of seconds in one day; passed to getSimpleHTMLDOMCached() for every fetched page.
        $ONE_DAY = 86400;
        switch ($this->queriedContext) {
            case 'By Author':
                $url = 'https://www.seznamzpravy.cz/autor/';
                $selectors = [
                'breadcrumbs' => 'div[data-dot=ogm-breadcrumb-navigation]',
                'articleList' => 'ul.ogm-document-timeline-page li article[data-dot=mol-timeline-item]',
                'articleTitle' => 'a[data-dot=mol-article-card-title]',
                'articleDM' => 'span.mol-formatted-date__date',
                'articleTime' => 'span.mol-formatted-date__time',
                'articleContent' => 'div[data-dot=ogm-article-content]',
                'articleImage' => 'div[data-dot=ogm-main-media] img',
                'articleParagraphs' => 'div[data-dot=mol-paragraph]'
                ];

                $html = getSimpleHTMLDOMCached($url . $this->getInput('author'), $ONE_DAY);
                $mainBreadcrumbs = $html->find($selectors['breadcrumbs'], 0)
                or returnServerError('Could not get breadcrumbs for: ' . $this->getURI());

                // The last breadcrumb on the listing page is used as the author's display name
                $author = $mainBreadcrumbs->last_child()->plaintext
                or returnServerError('Could not get author for: ' . $this->getURI());

                $this->feedName = $author . ' - Seznam Zprávy';

                $articles = $html->find($selectors['articleList'])
                or returnServerError('Could not find articles for: ' . $this->getURI());

                foreach ($articles as $article) {
                    // Get article URL
                    $titleLink = $article->find($selectors['articleTitle'], 0)
                    or returnServerError('Could not find title for: ' . $this->getURI());
                    $articleURL = $titleLink->href;

                    $articleContentHTML = getSimpleHTMLDOMCached($articleURL, $ONE_DAY);

                    // Article header image (optional)
                    $articleImageElem = $articleContentHTML->find($selectors['articleImage'], 0);

                    // Article text content
                    $contentElem = $articleContentHTML->find($selectors['articleContent'], 0)
                    or returnServerError('Could not get article content for: ' . $articleURL);
                    $contentParagraphs = $contentElem->find($selectors['articleParagraphs'])
                    or returnServerError('Could not find paragraphs for: ' . $articleURL);

                    // If the article has an image, put that image at the start
                    $contentInitialValue = isset($articleImageElem) ? $articleImageElem->outertext : '';
                    $contentText = array_reduce($contentParagraphs, function ($s, $elem) {
                        return $s . $elem->innertext;
                    }, $contentInitialValue);

                    // Article categories: every breadcrumb except the last one
                    // (presumably the last entry is the article itself - confirm against the site markup)
                    $breadcrumbsElem = $articleContentHTML->find($selectors['breadcrumbs'], 0)
                        or returnServerError('Could not find breadcrumbs for: ' . $articleURL);
                    $categories = array_map(function ($crumb) {
                        return trim($crumb->plaintext);
                    }, array_slice($breadcrumbsElem->children(), 0, -1));

                    // Article date & time
                    $articleTimeElem = $article->find($selectors['articleTime'], 0)
                    or returnServerError('Could not find article time for: ' . $articleURL);
                    $articleTime = $articleTimeElem->plaintext;

                    $articleDMElem = $article->find($selectors['articleDM'], 0);
                    if (isset($articleDMElem)) {
                        $articleDMText = $articleDMElem->plaintext;
                    } else {
                        // If there is no date but only a time, the article was published today
                        $articleDMText = date('d.m.');
                    }
                    $articleDM = preg_replace('/[^0-9\.]/', '', $articleDMText);

                    // The listing is expected to print "d.m." without a year, so the current
                    // year is assumed. Should a full "d.m.Y" date ever appear, keep it as is
                    // instead of appending a second year and producing an unparsable string.
                    $hasYear = preg_match('/\.[0-9]{4}$/', $articleDM) === 1;
                    $articleDMY = $hasYear ? $articleDM : $articleDM . date('Y');
                    $timestamp = strtotime($articleDMY . ' ' . $articleTime);

                    // An assumed year that lands in the future means the article was published
                    // last year (e.g. a December article viewed in January). One day of slack
                    // absorbs timezone differences between this server and the site.
                    if (!$hasYear && $timestamp !== false && $timestamp > time() + $ONE_DAY) {
                        $timestamp = strtotime('-1 year', $timestamp);
                    }

                    // Add article to items, potentially with header image as enclosure
                    $item = [
                    'title' => $titleLink->plaintext,
                    'uri' => $titleLink->href,
                    'timestamp' => $timestamp,
                    'author' => $author,
                    'content' => $contentText,
                    'categories' => $categories
                    ];
                    if (isset($articleImageElem)) {
                        // Image sources are expected to be protocol-relative ("//host/path");
                        // leave an already absolute URL untouched instead of double-prefixing it.
                        $imageSrc = $articleImageElem->src;
                        if (!preg_match('#^https?://#i', $imageSrc)) {
                            $imageSrc = 'https:' . $imageSrc;
                        }
                        $item['enclosures'] = [$imageSrc];
                    }
                    $this->items[] = $item;
                }
                break;
        }
        // No append here: each item is added exactly once inside the loop above.
    }
}