aboutsummaryrefslogtreecommitdiff
path: root/bridges/XPathBridge.php
blob: 35ec6ad100152f3e89a603fbe1c6cf0eb41e0643 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
<?php

/**
 * Generic bridge that builds a feed from an arbitrary web page by evaluating
 * user-supplied XPath expressions.
 *
 * This class only declares the input form (PARAMETERS) and maps each form
 * field to the corresponding getter below. The getters are presumably hooks
 * consumed by XPathAbstract, which is not part of this file — confirm there
 * how the returned expressions are evaluated.
 */
class XPathBridge extends XPathAbstract
{
    const NAME = 'XPathBridge';
    const URI = 'https://github.com/rss-bridge/rss-bridge';
    const DESCRIPTION
        = 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
    const MAINTAINER = 'Niehztog';

    /**
     * Input form definition: a single unnamed context holding the page URL,
     * one XPath expression per feed item property and two checkboxes.
     *
     * The heredoc bodies start at column 0 on purpose: with a closing marker
     * at column 0 no indentation is stripped, so the tooltip texts stay
     * exactly as written.
     */
    const PARAMETERS = [
        '' => [

            // Page to fetch.
            'url' => [
                'name' => 'Enter web page URL',
                'title' => <<<"EOL"
You can specify any website URL which serves data suited for display in RSS feeds
(for example a news blog).
EOL,
                'type' => 'text',
                'exampleValue' => 'https://news.blizzard.com/en-en',
                'defaultValue' => 'https://news.blizzard.com/en-en',
                'required' => true,
            ],

            // Expression selecting the list of item nodes.
            'item' => [
                'name' => 'Item selector',
                'title' => <<<"EOL"
Enter an XPath expression matching a list of dom nodes, each node containing one
feed article item in total (usually a surrounding &lt;div&gt; or &lt;span&gt; tag). This will
be the context nodes for all of the following expressions. This expression usually
starts with a single forward slash.
EOL,
                'type' => 'text',
                'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
                'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
                'required' => true,
            ],

            'title' => [
                'name' => 'Item title selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article headline. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article item node.
EOL,
                'type' => 'text',
                'exampleValue' => './/div/div[2]/h2',
                'defaultValue' => './/div/div[2]/h2',
                'required' => true,
            ],

            'content' => [
                'name' => 'Item description selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article content or description. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of the
article item node.
EOL,
                'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
                'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
                'required' => false,
            ],

            'raw_content' => [
                'name' => 'Use raw item description',
                'title' => <<<"EOL"
Whether to use the raw item description or to replace certain characters with
special significance in HTML by HTML entities (using the PHP function htmlspecialchars).
EOL,
                'type' => 'checkbox',
                'defaultValue' => false,
                'required' => false,
            ],

            'uri' => [
                'name' => 'Item URL selector',
                'title' => <<<"EOL"
This expression should match a node's attribute containing the article URL
(usually the href attribute of an &lt;a&gt; tag). It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL,
                'type' => 'text',
                'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
                'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
                'required' => false,
            ],

            // Optional and without a default value.
            'author' => [
                'name' => 'Item author selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item
node containing the article author's name. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node.
EOL,
                'type' => 'text',
                'required' => false,
            ],

            'timestamp' => [
                'name' => 'Item date selector',
                'title' => <<<"EOL"
This expression should match a node or node's attribute containing the
article timestamp or date (parsable by PHP's strtotime function). It
should start with a dot followed by two forward slashes, referring to
any descendant nodes of the article item node. Attributes can be
selected by prepending an @ char before the attributes name.
EOL,
                'type' => 'text',
                'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
                'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
                'required' => false,
            ],

            'enclosures' => [
                'name' => 'Item image selector',
                'title' => <<<"EOL"
This expression should match a node's attribute containing an article
image URL (usually the src attribute of an &lt;img&gt; tag or a style
attribute). It should start with a dot followed by two forward slashes,
referring to any descendant nodes of the article item node. Attributes
can be selected by prepending an @ char before the attributes name.
EOL,
                'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
                'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
                'required' => false,
            ],

            'categories' => [
                'name' => 'Item category selector',
                'title' => <<<"EOL"
This expression should match a node or node's attribute contained
within each article item node containing the article category. This
could be inside &lt;div&gt; or &lt;span&gt; tags or sometimes be hidden
in a data attribute. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article
item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL,
                'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-label"]',
                'defaultValue' => './/div[@class="ArticleListItem-label"]',
                'required' => false,
            ],

            'fix_encoding' => [
                'name' => 'Fix encoding',
                'title' => <<<"EOL"
Check this to fix feed encoding by invoking PHP's utf8_decode
function on all extracted texts. Try this in case you see "broken" or
"weird" characters in your feed where you'd normally expect umlauts
or any other non-ascii characters.
EOL,
                'type' => 'checkbox',
                'required' => false,
            ],

        ],
    ];

    /**
     * Source Web page URL (should provide either HTML or XML content)
     *
     * The 'url' input is passed through encodeUri() before being returned.
     *
     * @return string
     */
    protected function getSourceUrl()
    {
        return $this->encodeUri($this->getInput('url'));
    }

    /**
     * XPath expression for extracting the feed items from the source page
     *
     * @return string URL-decoded value of the 'item' input
     */
    protected function getExpressionItem()
    {
        return $this->getDecodedExpression('item');
    }

    /**
     * XPath expression for extracting an item title from the item context
     *
     * @return string URL-decoded value of the 'title' input
     */
    protected function getExpressionItemTitle()
    {
        return $this->getDecodedExpression('title');
    }

    /**
     * XPath expression for extracting an item's content from the item context
     *
     * @return string URL-decoded value of the 'content' input
     */
    protected function getExpressionItemContent()
    {
        return $this->getDecodedExpression('content');
    }

    /**
     * Use raw item content
     *
     * @return bool State of the 'raw_content' checkbox
     */
    protected function getSettingUseRawItemContent(): bool
    {
        // Explicit cast: a null input would otherwise violate the bool return
        // type (null is never coerced to a scalar return type).
        // NOTE(review): assumes getInput() can yield null for an unticked
        // checkbox — confirm against the parent class.
        return (bool) $this->getInput('raw_content');
    }

    /**
     * XPath expression for extracting an item link from the item context
     *
     * @return string URL-decoded value of the 'uri' input
     */
    protected function getExpressionItemUri()
    {
        return $this->getDecodedExpression('uri');
    }

    /**
     * XPath expression for extracting an item author from the item context
     *
     * @return string URL-decoded value of the 'author' input
     */
    protected function getExpressionItemAuthor()
    {
        return $this->getDecodedExpression('author');
    }

    /**
     * XPath expression for extracting an item timestamp from the item context
     *
     * @return string URL-decoded value of the 'timestamp' input
     */
    protected function getExpressionItemTimestamp()
    {
        return $this->getDecodedExpression('timestamp');
    }

    /**
     * XPath expression for extracting item enclosures (media content like
     * images or movies) from the item context
     *
     * @return string URL-decoded value of the 'enclosures' input
     */
    protected function getExpressionItemEnclosures()
    {
        return $this->getDecodedExpression('enclosures');
    }

    /**
     * XPath expression for extracting an item category from the item context
     *
     * @return string URL-decoded value of the 'categories' input
     */
    protected function getExpressionItemCategories()
    {
        return $this->getDecodedExpression('categories');
    }

    /**
     * Fix encoding
     *
     * @return bool State of the 'fix_encoding' checkbox
     */
    protected function getSettingFixEncoding(): bool
    {
        // Same reasoning as getSettingUseRawItemContent(): 'fix_encoding'
        // declares no default value, so guard the bool return type with a cast.
        return (bool) $this->getInput('fix_encoding');
    }

    /**
     * Reads a form input and returns it URL-decoded.
     *
     * The value is cast to string before decoding so that an absent optional
     * input never reaches urldecode() as null (passing null to that parameter
     * is deprecated since PHP 8.1). The result for a null input stays ''.
     * NOTE(review): assumes getInput() returns null for inputs that were not
     * submitted — confirm against the parent class.
     *
     * @param string $name Key of the input as declared in PARAMETERS
     * @return string
     */
    private function getDecodedExpression(string $name): string
    {
        return urldecode((string) $this->getInput($name));
    }

    /**
     * Fixes URL encoding issues in input URL's
     *
     * If the whole URL arrived percent-encoded (recognised by an encoded
     * "http://" or "https://" prefix) it is decoded once. Afterwards every
     * pipe character is replaced by its percent-encoded form.
     *
     * @param string|null $uri
     * @return string
     */
    private function encodeUri($uri): string
    {
        // Normalise up front so the string functions below never receive null.
        $uri = (string) $uri;

        $isFullyEncoded = strpos($uri, 'https%3A%2F%2F') === 0
            || strpos($uri, 'http%3A%2F%2F') === 0;

        if ($isFullyEncoded) {
            $uri = urldecode($uri);
        }

        return str_replace('|', '%7C', $uri);
    }
}