1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
<?php
/**
* An extension of the previous SexactuBridge to cover the whole GQMagazine.
* This one taks a page (as an example sexe/news or journaliste/maia-mazaurette) which is to be configured,
* reads all the articles visible on that page, and make a stream out of it.
* @author nicolas-delsaux
*
*/
class GQMagazineBridge extends BridgeAbstract
{
const MAINTAINER = 'Riduidel';
const NAME = 'GQMagazine';
// URI is no more valid, since we can address the whole gq galaxy
const URI = 'https://www.gqmagazine.fr';
const CACHE_TIMEOUT = 7200; // 2h
const DESCRIPTION = 'GQMagazine section extractor bridge. This bridge allows you get only a specific section.';
const DEFAULT_DOMAIN = 'www.gqmagazine.fr';
const PARAMETERS = [ [
'domain' => [
'name' => 'Domain to use',
'required' => true,
'defaultValue' => self::DEFAULT_DOMAIN
],
'page' => [
'name' => 'Initial page to load',
'required' => true,
'exampleValue' => 'sexe/news'
],
'limit' => self::LIMIT,
]];
const REPLACED_ATTRIBUTES = [
'href' => 'href',
'src' => 'src',
'data-original' => 'src'
];
const POSSIBLE_TITLES = [
'h2',
'h3'
];
private function getDomain()
{
$domain = $this->getInput('domain');
if (empty($domain)) {
$domain = self::DEFAULT_DOMAIN;
}
if (strpos($domain, '://') === false) {
$domain = 'https://' . $domain;
}
return $domain;
}
public function getURI()
{
return $this->getDomain() . '/' . $this->getInput('page');
}
private function findTitleOf($link)
{
foreach (self::POSSIBLE_TITLES as $tag) {
$title = $link->parent()->find($tag, 0);
if ($title !== null) {
if ($title->plaintext !== null) {
return $title->plaintext;
}
}
}
}
public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI());
// Since GQ don't want simple class scrapping, let's do it the hard way and ... discover content !
$main = $html->find('main', 0);
$limit = $this->getInput('limit') ?? 10;
foreach ($main->find('a') as $link) {
if (count($this->items) >= $limit) {
break;
}
$uri = $link->href;
$date = $link->parent()->find('time', 0);
$item = [];
$author = $link->parent()->find('span[itemprop=name]', 0);
if ($author !== null) {
$item['author'] = $author->plaintext;
$item['title'] = $this->findTitleOf($link);
switch (substr($uri, 0, 1)) {
case 'h': // absolute uri
$item['uri'] = $uri;
break;
case '/': // domain relative uri
$item['uri'] = $this->getDomain() . $uri;
break;
default:
$item['uri'] = $this->getDomain() . '/' . $uri;
}
$article = $this->loadFullArticle($item['uri']);
if ($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
}
}
/**
* Loads the full article and returns the contents
* @param $uri The article URI
* @return The article content
*/
private function loadFullArticle($uri)
{
$html = getSimpleHTMLDOMCached($uri);
return $html->find('article', 0);
}
/**
* Replaces all relative URIs with absolute ones
* @param $element A simplehtmldom element
* @return The $element->innertext with all URIs replaced
*/
private function replaceUriInHtmlElement($element)
{
$returned = $element->innertext;
foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
$returned = str_replace($initial . '="/', $final . '="' . self::URI . '/', $returned);
}
return $returned;
}
}
|