diff options
Diffstat (limited to 'bridges/SchweinfurtBuergerinformationenBridge.php')
-rw-r--r-- | bridges/SchweinfurtBuergerinformationenBridge.php | 249 |
1 files changed, 130 insertions, 119 deletions
<?php

/**
 * Bridge for the "Bürgerinformationen" (citizen information) news section
 * of the city of Schweinfurt.
 *
 * The index page lists articles as `div.artikel-uebersicht` elements whose
 * `id` attribute carries the numeric article ID. Every article is then
 * fetched from its own page (see ARTICLE_URI) and converted into a feed item.
 */
class SchweinfurtBuergerinformationenBridge extends BridgeAbstract
{
    const MAINTAINER = 'mibe';
    const NAME = 'Schweinfurt Bürgerinformationen';
    const URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/index.html';
    const ARTICLE_URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/%d.html';
    const INDEX_CACHE_TIMEOUT = 10800; // 3h
    const ARTICLE_CACHE_TIMEOUT = 21600; // 6h
    const DESCRIPTION = 'Returns the latest news for citizens of Schweinfurt';
    const PARAMETERS = [
        [
            'pages' => [
                'name' => 'Number of pages',
                'type' => 'number',
                'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
                'exampleValue' => '1',
                'defaultValue' => '1',
            ]
        ]
    ];

    /**
     * Returns the URL of the site's favicon.
     *
     * @return string
     */
    public function getIcon()
    {
        return 'https://www.schweinfurt.de/__/images/favicon.ico';
    }

    /**
     * Collects the article IDs from the requested number of index pages and
     * appends one feed item per article to $this->items.
     */
    public function collectData()
    {
        // Get number of pages to retrieve. One page is the minimum.
        // FILTER_VALIDATE_INT accepts both a real integer and a numeric
        // string, so the parameter is honoured regardless of the form in
        // which getInput() hands it over (NOTE(review): the exact return
        // type of getInput() is not visible here). Anything else, including
        // a missing value, falls back to a single page.
        $pages = filter_var($this->getInput('pages'), FILTER_VALIDATE_INT);
        if ($pages === false || $pages < 1) {
            $pages = 1;
        }

        $articleIDs = [];

        // The pager parameter is zero-based.
        for ($page = 0; $page < $pages; $page++) {
            $articleIDs = array_merge($articleIDs, $this->getArticleIDsFromPage($page));
        }

        // An article listed on more than one fetched page must only yield
        // a single feed item.
        $articleIDs = array_values(array_unique($articleIDs));

        foreach ($articleIDs as $articleID) {
            $this->items[] = $this->generateItemFromArticle($articleID);

            // In debug mode only the first article is fetched.
            if (Debug::isEnabled()) {
                break;
            }
        }
    }

    /**
     * Extracts the article IDs listed on one index page.
     *
     * @param int $page Zero-based page number, passed as the `art_pager` query parameter.
     * @return string[] The numeric article IDs in the order they appear on the page.
     */
    private function getArticleIDsFromPage($page)
    {
        $url = sprintf(self::URI . '?art_pager=%d', $page);
        $html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)
            or returnServerError('Could not retrieve ' . $url);

        $articleIDs = [];

        foreach ($html->find('div.artikel-uebersicht') as $article) {
            // The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
            if (preg_match('/artikel_id_(\d+)/', $article->id, $match)) {
                $articleIDs[] = $match[1];
            } else {
                returnServerError('Couldn\'t determine article ID from index page.');
            }
        }

        return $articleIDs;
    }

    /**
     * Fetches a single article page and converts it into a feed item.
     *
     * @param string|int $id Numeric article ID as found on the index page.
     * @return array Feed item with the keys 'uri', 'title', 'content' and 'uid';
     *               'enclosures' is set when the content contains images and
     *               'timestamp' when a modification date could be parsed.
     */
    private function generateItemFromArticle($id)
    {
        $url = sprintf(self::ARTICLE_URI, $id);
        $html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)
            or returnServerError('Could not retrieve ' . $url);

        // Without these two containers there is nothing to build an item
        // from. Report that explicitly instead of failing with a
        // "member function on null" error further down.
        $div = $html->find('div#artikel-detail', 0);
        $divContent = $div === null ? null : $div->find('.c-content', 0);
        if ($divContent === null) {
            returnServerError('Could not find the article content in ' . $url);
        }

        // Every external link has a little arrow symbol image attached to it.
        // Remove this image. This has to be done before building $content.
        // All other images are attached to the item as enclosures.
        $enclosures = [];
        foreach ($divContent->find('img') as $image) {
            if ($image->class === 'imgextlink') {
                $image->outertext = '';
            } else {
                $enclosures[] = $image->src;
            }
        }

        // Title and teaser are treated as optional: a missing element
        // results in an empty string.
        $titleElement = $div->find('.c-title', 0);
        $teaserElement = $div->find('.c-teaser', 0);

        $title = $titleElement === null ? '' : (string) $titleElement->innertext;
        $teaser = $teaserElement === null ? '' : (string) $teaserElement->innertext;
        $content = $divContent->innertext;

        // The title can contain HTML entities. These can be converted back
        // to regular UTF-8 characters.
        $title = html_entity_decode($title, ENT_HTML5, 'UTF-8');

        // If there's a teaser, make it more eye-catching,
        // so that it is clear, that this is not part of the actual content.
        if (trim($teaser) !== '') {
            $content = '<i><strong>' . $teaser . '</strong></i>' . $content;
        }

        $item = [
            'uri' => $url,
            'title' => $title,
            'content' => $content,
            'uid' => $id,
        ];

        if ($enclosures !== []) {
            $item['enclosures'] = $enclosures;
        }

        // Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
        // The date is expected at the very end of the text; surrounding
        // whitespace is tolerated.
        $editElement = $div->find('div#edit', 0);
        if (
            $editElement !== null
            && preg_match('/(\d{1,2}\.\d{1,2}\.\d{4})\s*$/', (string) $editElement->plaintext, $match)
        ) {
            // The leading '!' resets all fields which are not part of the
            // format. Without it the time of day would be taken from the
            // current clock and the timestamp would change on every fetch.
            $editDate = DateTime::createFromFormat('!d.m.Y', $match[1]);

            if ($editDate !== false) {
                $item['timestamp'] = $editDate->getTimestamp();
            }
        }

        return $item;
    }
}