diff options
Diffstat (limited to 'bridges/MoinMoinBridge.php')
-rw-r--r-- | bridges/MoinMoinBridge.php | 669 |
1 files changed, 344 insertions, 325 deletions
/**
 * Generates feed items from a single page of a MoinMoin (compatible) wiki.
 *
 * The page body (<div id="page">) is split into sections at every occurrence
 * of the configured separator element; each section becomes one feed item.
 * Depending on the "content" option an item carries the section markup, the
 * body of the page its anchor links to, or no content at all.
 */
class MoinMoinBridge extends BridgeAbstract
{
    const MAINTAINER = 'logmanoriginal';
    const NAME = 'MoinMoin Bridge';
    const URI = 'https://moinmo.in';
    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
    const PARAMETERS = [
        [
            'source' => [
                'name' => 'Source',
                'type' => 'text',
                'required' => true,
                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
                'exampleValue' => 'https://moinmo.in/MoinMoin'
            ],
            'separator' => [
                'name' => 'Separator',
                'type' => 'list',
                'required' => true,
                'title' => 'Defines the separator for splitting content into feeds',
                'defaultValue' => 'h2',
                'values' => [
                    'Header (h1)' => 'h1',
                    'Header (h2)' => 'h2',
                    'Header (h3)' => 'h3',
                    'List element (li)' => 'li',
                    'Anchor (a)' => 'a'
                ]
            ],
            'limit' => [
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Number of items to return (from top)',
                'defaultValue' => -1
            ],
            'content' => [
                'name' => 'Content',
                'type' => 'list',
                'required' => false,
                'title' => 'Defines how feed contents are built',
                'defaultValue' => 'separator',
                'values' => [
                    'By separator' => 'separator',
                    'Follow link (only for anchor)' => 'follow',
                    'None' => 'none'
                ]
            ]
        ]
    ];

    /** Feed title, set by collectData() from the <title> of the source page. */
    private $title = '';

    /**
     * Loads the source page, splits it into sections and appends one item per
     * section to $this->items (honouring the "limit" and "content" options).
     */
    public function collectData()
    {
        /* MoinMoin uses a rather unpleasant representation of HTML. Instead of
         * using tags like <article/>, <navigation/>, <header/>, etc... it uses
         * <div/>, <span/> and <p/>. Also each line is literally identified via
         * IDs. The only way to distinguish content is via headers, though not
         * in all cases.
         *
         * Example (indented for the sake of readability):
         * ...
         * <span class="anchor" id="line-1"></span>
         * <span class="anchor" id="line-2"></span>
         * <span class="anchor" id="line-3"></span>
         * <span class="anchor" id="line-4"></span>
         * <span class="anchor" id="line-5"></span>
         * <span class="anchor" id="line-6"></span>
         * <span class="anchor" id="line-7"></span>
         * <span class="anchor" id="line-8"></span>
         * <span class="anchor" id="line-9"></span>
         * <p class="line867">MoinMoin is a Wiki software implemented in
         * <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
         * and distributed as Free Software under
         * <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
         * ...
         */
        $html = getSimpleHTMLDOM($this->getInput('source'));

        // Some anchors link to local sites or local IDs (both don't work well
        // in feeds)
        $html = $this->fixAnchors($html);

        // A page without <title> must not abort the feed; use the bridge name.
        $titleElement = $html->find('title', 0);
        $this->title = is_null($titleElement)
            ? self::NAME
            : $titleElement->innertext . ' | ' . self::NAME;

        // Author and timestamp of the source page are the defaults for every
        // item. Items whose anchor is followed get their own values below.
        $author = $this->findAuthor($html);
        $timestamp = $this->findTimestamp($html);

        $mode = $this->getInput('content');
        $limit = $this->getInput('limit');

        // Only anchors can be followed; any other separator uses the section.
        $follow = $mode === 'follow' && $this->getInput('separator') === 'a';

        foreach ($this->splitSections($html) as $section) {
            // Skip items with empty title. Checked first so that no page is
            // fetched for an item that is dropped anyway; compared against ''
            // so that a title of "0" is kept.
            $title = strip_tags($section[1]);
            if (trim($title) === '') {
                continue;
            }

            $item = [];
            $item['uri'] = $this->findSectionAnchor($section[0]);

            // Per-item copies, so values of a followed page never leak into
            // the items that come after it.
            $itemAuthor = $author;
            $itemTimestamp = $timestamp;

            if ($mode !== 'none') {
                $followed = $follow ? $this->followAnchor($item['uri']) : null;
                $followedPage = is_null($followed) ? null : $followed->find('div#page', 0);

                if (!is_null($followedPage)) {
                    // Return only actual content
                    $item['content'] = $followedPage->innertext;

                    // Each page could have its own author and timestamp
                    $itemAuthor = $this->findAuthor($followed);
                    $itemTimestamp = $this->findTimestamp($followed);
                } else {
                    // Not followed, not loadable or without <div id="page">:
                    // use contents from the current page
                    $item['content'] = $this->cleanArticle($section[2]);
                }
            }

            if (!is_null($itemAuthor)) {
                $item['author'] = $itemAuthor;
            }
            if (!is_null($itemTimestamp)) {
                $item['timestamp'] = $itemTimestamp;
            }
            $item['title'] = $title;

            $this->items[] = $item;

            if ($limit > 0 && count($this->items) >= $limit) {
                break;
            }
        }
    }

    /**
     * Returns the title collected from the source page, or the parent's name
     * while no title has been collected.
     */
    public function getName()
    {
        return $this->title ?: parent::getName();
    }

    /**
     * Returns the configured source URI, or the parent's URI when none is set.
     */
    public function getURI()
    {
        return $this->getInput('source') ?: parent::getURI();
    }

    /**
     * Splits the html into sections.
     *
     * Returns an array with one element per section. Each element consists of:
     * [0] The entire section
     * [1] The section title
     * [2] The section content
     *
     * When the separator does not occur, the whole page is returned as a
     * single section titled with the page title.
     */
    private function splitSections($html)
    {
        $page = $html->find('div#page', 0);
        $content = is_null($page) ? null : $page->innertext;

        if (!$content) {
            returnServerError('Unable to find <div id="page"/>!');
        }

        // Escaped so the separator can never alter the pattern itself.
        $tag = preg_quote($this->getInput('separator'), '/');

        $regex = implode(
            '',
            [
                "\<{$tag}.+?(?=\>)\>",          // opening tag
                "(.+?)(?=\<\/{$tag}\>)",        // [1] title (inner markup of the tag)
                "\<\/{$tag}\>",                 // closing tag
                // [2] content up to the next separator or the page bottom
                "(.+?)((?=\<{$tag})|(?=\<div\sid=\"pagebottom\")){1}"
            ]
        );

        $sections = [];
        preg_match_all(
            '/' . $regex . '/m',
            $content,
            $sections,
            PREG_SET_ORDER
        );

        // Some pages don't use headers, return page as one feed
        if (count($sections) === 0) {
            $titleElement = $html->find('title', 0);

            return [
                [
                    $content,
                    is_null($titleElement) ? self::NAME : $titleElement->innertext,
                    $content
                ]
            ];
        }

        return $sections;
    }

    /**
     * Returns the anchor for a given section: the source URI plus fragment when
     * the separator element has an id, the href when it is a link, and the
     * plain source URI otherwise.
     */
    private function findSectionAnchor($section)
    {
        $source = $this->getInput('source');
        $separator = $this->getInput('separator');

        $html = str_get_html($section);
        if (!$html) { // Section could not be parsed
            return $source;
        }

        // For IDs
        $anchor = $html->find($separator . '[id=]', 0);
        if (!is_null($anchor)) {
            return $source . '#' . $anchor->id;
        }

        // For actual anchors
        $anchor = $html->find($separator . '[href=]', 0);
        if (!is_null($anchor)) {
            return $anchor->href;
        }

        // Nothing found
        return $source;
    }

    /**
     * Returns the author, or null when the page has none.
     *
     * Notice: Some pages don't provide author information
     */
    private function findAuthor($html)
    {
        /* Example:
         * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
         * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
         * [178.162.199.143]">hosted-by</span>)</p>
         */
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if (is_null($pageinfo)) {
            return null;
        }

        $author = $pageinfo->find('[title=]', 0);
        if (is_null($author)) {
            return null;
        }

        // The title attribute reads "<name> @ <host> [<ip>]"; keep the name.
        return trim(explode('@', $author->title)[0]);
    }

    /**
     * Returns the time of last edit as Unix timestamp, or null when it cannot
     * be determined.
     *
     * Notice: Some pages don't provide this information
     */
    private function findTimestamp($html)
    {
        // See example of findAuthor()
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if (is_null($pageinfo)) {
            return null;
        }

        // Captures the first run of digits, dashes, colons and whitespace that
        // follows the opening parenthesis.
        $matches = [];
        if (preg_match('/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m', $pageinfo->innertext, $matches) !== 1) {
            return null;
        }

        $timestamp = strtotime(trim($matches[1]));

        // strtotime() signals failure with false; callers test for null.
        return $timestamp === false ? null : $timestamp;
    }

    /**
     * Returns the original HTML with all anchors fixed (makes relative anchors
     * absolute)
     *
     * $source is the URI the document was loaded from; it defaults to the
     * bridge URI.
     */
    private function fixAnchors($html, $source = null)
    {
        $source = $source ?: $this->getURI();

        foreach ($html->find('a') as $anchor) {
            if (!isset($anchor->href)) { // skip anchors without target
                continue;
            }

            $href = $anchor->href;
            $first = substr($href, 0, 1);

            // http or https (recognised by the first letter) and every other
            // href carrying a URI scheme such as mailto: need no action.
            if ($first === 'h' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1) {
                continue;
            }

            if ($first === '/') { // some relative path
                $anchor->href = $this->findDomain($source) . $href;
                continue;
            }

            // it's an ID or probably something like ? or &
            $anchor->href = $source . $href;
        }

        return $html;
    }

    /**
     * Loads the full article of a given anchor (if the anchor is from the same
     * wiki domain). Returns null for foreign domains and when the article
     * cannot be loaded.
     */
    private function followAnchor($anchor)
    {
        $domain = $this->findDomain($this->getInput('source'));

        if (is_null($domain) || $this->findDomain($anchor) !== $domain) {
            return null;
        }

        $html = getSimpleHTMLDOMCached($anchor);
        if (!$html) { // Cannot load article
            return null;
        }

        return $this->fixAnchors($html, $anchor);
    }

    /**
     * Finds the domain (scheme and host, e.g. "https://moinmo.in") for a given
     * URI. Returns null when the URI does not contain an http(s) address.
     */
    private function findDomain($uri)
    {
        $matches = [];

        // The host ends at the first '/', '?' or '#' or at the end of the
        // URI, so URIs without any path are matched as well.
        if (preg_match('/(https?:\/\/[^\/?#]+)/', (string)$uri, $matches) !== 1) {
            return null;
        }

        return $matches[1];
    }

    /**
     * Removes every part of $string that starts with $start and ends with
     * $end (delimiters included). A part that is never terminated is removed
     * up to the end of the string.
     *
     * This function is based on CNETBridge
     */
    private function stripWithDelimiters($string, $start, $end)
    {
        if ($start === '' || $end === '') { // nothing sensible to search for
            return $string;
        }

        while (($startPos = strpos($string, $start)) !== false) {
            $endPos = strpos($string, $end, $startPos);

            if ($endPos === false) {
                // Unterminated part: drop everything from the start delimiter
                return substr($string, 0, $startPos);
            }

            $sectionToRemove = substr($string, $startPos, $endPos - $startPos + strlen($end));
            $string = str_replace($sectionToRemove, '', $string);
        }

        return $string;
    }

    /**
     * Returns the article markup with all <script> elements removed.
     *
     * This function is based on CNETBridge
     */
    private function cleanArticle($article_html)
    {
        return $this->stripWithDelimiters($article_html, '<script', '</script>');
    }
}