1 files changed, 344 insertions, 325 deletions
diff --git a/bridges/MoinMoinBridge.php b/bridges/MoinMoinBridge.php
index 1920c5a1..c8053587 100644
--- a/bridges/MoinMoinBridge.php
+++ b/bridges/MoinMoinBridge.php
@@ -1,327 +1,346 @@
 <?php
-class MoinMoinBridge extends BridgeAbstract {
-
-	const MAINTAINER = 'logmanoriginal';
-	const NAME = 'MoinMoin Bridge';
-	const URI = 'https://moinmo.in';
-	const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
-	const PARAMETERS = array(
-		array(
-			'source' => array(
-				'name' => 'Source',
-				'type' => 'text',
-				'required' => true,
-				'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
-				'exampleValue' => 'https://moinmo.in/MoinMoin'
-			),
-			'separator' => array(
-				'name' => 'Separator',
-				'type' => 'list',
-				'requied' => true,
-				'title' => 'Defines the separtor for splitting content into feeds',
-				'defaultValue' => 'h2',
-				'values' => array(
-					'Header (h1)' => 'h1',
-					'Header (h2)' => 'h2',
-					'Header (h3)' => 'h3',
-					'List element (li)' => 'li',
-					'Anchor (a)' => 'a'
-				)
-			),
-			'limit' => array(
-				'name' => 'Limit',
-				'type' => 'number',
-				'required' => false,
-				'title' => 'Number of items to return (from top)',
-				'defaultValue' => -1
-			),
-			'content' => array(
-				'name' => 'Content',
-				'type' => 'list',
-				'required' => false,
-				'title' => 'Defines how feed contents are build',
-				'defaultValue' => 'separator',
-				'values' => array(
-					'By separator' => 'separator',
-					'Follow link (only for anchor)' => 'follow',
-					'None' => 'none'
-				)
-			)
-		)
-	);
-
-	private $title = '';
-
-	public function collectData(){
-		/* MoinMoin uses a rather unpleasent representation of HTML. Instead of
-		 * using tags like <article/>, <navigation/>, <header/>, etc... it uses
-		 * <div/>, <span/> and <p/>. Also each line is literaly identified via
-		 * IDs. The only way to distinguish content is via headers, though not
-		 * in all cases.
-		 *
-		 * Example (indented for the sake of readability):
-		 * ...
-		 * <span class="anchor" id="line-1"></span>
-		 * <span class="anchor" id="line-2"></span>
-		 * <span class="anchor" id="line-3"></span>
-		 * <span class="anchor" id="line-4"></span>
-		 * <span class="anchor" id="line-5"></span>
-		 * <span class="anchor" id="line-6"></span>
-		 * <span class="anchor" id="line-7"></span>
-		 * <span class="anchor" id="line-8"></span>
-		 * <span class="anchor" id="line-9"></span>
-		 *   <p class="line867">MoinMoin is a Wiki software implemented in
-		 *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
-		 *   and distributed as Free Software under
-		 *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
-		 * ...
-		 */
-		$html = getSimpleHTMLDOM($this->getInput('source'));
-
-		// Some anchors link to local sites or local IDs (both don't work well
-		// in feeds)
-		$html = $this->fixAnchors($html);
-
-		$this->title = $html->find('title', 0)->innertext . ' | ' . self::NAME;
-
-		// Here we focus on simple author and timestamp information from the given
-		// page. Later we update this information in case the anchor is followed.
-		$author = $this->findAuthor($html);
-		$timestamp = $this->findTimestamp($html);
-
-		$sections = $this->splitSections($html);
-
-		foreach($sections as $section) {
-			$item = array();
-
-			$item['uri'] = $this->findSectionAnchor($section[0]);
-
-			switch($this->getInput('content')) {
-				case 'none': // Do not return any content
-					break;
-				case 'follow': // Follow the anchor
-					// We can only follow anchors (use default otherwise)
-					if($this->getInput('separator') === 'a') {
-						$content = $this->followAnchor($item['uri']);
-
-						// Return only actual content
-						$item['content'] = $content->find('div#page', 0)->innertext;
-
-						// Each page could have its own author and timestamp
-						$author = $this->findAuthor($content);
-						$timestamp = $this->findTimestamp($content);
-
-						break;
-					}
-					// fall-through
-				case 'separator':
-				default: // Use contents from the current page
-					$item['content'] = $this->cleanArticle($section[2]);
-			}
-
-			if(!is_null($author)) $item['author'] = $author;
-			if(!is_null($timestamp)) $item['timestamp'] = $timestamp;
-			$item['title'] = strip_tags($section[1]);
-
-			// Skip items with empty title
-			if(empty(trim($item['title']))) {
-				continue;
-			}
-
-			$this->items[] = $item;
-
-			if($this->getInput('limit') > 0
-			&& count($this->items) >= $this->getInput('limit')) {
-				break;
-			}
-		}
-	}
-
-	public function getName(){
-		return $this->title ?: parent::getName();
-	}
-
-	public function getURI(){
-		return $this->getInput('source') ?: parent::getURI();
-	}
-
-	/**
-	 * Splits the html into sections.
-	 *
-	 * Returns an array with one element per section. Each element consists of:
-	 * [0] The entire section
-	 * [1] The section title
-	 * [2] The section content
-	 */
-	private function splitSections($html){
-		$content = $html->find('div#page', 0)->innertext
-			or returnServerError('Unable to find <div id="page"/>!');
-
-		$sections = array();
-
-		$regex = implode(
-			'',
-			array(
-				"\<{$this->getInput('separator')}.+?(?=\>)\>",
-				"(.+?)(?=\<\/{$this->getInput('separator')}\>)",
-				"\<\/{$this->getInput('separator')}\>",
-				"(.+?)((?=\<{$this->getInput('separator')})|(?=\<div\sid=\"pagebottom\")){1}"
-			)
-		);
-
-		preg_match_all(
-			'/' . $regex . '/m',
-			$content,
-			$sections,
-			PREG_SET_ORDER
-		);
-
-		// Some pages don't use headers, return page as one feed
-		if(count($sections) === 0) {
-			return array(
-				array(
-					$content,
-					$html->find('title', 0)->innertext,
-					$content
-				)
-			);
-		}
-
-		return $sections;
-	}
-
-	/**
-	 * Returns the anchor for a given section
-	 */
-	private function findSectionAnchor($section){
-		$html = str_get_html($section);
-
-		// For IDs
-		$anchor = $html->find($this->getInput('separator') . '[id=]', 0);
-		if(!is_null($anchor)) {
-			return $this->getInput('source') . '#' . $anchor->id;
-		}
-
-		// For actual anchors
-		$anchor = $html->find($this->getInput('separator') . '[href=]', 0);
-		if(!is_null($anchor)) {
-			return $anchor->href;
-		}
-
-		// Nothing found
-		return $this->getInput('source');
-	}
-
-	/**
-	 * Returns the author
-	 *
-	 * Notice: Some pages don't provide author information
-	 */
-	private function findAuthor($html){
-		/* Example:
-		 * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
-		 * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
-		 * [178.162.199.143]">hosted-by</span>)</p>
-		*/
-		$pageinfo = $html->find('[id="pageinfo"]', 0);
-
-		if(is_null($pageinfo)) {
-			return null;
-		} else {
-			$author = $pageinfo->find('[title=]', 0);
-			if(is_null($author)) {
-				return null;
-			} else {
-				return trim(explode('@', $author->title)[0]);
-			}
-		}
-	}
-
-	/**
-	 * Returns the time of last edit
-	 *
-	 * Notice: Some pages don't provide this information
-	 */
-	private function findTimestamp($html){
-		// See example of findAuthor()
-		$pageinfo = $html->find('[id="pageinfo"]', 0);
-
-		if(is_null($pageinfo)) {
-			return null;
-		} else {
-			$timestamp = $pageinfo->innertext;
-			$matches = array();
-			preg_match('/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m', $pageinfo, $matches);
-			return strtotime($matches[1]);
-		}
-	}
-
-	/**
-	 * Returns the original HTML with all anchors fixed (makes relative anchors
-	 * absolute)
-	 */
-	private function fixAnchors($html, $source = null){
-
-		$source = $source ?: $this->getURI();
-
-		foreach($html->find('a') as $anchor) {
-			switch(substr($anchor->href, 0, 1)) {
-				case 'h': // http or https, no actions required
-					break;
-				case '/': // some relative path
-					$anchor->href = $this->findDomain($source) . $anchor->href;
-					break;
-				case '#': // it's an ID
-				default: // probably something like ? or &, skip empty ones
-					if(!isset($anchor->href))
-						break;
-					$anchor->href = $source . $anchor->href;
-			}
-		}
-
-		return $html;
-	}
-
-	/**
-	 * Loads the full article of a given anchor (if the anchor is from the same
-	 * wiki domain)
-	 */
-	private function followAnchor($anchor){
-		if(strrpos($anchor, $this->findDomain($this->getInput('source')) === false)) {
-			return null;
-		}
-
-		$html = getSimpleHTMLDOMCached($anchor);
-		if(!$html) { // Cannot load article
-			return null;
-		}
-
-		return $this->fixAnchors($html, $anchor);
-	}
-
-	/**
-	 * Finds the domain for a given URI
-	 */
-	private function findDomain($uri){
-		$matches = array();
-		preg_match('/(http[s]{0,1}:\/\/.+?(?=\/))/', $uri, $matches);
-		return $matches[1];
-	}
-
-	/* This function is a copy from CNETBridge */
-	private function stripWithDelimiters($string, $start, $end){
-		while(strpos($string, $start) !== false) {
-			$section_to_remove = substr($string, strpos($string, $start));
-			$section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
-			$string = str_replace($section_to_remove, '', $string);
-		}
-
-		return $string;
-	}
-
-	/* This function is based on CNETBridge */
-	private function cleanArticle($article_html){
-		$article_html = $this->stripWithDelimiters($article_html, '<script', '</script>');
-		return $article_html;
-	}
+
+class MoinMoinBridge extends BridgeAbstract
+{
+    const MAINTAINER = 'logmanoriginal';
+    const NAME = 'MoinMoin Bridge';
+    const URI = 'https://moinmo.in';
+    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
+
+    /*
+     * Inputs of the bridge (a single parameter set):
+     * - source:    URI of the wiki page that is turned into a feed
+     * - separator: HTML tag at which the page is split into feed items
+     * - limit:     maximum number of items; collectData() only applies values > 0
+     * - content:   where the item content is taken from (see collectData())
+     */
+    const PARAMETERS = [
+        [
+            'source' => [
+                'name' => 'Source',
+                'type' => 'text',
+                'required' => true,
+                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
+                'exampleValue' => 'https://moinmo.in/MoinMoin'
+            ],
+            'separator' => [
+                'name' => 'Separator',
+                'type' => 'list',
+                'required' => true,
+                'title' => 'Defines the separator for splitting content into feeds',
+                'defaultValue' => 'h2',
+                'values' => [
+                    'Header (h1)' => 'h1',
+                    'Header (h2)' => 'h2',
+                    'Header (h3)' => 'h3',
+                    'List element (li)' => 'li',
+                    'Anchor (a)' => 'a'
+                ]
+            ],
+            'limit' => [
+                'name' => 'Limit',
+                'type' => 'number',
+                'required' => false,
+                'title' => 'Number of items to return (from top)',
+                'defaultValue' => -1
+            ],
+            'content' => [
+                'name' => 'Content',
+                'type' => 'list',
+                'required' => false,
+                'title' => 'Defines how feed contents are built',
+                'defaultValue' => 'separator',
+                'values' => [
+                    'By separator' => 'separator',
+                    'Follow link (only for anchor)' => 'follow',
+                    'None' => 'none'
+                ]
+            ]
+        ]
+    ];
+
+    /** @var string Feed name assembled by collectData(); empty until then */
+    private $title = '';
+
+    /**
+     * Builds the feed items from the wiki page given by the 'source' input.
+     *
+     * The page is split at every 'separator' tag; each section with a non-empty
+     * title becomes one item. 'content' selects what an item carries: nothing
+     * ('none'), the section markup ('separator') or the linked page ('follow',
+     * only when the separator is 'a'). A positive 'limit' caps the item count.
+     */
+    public function collectData()
+    {
+        /* MoinMoin uses a rather unpleasant representation of HTML. Instead of
+         * using tags like <article/>, <navigation/>, <header/>, etc... it uses
+         * <div/>, <span/> and <p/>. Also each line is literally identified via
+         * IDs. The only way to distinguish content is via headers, though not
+         * in all cases.
+         *
+         * Example (indented for the sake of readability):
+         * ...
+         * <span class="anchor" id="line-8"></span>
+         * <span class="anchor" id="line-9"></span>
+         *   <p class="line867">MoinMoin is a Wiki software implemented in
+         *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
+         *   and distributed as Free Software under
+         *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
+         * ...
+         */
+        $html = getSimpleHTMLDOM($this->getInput('source'));
+
+        // Some anchors link to local sites or local IDs (both don't work well
+        // in feeds)
+        $html = $this->fixAnchors($html);
+
+        // Without a <title/> the name stays empty and getName() uses its fallback
+        $pageTitle = $html->find('title', 0);
+        if (!is_null($pageTitle)) {
+            $this->title = $pageTitle->innertext . ' | ' . self::NAME;
+        }
+
+        // Author and timestamp of the source page. They are the per-item
+        // defaults; an item built from a followed page uses that page's values.
+        $pageAuthor = $this->findAuthor($html);
+        $pageTimestamp = $this->findTimestamp($html);
+
+        $limit = (int)$this->getInput('limit');
+        $contentMode = $this->getInput('content');
+        // We can only follow anchors (section content is used otherwise)
+        $follow = $contentMode === 'follow' && $this->getInput('separator') === 'a';
+
+        foreach ($this->splitSections($html) as $section) {
+            $title = strip_tags($section[1]);
+
+            // Skip items with empty title before any linked page is requested
+            if (trim($title) === '') {
+                continue;
+            }
+
+            $item = [];
+            $item['uri'] = $this->findSectionAnchor($section[0]);
+            $author = $pageAuthor;
+            $timestamp = $pageTimestamp;
+
+            // followAnchor() may return null and a loaded page may lack
+            // <div id="page"/>; both cases fall back to the section content
+            $linked = $follow ? $this->followAnchor($item['uri']) : null;
+            $linkedPage = is_null($linked) ? null : $linked->find('div#page', 0);
+
+            if (!is_null($linkedPage)) {
+                // Return only actual content
+                $item['content'] = $linkedPage->innertext;
+                $author = $this->findAuthor($linked);
+                $timestamp = $this->findTimestamp($linked);
+            } elseif ($contentMode !== 'none') {
+                // Use contents from the current page
+                $item['content'] = $this->cleanArticle($section[2]);
+            }
+
+            if (!is_null($author)) {
+                $item['author'] = $author;
+            }
+            if (!is_null($timestamp)) {
+                $item['timestamp'] = $timestamp;
+            }
+            $item['title'] = $title;
+
+            $this->items[] = $item;
+
+            if ($limit > 0 && count($this->items) >= $limit) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Name of the feed: the title assembled by collectData() when it is
+     * non-empty, the parent's name otherwise.
+     */
+    public function getName()
+    {
+        if (empty($this->title)) {
+            return parent::getName();
+        }
+
+        return $this->title;
+    }
+
+    /**
+     * URI of the feed: the 'source' input when it holds a truthy value, the
+     * parent's URI otherwise.
+     */
+    public function getURI()
+    {
+        $source = $this->getInput('source');
+        if ($source) {
+            return $source;
+        }
+
+        return parent::getURI();
+    }
+
+    /**
+     * Splits the html into sections.
+     *
+     * A section starts at an opening 'separator' tag and runs up to (but not
+     * including) the next one or <div id="pagebottom". When the page contains
+     * no such section, the whole page is returned as a single section titled
+     * with the document's <title/> (empty string if there is none).
+     *
+     * Returns an array with one element per section. Each element consists of:
+     * [0] The entire section
+     * [1] The section title
+     * [2] The section content
+     *
+     * Reports a server error via returnServerError() when <div id="page"/> is
+     * missing or empty (presumably aborting the request; the helper is defined
+     * outside this file).
+     */
+    private function splitSections($html)
+    {
+        // find() returns null for a missing element, so check before reading
+        $page = $html->find('div#page', 0);
+        $content = is_null($page) ? '' : $page->innertext;
+        if (!$content) {
+            returnServerError('Unable to find <div id="page"/>!');
+        }
+
+        // The separator is one of the fixed list values; quoting it keeps the
+        // pattern well-formed regardless
+        $tag = preg_quote((string)$this->getInput('separator'), '/');
+
+        $regex = implode(
+            '',
+            [
+                "\<{$tag}.+?(?=\>)\>",                   // opening tag incl. attributes
+                "(.+?)(?=\<\/{$tag}\>)",                 // [1] title
+                "\<\/{$tag}\>",                          // closing tag
+                "(.+?)((?=\<{$tag})|(?=\<div\sid=\"pagebottom\")){1}" // [2] content
+            ]
+        );
+
+        $sections = [];
+        preg_match_all(
+            '/' . $regex . '/m',
+            $content,
+            $sections,
+            PREG_SET_ORDER
+        );
+
+        // Some pages don't use headers, return page as one feed
+        if (count($sections) === 0) {
+            $title = $html->find('title', 0);
+
+            return [
+                [
+                    $content,
+                    is_null($title) ? '' : $title->innertext,
+                    $content
+                ]
+            ];
+        }
+
+        return $sections;
+    }
+
+    /**
+     * Returns the anchor (link target) for a given section.
+     *
+     * Resolution order:
+     * 1. the first separator element carrying an id: "<source>#<id>"
+     * 2. the first separator element carrying an href: that href
+     * 3. otherwise the 'source' input itself
+     *
+     * @param string $section Markup of one section (element [0] of splitSections())
+     * @return string URI for the feed item
+     */
+    private function findSectionAnchor($section)
+    {
+        $source = $this->getInput('source');
+        $separator = $this->getInput('separator');
+
+        // str_get_html() can return false instead of a document (e.g. for
+        // empty input); fall back to the page itself in that case
+        $html = str_get_html($section);
+        if (!$html) {
+            return $source;
+        }
+
+        // For IDs
+        $anchor = $html->find($separator . '[id=]', 0);
+        if (!is_null($anchor)) {
+            return $source . '#' . $anchor->id;
+        }
+
+        // For actual anchors
+        $anchor = $html->find($separator . '[href=]', 0);
+        if (!is_null($anchor)) {
+            return $anchor->href;
+        }
+
+        // Nothing found
+        return $source;
+    }
+
+    /**
+     * Extracts the author of the last edit from the page footer.
+     *
+     * The author is read from the title attribute of the first element inside
+     * the "pageinfo" element that has one; only the part before the first '@'
+     * is kept. Example footer:
+     *
+     * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
+     * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
+     * [178.162.199.143]">hosted-by</span>)</p>
+     *
+     * Returns null when the page has no "pageinfo" element or no element with
+     * a title inside it.
+     */
+    private function findAuthor($html)
+    {
+        $footer = $html->find('[id="pageinfo"]', 0);
+        if (is_null($footer)) {
+            return null;
+        }
+
+        $holder = $footer->find('[title=]', 0);
+        if (is_null($holder)) {
+            return null;
+        }
+
+        $parts = explode('@', $holder->title);
+
+        return trim($parts[0]);
+    }
+
+    /**
+     * Returns the time of last edit as a Unix timestamp.
+     *
+     * The date is taken from the text of the "pageinfo" element, e.g.
+     * "MoinMoin: LocalSpellingWords (last edited 2017-02-16 15:36:31 by ...)".
+     *
+     * Notice: Some pages don't provide this information
+     *
+     * @return int|null Timestamp, or null when the element is missing, contains
+     *                  no date or the date cannot be parsed by strtotime()
+     */
+    private function findTimestamp($html)
+    {
+        $pageinfo = $html->find('[id="pageinfo"]', 0);
+        if (is_null($pageinfo)) {
+            return null;
+        }
+
+        // Skip to the first '(', then to the first digit after it, and capture
+        // the following run of digits, dashes, colons and whitespace
+        $matches = [];
+        $found = preg_match(
+            '/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m',
+            (string)$pageinfo->innertext,
+            $matches
+        );
+        if ($found !== 1) {
+            return null;
+        }
+
+        // strtotime() signals an unparsable date with false; callers test
+        // for null, so normalize the failure value
+        $timestamp = strtotime(trim($matches[1]));
+
+        return $timestamp === false ? null : $timestamp;
+    }
+
+    /**
+     * Returns the original HTML with all anchors fixed (makes relative anchors
+     * absolute).
+     *
+     * - hrefs that carry a URI scheme (http:, https:, mailto:, ...) and any
+     *   other href starting with 'h' are left untouched
+     * - hrefs starting with '/' are prefixed with the result of
+     *   findDomain($source)
+     * - everything else ('#id', '?query', ...) is appended to $source
+     *
+     * @param mixed $html Parsed document (presumably a simple_html_dom object);
+     *                    its anchors are modified in place
+     * @param string|null $source URI the document was loaded from; getURI() is
+     *                            used when empty
+     * @return mixed The document passed in
+     */
+    private function fixAnchors($html, $source = null)
+    {
+        $source = $source ?: $this->getURI();
+
+        foreach ($html->find('a') as $anchor) {
+            $href = $anchor->href;
+
+            // mailto:, ftp:, irc: etc. are complete URIs; prepending $source
+            // would turn them into dead links
+            if (is_string($href) && preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1) {
+                continue;
+            }
+
+            switch (substr((string)$href, 0, 1)) {
+                case 'h': // scheme-less href starting with 'h', no actions taken
+                    break;
+                case '/': // some relative path
+                    $anchor->href = $this->findDomain($source) . $href;
+                    break;
+                case '#': // it's an ID
+                default: // probably something like ? or &, skip empty ones
+                    if (!isset($anchor->href)) {
+                        break;
+                    }
+                    $anchor->href = $source . $href;
+            }
+        }
+
+        return $html;
+    }
+
+    /**
+     * Loads the full article of a given anchor (if the anchor is from the same
+     * wiki domain)
+     *
+     * "Same domain" means that findDomain() yields the same scheme and host
+     * for the anchor and for the 'source' input (compared case-insensitively).
+     *
+     * @param string $anchor Absolute URI of the page to load
+     * @return mixed|null The loaded document with its anchors made absolute
+     *                    relative to $anchor, or null when the anchor points
+     *                    elsewhere or the page cannot be loaded
+     */
+    private function followAnchor($anchor)
+    {
+        $wikiDomain = $this->findDomain($this->getInput('source'));
+        $anchorDomain = $this->findDomain($anchor);
+
+        // Only follow anchors on the wiki's own scheme and host
+        if (!$wikiDomain || !$anchorDomain || strcasecmp($wikiDomain, $anchorDomain) !== 0) {
+            return null;
+        }
+
+        $html = getSimpleHTMLDOMCached($anchor);
+        if (!$html) { // Cannot load article
+            return null;
+        }
+
+        return $this->fixAnchors($html, $anchor);
+    }
+
+    /**
+     * Finds the domain for a given URI
+     *
+     * "Domain" here is scheme plus host (and port, if any), i.e. everything up
+     * to the first '/', '?' or '#' after "http://" or "https://".
+     *
+     * @param string $uri URI to inspect
+     * @return string|null e.g. "https://moinmo.in" for
+     *                     "https://moinmo.in/MoinMoin"; null when the URI
+     *                     contains no http(s) authority
+     */
+    private function findDomain($uri)
+    {
+        $matches = [];
+
+        // Stop at '?' and '#' as well so that a URI without a path still yields
+        // its host instead of no match (or part of the query string)
+        if (preg_match('/(https?:\/\/[^\/?#]+)/i', (string)$uri, $matches) !== 1) {
+            return null;
+        }
+
+        return $matches[1];
+    }
+
+    /**
+     * Removes every span that begins with $start and ends with the next $end
+     * (both delimiters included) from $string.
+     *
+     * A span whose end delimiter is missing extends to the end of the string.
+     * Empty delimiters leave the string unchanged.
+     *
+     * This function is based on a copy from CNETBridge.
+     *
+     * @param string $string Text to clean
+     * @param string $start Opening delimiter, e.g. '<script'
+     * @param string $end Closing delimiter, e.g. '</script>'
+     * @return string The text without the delimited spans
+     */
+    private function stripWithDelimiters($string, $start, $end)
+    {
+        $string = (string)$string;
+
+        // An empty delimiter can never enclose anything (and would never let
+        // the loop below make progress)
+        if ((string)$start === '' || (string)$end === '') {
+            return $string;
+        }
+
+        while (($from = strpos($string, $start)) !== false) {
+            $to = strpos($string, $end, $from);
+
+            if ($to === false) {
+                // Unterminated span: drop everything from the start delimiter on
+                return (string)substr($string, 0, $from);
+            }
+
+            $string = substr_replace($string, '', $from, $to - $from + strlen($end));
+        }
+
+        return $string;
+    }
+
+    /**
+     * Cleans the markup of an article by passing it through
+     * stripWithDelimiters() with the delimiters '<script' and '</script>'.
+     *
+     * This function is based on CNETBridge.
+     */
+    private function cleanArticle($article_html)
+    {
+        return $this->stripWithDelimiters($article_html, '<script', '</script>');
+    }
 }