diff options
Diffstat (limited to 'bridges/LWNprevBridge.php')
-rw-r--r-- | bridges/LWNprevBridge.php | 534 |
1 files changed, 273 insertions, 261 deletions
<?php

/**
 * Bridge that turns the LWN "free weekly edition" big page into feed items.
 *
 * The page is fetched from URI . 'free/bigpage', split on the
 * '<b>Page editor</b>' marker, and every chunk is parsed with DOMDocument.
 * Depending on which category classes (Cat1HL / Cat3HL) occur in a chunk,
 * it is read as feature articles, brief items or announcements.
 */
class LWNprevBridge extends BridgeAbstract
{
    const MAINTAINER = 'Pierre Mazière';
    const NAME = 'LWN Free Weekly Edition';
    const URI = 'https://lwn.net/';
    const CACHE_TIMEOUT = 604800; // 1 week
    const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

    /**
     * Timestamp parsed from the first <h1> of a chunk (text after 'for ').
     * Stays unset until a heading could be parsed successfully.
     *
     * @var int|null
     */
    private $editionTimeStamp;

    /**
     * @return string URL of the page that is scraped.
     */
    public function getURI()
    {
        return self::URI . 'free/bigpage';
    }

    /**
     * Advances $node over text nodes to the next sibling that is not a text node.
     *
     * If the siblings run out, $node is left on the last text node, so callers
     * must not assume that the result is an element.
     *
     * @param DOMNode|null $node Node to advance, modified in place.
     */
    private function jumpToNextTag(&$node)
    {
        while ($node && $node->nodeType === XML_TEXT_NODE) {
            $nextNode = $node->nextSibling;
            if (!$nextNode) {
                break;
            }
            $node = $nextNode;
        }
    }

    /**
     * Moves $node backwards over text nodes to the previous non-text sibling.
     *
     * If the siblings run out, $node is left on the first text node, so callers
     * must not assume that the result is an element.
     *
     * @param DOMNode|null $node Node to move, modified in place.
     */
    private function jumpToPreviousTag(&$node)
    {
        while ($node && $node->nodeType === XML_TEXT_NODE) {
            $previousNode = $node->previousSibling;
            if (!$previousNode) {
                break;
            }
            $node = $previousNode;
        }
    }

    /**
     * Fetches the big page and appends the items of every chunk to $this->items.
     */
    public function collectData()
    {
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $page = getContents($this->getURI());

        // Collect libxml parse errors internally instead of emitting warnings.
        libxml_use_internal_errors(true);

        foreach (explode('<b>Page editor</b>', $page) as $content) {
            if (strpos($content, '<html>') === false) {
                // Chunks after the first one lack a document skeleton.
                $content = <<<EOD
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><title>LWN</title></head><body>{$content}</body></html>
EOD;
            } else {
                // The first chunk was cut off before its closing tags.
                $content .= '</body></html>';
            }

            $html = new DOMDocument();
            $html->loadHTML($content);
            libxml_clear_errors();

            $this->updateEditionTimeStamp($html);

            if (strpos($content, 'Cat1HL') === false) {
                $items = $this->getFeatureContents($html);
            } elseif (strpos($content, 'Cat3HL') === false) {
                $items = $this->getBriefItems($html);
            } else {
                $items = $this->getAnnouncements($html);
            }

            $this->items = array_merge($this->items, $items);
        }
    }

    /**
     * Reads the edition date from the first <h1> of the document, if any.
     *
     * The date is the text following 'for '. When the marker is missing or
     * the remainder cannot be parsed by strtotime(), the previously stored
     * timestamp is kept instead of being overwritten with a bogus value.
     *
     * @param DOMDocument $html Parsed chunk.
     */
    private function updateEditionTimeStamp($html)
    {
        $headings = $html->getElementsByTagName('h1');
        if ($headings->length === 0) {
            return;
        }

        $text = $headings->item(0)->textContent;
        $marker = 'for ';
        $position = strpos($text, $marker);
        if ($position === false) {
            return;
        }

        $timestamp = strtotime(substr($text, $position + strlen($marker)));
        if ($timestamp !== false) {
            $this->editionTimeStamp = $timestamp;
        }
    }

    /**
     * Returns the class attribute of $node, or '' when $node is not an element.
     *
     * @param DOMNode|null $node
     * @return string
     */
    private function classOf($node)
    {
        return $node instanceof DOMElement ? $node->getAttribute('class') : '';
    }

    /**
     * Builds an item URI from a link target found on the page.
     *
     * Absolute and protocol-relative targets are kept; everything else is
     * appended to URI without producing a doubled slash.
     *
     * @param string $href Raw href attribute value.
     * @return string
     */
    private function resolveUri($href)
    {
        $href = trim($href);
        if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $href) === 1) {
            return $href;
        }
        if (strpos($href, '//') === 0) {
            return 'https:' . $href;
        }
        return self::URI . ltrim($href, '/');
    }

    /**
     * Tells whether $node ends the content of the current item.
     *
     * @param DOMNode  $node          Sibling being inspected.
     * @param string[] $stopClasses   Exact class attribute values that end the content.
     * @param bool     $stopAtHeading Whether an <h2> element ends the content as well.
     * @return bool
     */
    private function isContentBoundary($node, array $stopClasses, $stopAtHeading)
    {
        if (!($node instanceof DOMElement)) {
            return false;
        }
        if ($stopAtHeading && $node->nodeName === 'h2') {
            return true;
        }
        return in_array($node->getAttribute('class'), $stopClasses, true);
    }

    /**
     * Concatenates the canonical form (C14N) of the siblings following $start
     * up to, but excluding, the first boundary node.
     *
     * @param DOMNode  $start         Node whose following siblings are collected.
     * @param string[] $stopClasses   See isContentBoundary().
     * @param bool     $stopAtHeading See isContentBoundary().
     * @return string
     */
    private function collectContent($start, array $stopClasses, $stopAtHeading)
    {
        $content = '';
        for ($node = $start->nextSibling; $node !== null; $node = $node->nextSibling) {
            if ($this->isContentBoundary($node, $stopClasses, $stopAtHeading)) {
                break;
            }
            $fragment = $node->C14N();
            if ($fragment !== false) {
                $content .= $fragment;
            }
        }
        return $content;
    }

    /**
     * Extracts uri, timestamp and content for the article introduced by $title.
     *
     * The URI comes from the first non-text child of the title when that is an
     * <a> element; otherwise the site root is used. The content runs until the
     * next <h2> or the next Cat1HL / Cat2HL element.
     *
     * @param DOMElement $title Heading element of the article.
     * @return array Keys: uri, timestamp, content.
     */
    private function getArticleContent($title)
    {
        $link = $title->firstChild;
        $this->jumpToNextTag($link);

        $uri = self::URI;
        if ($link instanceof DOMElement && $link->nodeName === 'a') {
            $uri = $this->resolveUri($link->getAttribute('href'));
        }

        return [
            'uri' => $uri,
            'timestamp' => $this->editionTimeStamp,
            'content' => $this->collectContent($title, ['Cat1HL', 'Cat2HL'], true),
        ];
    }

    /**
     * Builds items from <h3 class="SummaryHL"> headings that are directly
     * followed by a FeatureByline element; other headings are skipped.
     *
     * @param DOMDocument $html Parsed chunk.
     * @return array[] List of items.
     */
    private function getFeatureContents($html)
    {
        $items = [];
        foreach ($html->getElementsByTagName('h3') as $title) {
            if ($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $byline = $title->nextSibling;
            $this->jumpToNextTag($byline);
            if ($this->classOf($byline) !== 'FeatureByline') {
                continue;
            }

            // The author name is the first <b> of the byline; null if absent.
            $authorNode = $byline->getElementsByTagName('b')->item(0);
            $item = [
                'author' => $authorNode !== null ? $authorNode->textContent : null,
                'title' => $title->textContent,
            ];

            $items[] = array_merge($item, $this->getArticleContent($title));
        }
        return $items;
    }

    /**
     * Updates the remembered category names from the headings found at and
     * before $cat, and returns the title prefix for the current item.
     *
     * Starting at $cat, the method walks upwards: a Cat3HL element may be
     * preceded by a Cat2HL element, which may be preceded by a Cat1HL element.
     * Each level found overwrites its slot in $cats; when a higher level is
     * found without the lower ones, the lower slots are cleared. Slots that
     * are not touched keep the value of the previous call, which is how items
     * inherit the category of the heading they follow.
     *
     * @param DOMNode|null $cat  Candidate category node; moved while walking up.
     * @param string[]     $cats Remembered [level 1, level 2, level 3] names, modified in place.
     * @return string '[level1] ', '[level1/level2] ' or '' when no level 1 name is known.
     */
    private function getItemPrefix(&$cat, &$cats)
    {
        $level3 = '';
        $level2 = '';
        $class = $this->classOf($cat);

        if ($class === 'Cat3HL') {
            $level3 = $cat->textContent;
            $cats[2] = $level3;
            $cat = $cat->previousSibling;
            $this->jumpToPreviousTag($cat);
            // Continue upwards only when the parent level directly precedes.
            $class = $this->classOf($cat) === 'Cat2HL' ? 'Cat2HL' : '';
        }

        if ($class === 'Cat2HL') {
            $level2 = $cat->textContent;
            $cats[1] = $level2;
            if ($level3 === '') {
                $cats[2] = '';
            }
            $cat = $cat->previousSibling;
            $this->jumpToPreviousTag($cat);
            $class = $this->classOf($cat) === 'Cat1HL' ? 'Cat1HL' : '';
        }

        if ($class === 'Cat1HL') {
            $cats[0] = $cat->textContent;
            if ($level3 === '') {
                $cats[2] = '';
            }
            if ($level2 === '') {
                $cats[1] = '';
            }
        }

        if (empty($cats[0])) {
            return '';
        }
        return '[' . $cats[0] . ($cats[1] !== '' ? '/' . $cats[1] : '') . '] ';
    }

    /**
     * Builds items from every <h2 class="SummaryHL"> heading of the document.
     *
     * @param DOMDocument $html Parsed chunk.
     * @param string[]    $cats Remembered category names, shared across items.
     * @return array[] List of items.
     */
    private function getSummaryItems($html, &$cats)
    {
        $items = [];
        foreach ($html->getElementsByTagName('h2') as $title) {
            if ($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            // The category heading is looked up two non-text siblings before
            // the title. NOTE(review): presumably one separator element sits
            // in between on the LWN page; confirm against the live markup.
            $cat = $title->previousSibling;
            $this->jumpToPreviousTag($cat);
            $cat = $cat !== null ? $cat->previousSibling : null;
            $this->jumpToPreviousTag($cat);

            // The prefix already ends with a space when it is not empty.
            $item = ['title' => $this->getItemPrefix($cat, $cats) . $title->textContent];
            $items[] = array_merge($item, $this->getArticleContent($title));
        }
        return $items;
    }

    /**
     * Builds items for a chunk that contains Cat3HL headings: one item per
     * <p class="Cat3HL"> paragraph, followed by the SummaryHL <h2> items.
     *
     * @param DOMDocument $html Parsed chunk.
     * @return array[] List of items.
     */
    private function getAnnouncements($html)
    {
        $items = [];
        $cats = ['', '', ''];

        foreach ($html->getElementsByTagName('p') as $newsletters) {
            if ($newsletters->getAttribute('class') !== 'Cat3HL') {
                continue;
            }

            $cat = $newsletters->previousSibling;
            $this->jumpToPreviousTag($cat);
            $prefix = $this->getItemPrefix($cat, $cats);

            $items[] = [
                // These entries have no link of their own; a running index
                // keeps the URIs distinct.
                'uri' => self::URI . '#' . count($items),
                'timestamp' => $this->editionTimeStamp,
                'author' => 'LWN',
                'title' => $prefix . $newsletters->textContent,
                'content' => $this->collectContent(
                    $newsletters,
                    ['Cat1HL', 'Cat2HL', 'Cat3HL'],
                    false
                ),
            ];
        }

        // The category names remembered above carry over to the <h2> items.
        return array_merge($items, $this->getSummaryItems($html, $cats));
    }

    /**
     * Builds items for a chunk without Cat3HL headings from its SummaryHL
     * <h2> headings.
     *
     * @param DOMDocument $html Parsed chunk.
     * @return array[] List of items.
     */
    private function getBriefItems($html)
    {
        $cats = ['', '', ''];
        return $this->getSummaryItems($html, $cats);
    }
}