aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bridges/EconomistBridge.php25
1 files changed, 19 insertions, 6 deletions
diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php
index c30261a0..652c7ea6 100644
--- a/bridges/EconomistBridge.php
+++ b/bridges/EconomistBridge.php
@@ -95,23 +95,36 @@ class EconomistBridge extends FeedExpander {
protected function parseItem($feedItem){
$item = parent::parseItem($feedItem);
-
$article = getSimpleHTMLDOM($item['uri']);
// before the article can be added, it needs to be cleaned up, thus, the extra function
- $item['content'] = $this->cleanContent($article);
+ // We also need to distinguish between old-style and new-style articles
+ if ($article->find('article', 0)->getAttribute('data-test-id') == 'Article') {
+ $contentNode = 'div.layout-article-body';
+ $imgNode = 'div.article__lead-image';
+ $categoryNode = 'span.article__subheadline';
+ } else {
+ $contentNode = 'div.e4sqmp1';
+ $imgNode = 'figure.e3y6nua0';
+ $categoryNode = 'span.ern1uyf0';
+ }
+
+ $item['content'] = $this->cleanContent($article, $contentNode);
// only the article lead image is retained if it's there
- if (!is_null($article->find('div.article__lead-image', 0))) {
- $item['enclosures'][] = $article->find('div.article__lead-image', 0)->find('img', 0)->getAttribute('src');
+ if (!is_null($article->find($imgNode, 0))) {
+ $item['enclosures'][] = $article->find($imgNode, 0)->find('img', 0)->getAttribute('src');
} else {
$item['enclosures'][] = '';
}
+ // Add the subheadline as a category. This yields a link in new articles
+ // and plain text in old articles
+ $item['categories'][] = $article->find($categoryNode, 0)->innertext;
return $item;
}
- private function cleanContent($article){
+ private function cleanContent($article, $contentNode){
// the actual article is in this div
- $content = $article->find('div.layout-article-body', 0)->innertext;
+ $content = $article->find($contentNode, 0)->innertext;
// clean the article content. Remove all div's since the text is in paragraph elements
foreach (array(
'<div '