aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- bridges/CssSelectorFeedExpanderBridge.php 4
-rw-r--r-- bridges/NyaaTorrentsBridge.php 43
-rw-r--r-- composer.json 4
-rw-r--r-- formats/AtomFormat.php 18
-rw-r--r-- formats/MrssFormat.php 30
-rw-r--r-- lib/FeedParser.php 100
-rw-r--r-- lib/FormatAbstract.php 2
7 files changed, 121 insertions, 80 deletions
diff --git a/bridges/CssSelectorFeedExpanderBridge.php b/bridges/CssSelectorFeedExpanderBridge.php
index 9f332fb9..49bbd473 100644
--- a/bridges/CssSelectorFeedExpanderBridge.php
+++ b/bridges/CssSelectorFeedExpanderBridge.php
@@ -50,7 +50,9 @@ class CssSelectorFeedExpanderBridge extends CssSelectorBridge
$discard_thumbnail = $this->getInput('discard_thumbnail');
$limit = $this->getInput('limit');
- $source_feed = (new FeedParser())->parseFeed(getContents($url));
+ $feedParser = new FeedParser();
+ $xml = getContents($url);
+ $source_feed = $feedParser->parseFeed($xml);
$items = $source_feed['items'];
// Map Homepage URL (Default: Root page)
diff --git a/bridges/NyaaTorrentsBridge.php b/bridges/NyaaTorrentsBridge.php
index f7eea07f..fcf2b197 100644
--- a/bridges/NyaaTorrentsBridge.php
+++ b/bridges/NyaaTorrentsBridge.php
@@ -62,52 +62,27 @@ class NyaaTorrentsBridge extends BridgeAbstract
public function collectData()
{
- // Manually parsing because we need to acccess the nyaa namespace in the xml
- $xml = simplexml_load_string(getContents($this->getURI()));
- $channel = $xml->channel[0];
- $feed = [];
- $feed['title'] = trim((string)$channel->title);
- $feed['uri'] = trim((string)$channel->link);
- if (!empty($channel->image)) {
- $feed['icon'] = trim((string)$channel->image->url);
- }
- $items = $xml->channel[0]->item;
- foreach ($items as $feedItem) {
- $item = [
- 'title' => (string) $feedItem->title,
- 'uri' => (string) $feedItem->link,
- ];
-
+ $feedParser = new FeedParser();
+ $feed = $feedParser->parseFeed(getContents($this->getURI()));
+ foreach ($feed['items'] as $item) {
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
-
- $nyaaNamespace = (array)($feedItem->children('nyaa', true));
- $item = array_merge($item, $nyaaNamespace);
-
- // Convert URI from torrent file to web page
$item['uri'] = str_replace('/download/', '/view/', $item['uri']);
$item['uri'] = str_replace('.torrent', '', $item['uri']);
-
- $item_html = getSimpleHTMLDOMCached($item['uri']);
- if ($item_html) {
- // Retrieve full description from page contents
- $item_desc = str_get_html(
- markdownToHtml(html_entity_decode($item_html->find('#torrent-description', 0)->innertext))
- );
-
- // Retrieve image for thumbnail or generic logo fallback
+ $dom = getSimpleHTMLDOMCached($item['uri']);
+ if ($dom) {
+ $description = $dom->find('#torrent-description', 0)->innertext ?? '';
+ $itemDom = str_get_html(markdownToHtml(html_entity_decode($description)));
$item_image = $this->getURI() . 'static/img/avatar/default.png';
- foreach ($item_desc->find('img') as $img) {
+ foreach ($itemDom->find('img') as $img) {
if (strpos($img->src, 'prez') === false) {
$item_image = $img->src;
break;
}
}
-
$item['enclosures'] = [$item_image];
- $item['content'] = $item_desc;
+ $item['content'] = (string) $itemDom;
}
-
$this->items[] = $item;
if (count($this->items) >= 10) {
break;
diff --git a/composer.json b/composer.json
index 31e31d74..0e7abb84 100644
--- a/composer.json
+++ b/composer.json
@@ -28,6 +28,7 @@
"ext-openssl": "*",
"ext-libxml": "*",
"ext-simplexml": "*",
+ "ext-dom": "*",
"ext-json": "*"
},
"require-dev": {
@@ -38,8 +39,7 @@
"ext-memcached": "Allows to use memcached as cache type",
"ext-sqlite3": "Allows to use an SQLite database for caching",
"ext-zip": "Required for FDroidRepoBridge",
- "ext-intl": "Required for OLXBridge",
- "ext-dom": "Allows to use some bridges based on XPath expressions"
+ "ext-intl": "Required for OLXBridge"
},
"autoload-dev": {
"psr-4": {
diff --git a/formats/AtomFormat.php b/formats/AtomFormat.php
index 9886e4b7..d59e42fe 100644
--- a/formats/AtomFormat.php
+++ b/formats/AtomFormat.php
@@ -16,6 +16,8 @@ class AtomFormat extends FormatAbstract
public function stringify()
{
+ $document = new \DomDocument('1.0', $this->getCharset());
+
$feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos();
@@ -25,7 +27,6 @@ class AtomFormat extends FormatAbstract
$uri = $extraInfos['uri'];
}
- $document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true;
$feed = $document->createElementNS(self::ATOM_NS, 'feed');
$document->appendChild($feed);
@@ -81,6 +82,7 @@ class AtomFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) {
+ $itemArray = $item->toArray();
$entryTimestamp = $item->getTimestamp();
$entryTitle = $item->getTitle();
$entryContent = $item->getContent();
@@ -138,7 +140,19 @@ class AtomFormat extends FormatAbstract
$entry->appendChild($id);
$id->appendChild($document->createTextNode($entryID));
- if (!empty($entryUri)) {
+ if (isset($itemArray['itunes'])) {
+ $feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
+ foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
+ $itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
+ $entry->appendChild($itunesProperty);
+ $itunesProperty->appendChild($document->createTextNode($itunesValue));
+ }
+ $itunesEnclosure = $document->createElement('enclosure');
+ $entry->appendChild($itunesEnclosure);
+ $itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
+ $itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
+ $itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
+ } elseif (!empty($entryUri)) {
$entryLinkAlternate = $document->createElement('link');
$entry->appendChild($entryLinkAlternate);
$entryLinkAlternate->setAttribute('rel', 'alternate');
diff --git a/formats/MrssFormat.php b/formats/MrssFormat.php
index 984611c7..4fd06439 100644
--- a/formats/MrssFormat.php
+++ b/formats/MrssFormat.php
@@ -34,6 +34,8 @@ class MrssFormat extends FormatAbstract
public function stringify()
{
+ $document = new \DomDocument('1.0', $this->getCharset());
+
$feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos();
if (empty($extraInfos['uri'])) {
@@ -42,7 +44,6 @@ class MrssFormat extends FormatAbstract
$uri = $extraInfos['uri'];
}
- $document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true;
$feed = $document->createElement('rss');
$document->appendChild($feed);
@@ -99,22 +100,23 @@ class MrssFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) {
+ $itemArray = $item->toArray();
$itemTimestamp = $item->getTimestamp();
$itemTitle = $item->getTitle();
$itemUri = $item->getURI();
$itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : '';
- $entryID = $item->getUid();
+ $itemUid = $item->getUid();
$isPermaLink = 'false';
- if (empty($entryID) && !empty($itemUri)) {
+ if (empty($itemUid) && !empty($itemUri)) {
// Fallback to provided URI
- $entryID = $itemUri;
+ $itemUid = $itemUri;
$isPermaLink = 'true';
}
- if (empty($entryID)) {
+ if (empty($itemUid)) {
// Fallback to title and content
- $entryID = hash('sha1', $itemTitle . $itemContent);
+ $itemUid = hash('sha1', $itemTitle . $itemContent);
}
$entry = $document->createElement('item');
@@ -126,7 +128,19 @@ class MrssFormat extends FormatAbstract
$entryTitle->appendChild($document->createTextNode($itemTitle));
}
- if (!empty($itemUri)) {
+ if (isset($itemArray['itunes'])) {
+ $feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
+ foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
+ $itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
+ $entry->appendChild($itunesProperty);
+ $itunesProperty->appendChild($document->createTextNode($itunesValue));
+ }
+ $itunesEnclosure = $document->createElement('enclosure');
+ $entry->appendChild($itunesEnclosure);
+ $itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
+ $itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
+ $itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
+ } if (!empty($itemUri)) {
$entryLink = $document->createElement('link');
$entry->appendChild($entryLink);
$entryLink->appendChild($document->createTextNode($itemUri));
@@ -135,7 +149,7 @@ class MrssFormat extends FormatAbstract
$entryGuid = $document->createElement('guid');
$entryGuid->setAttribute('isPermaLink', $isPermaLink);
$entry->appendChild($entryGuid);
- $entryGuid->appendChild($document->createTextNode($entryID));
+ $entryGuid->appendChild($document->createTextNode($itemUid));
if (!empty($itemTimestamp)) {
$entryPublished = $document->createElement('pubDate');
diff --git a/lib/FeedParser.php b/lib/FeedParser.php
index 1393f5f5..2d982de1 100644
--- a/lib/FeedParser.php
+++ b/lib/FeedParser.php
@@ -3,11 +3,13 @@
declare(strict_types=1);
/**
- * Very basic and naive feed parser that srapes out rss 0.91, 1.0, 2.0 and atom 1.0.
+ * Very basic and naive feed parser.
*
- * Emit arrays meant to be used inside rss-bridge.
+ * Scrapes out rss 0.91, 1.0, 2.0 and atom 1.0.
*
- * The feed item structure is identical to that of FeedItem
+ * Produces arrays meant to be used inside rss-bridge.
+ *
+ * The item structure is tweaked so that it works with FeedItem
*/
final class FeedParser
{
@@ -85,9 +87,7 @@ final class FeedParser
public function parseAtomItem(\SimpleXMLElement $feedItem): array
{
- // Some ATOM entries also contain RSS 2.0 fields
$item = $this->parseRss2Item($feedItem);
-
if (isset($feedItem->id)) {
$item['uri'] = (string)$feedItem->id;
}
@@ -131,8 +131,35 @@ final class FeedParser
public function parseRss2Item(\SimpleXMLElement $feedItem): array
{
- // Primary data is compatible to 0.91 with some additional data
- $item = $this->parseRss091Item($feedItem);
+ $item = [
+ 'uri' => '',
+ 'title' => '',
+ 'content' => '',
+ 'timestamp' => '',
+ 'author' => '',
+ //'uid' => null,
+ //'categories' => [],
+ //'enclosures' => [],
+ ];
+
+ foreach ($feedItem as $k => $v) {
+ $hasChildren = count($v) !== 0;
+ if (!$hasChildren) {
+ $item[$k] = (string) $v;
+ }
+ }
+
+ if (isset($feedItem->link)) {
+ // todo: trim uri
+ $item['uri'] = (string)$feedItem->link;
+ }
+ if (isset($feedItem->title)) {
+ $item['title'] = html_entity_decode((string)$feedItem->title);
+ }
+ if (isset($feedItem->description)) {
+ $item['content'] = (string)$feedItem->description;
+ }
+
$namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']);
@@ -140,7 +167,24 @@ final class FeedParser
if (isset($namespaces['media'])) {
$media = $feedItem->children($namespaces['media']);
}
-
+ foreach ($namespaces as $namespaceName => $namespaceUrl) {
+ if (in_array($namespaceName, ['', 'content', 'media'])) {
+ continue;
+ }
+ $module = $feedItem->children($namespaceUrl);
+ $item[$namespaceName] = [];
+ foreach ($module as $moduleKey => $moduleValue) {
+ $item[$namespaceName][$moduleKey] = (string) $moduleValue;
+ }
+ }
+ if (isset($namespaces['itunes'])) {
+ $enclosure = $feedItem->enclosure;
+ $item['enclosure'] = [
+ 'url' => (string) $enclosure['url'],
+ 'length' => (string) $enclosure['length'],
+ 'type' => (string) $enclosure['type'],
+ ];
+ }
if (isset($feedItem->guid)) {
// Pluck out a url from guid
foreach ($feedItem->guid->attributes() as $attribute => $value) {
@@ -185,29 +229,12 @@ final class FeedParser
public function parseRss1Item(\SimpleXMLElement $feedItem): array
{
- // 1.0 adds optional elements around the 0.91 standard
- $item = $this->parseRss091Item($feedItem);
- $namespaces = $feedItem->getNamespaces(true);
- if (isset($namespaces['dc'])) {
- $dc = $feedItem->children($namespaces['dc']);
- if (isset($dc->date)) {
- $item['timestamp'] = strtotime((string)$dc->date);
- }
- if (isset($dc->creator)) {
- $item['author'] = (string)$dc->creator;
- }
- }
- return $item;
- }
-
- public function parseRss091Item(\SimpleXMLElement $feedItem): array
- {
$item = [
- 'uri' => null,
- 'title' => null,
- 'content' => null,
- 'timestamp' => null,
- 'author' => null,
+ 'uri' => '',
+ 'title' => '',
+ 'content' => '',
+ 'timestamp' => '',
+ 'author' => '',
//'uid' => null,
//'categories' => [],
//'enclosures' => [],
@@ -219,12 +246,19 @@ final class FeedParser
if (isset($feedItem->title)) {
$item['title'] = html_entity_decode((string)$feedItem->title);
}
- // rss 0.91 doesn't support timestamps
- // rss 0.91 doesn't support authors
- // rss 0.91 doesn't support enclosures
if (isset($feedItem->description)) {
$item['content'] = (string)$feedItem->description;
}
+ $namespaces = $feedItem->getNamespaces(true);
+ if (isset($namespaces['dc'])) {
+ $dc = $feedItem->children($namespaces['dc']);
+ if (isset($dc->date)) {
+ $item['timestamp'] = strtotime((string)$dc->date);
+ }
+ if (isset($dc->creator)) {
+ $item['author'] = (string)$dc->creator;
+ }
+ }
return $item;
}
}
diff --git a/lib/FormatAbstract.php b/lib/FormatAbstract.php
index b05a5764..c76d1e42 100644
--- a/lib/FormatAbstract.php
+++ b/lib/FormatAbstract.php
@@ -2,6 +2,8 @@
abstract class FormatAbstract
{
+ public const ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd';
+
const MIME_TYPE = 'text/plain';
protected string $charset = 'UTF-8';