aboutsummaryrefslogtreecommitdiff
path: root/lib/XPathAbstract.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/XPathAbstract.php')
-rw-r--r--lib/XPathAbstract.php1162
1 files changed, 594 insertions, 568 deletions
diff --git a/lib/XPathAbstract.php b/lib/XPathAbstract.php
index 0ca1587b..686addf4 100644
--- a/lib/XPathAbstract.php
+++ b/lib/XPathAbstract.php
@@ -15,572 +15,598 @@
* This class extends {@see BridgeAbstract}, which means it incorporates and
* extends all of its functionality.
**/
-abstract class XPathAbstract extends BridgeAbstract {
-
- /**
- * Source Web page URL (should provide either HTML or XML content)
- * You can specify any website URL which serves data suited for display in RSS feeds
- * (for example a news blog).
- *
- * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
- */
- const FEED_SOURCE_URL = '';
-
- /**
- * XPath expression for extracting the feed title from the source page.
- * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
- * is used instead as the feed's title.
- *
- * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
- */
- const XPATH_EXPRESSION_FEED_TITLE = './/title';
-
- /**
- * XPath expression for extracting the feed favicon URL from the source page.
- * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
- * is used instead as the feed's favicon URL.
- *
- * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
- */
- const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';
-
- /**
- * XPath expression for extracting the feed items from the source page
- * Enter an XPath expression matching a list of dom nodes, each node containing one
- * feed article item in total (usually a surrounding <div> or <span> tag). This will
- * be the context nodes for all of the following expressions. This expression usually
- * starts with a single forward slash.
- *
- * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM = '';
-
- /**
- * XPath expression for extracting an item title from the item context
- * This expression should match a node contained within each article item node
- * containing the article headline. It should start with a dot followed by two
- * forward slashes, referring to any descendant nodes of the article item node.
- *
- * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_TITLE = '';
-
- /**
- * XPath expression for extracting an item's content from the item context
- * This expression should match a node contained within each article item node
- * containing the article content or description. It should start with a dot
- * followed by two forward slashes, referring to any descendant nodes of the
- * article item node.
- *
- * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_CONTENT = '';
-
- /**
- * XPath expression for extracting an item link from the item context
- * This expression should match a node's attribute containing the article URL
- * (usually the href attribute of an <a> tag). It should start with a dot
- * followed by two forward slashes, referring to any descendant nodes of
- * the article item node. Attributes can be selected by prepending an @ char
- * before the attributes name.
- *
- * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_URI = '';
-
- /**
- * XPath expression for extracting an item author from the item context
- * This expression should match a node contained within each article item
- * node containing the article author's name. It should start with a dot
- * followed by two forward slashes, referring to any descendant nodes of
- * the article item node.
- *
- * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_AUTHOR = '';
-
- /**
- * XPath expression for extracting an item timestamp from the item context
- * This expression should match a node or node's attribute containing the
- * article timestamp or date (parsable by PHP's strtotime function). It
- * should start with a dot followed by two forward slashes, referring to
- * any descendant nodes of the article item node. Attributes can be
- * selected by prepending an @ char before the attributes name.
- *
- * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';
-
- /**
- * XPath expression for extracting item enclosures (media content like
- * images or movies) from the item context
- * This expression should match a node's attribute containing an article
- * image URL (usually the src attribute of an <img> tag or a style
- * attribute). It should start with a dot followed by two forward slashes,
- * referring to any descendant nodes of the article item node. Attributes
- * can be selected by prepending an @ char before the attributes name.
- *
- * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';
-
- /**
- * XPath expression for extracting an item category from the item context
- * This expression should match a node or node's attribute contained
- * within each article item node containing the article category. This
- * could be inside <div> or <span> tags or sometimes be hidden
- * in a data attribute. It should start with a dot followed by two
- * forward slashes, referring to any descendant nodes of the article
- * item node. Attributes can be selected by prepending an @ char
- * before the attributes name.
- *
- * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
- */
- const XPATH_EXPRESSION_ITEM_CATEGORIES = '';
-
- /**
- * Fix encoding
- * Set this to true for fixing feed encoding by invoking PHP's utf8_decode
- * function on all extracted texts. Try this in case you see "broken" or
- * "weird" characters in your feed where you'd normally expect umlauts
- * or any other non-ascii characters.
- *
- * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
- */
- const SETTING_FIX_ENCODING = false;
-
- /**
- * Internal storage for resulting feed name, automatically detected
- * @var string
- */
- private $feedName;
-
- /**
- * Internal storage for resulting feed name, automatically detected
- * @var string
- */
- private $feedUri;
-
- /**
- * Internal storage for resulting feed favicon, automatically detected
- * @var string
- */
- private $feedIcon;
-
- public function getName(){
- return $this->feedName ?: parent::getName();
- }
-
- public function getURI() {
- return $this->feedUri ?: parent::getURI();
- }
-
- public function getIcon() {
- return $this->feedIcon ?: parent::getIcon();
- }
-
- /**
- * Source Web page URL (should provide either HTML or XML content)
- * @return string
- */
- protected function getSourceUrl(){
- return static::FEED_SOURCE_URL;
- }
-
- /**
- * XPath expression for extracting the feed title from the source page
- * @return string
- */
- protected function getExpressionTitle(){
- return static::XPATH_EXPRESSION_FEED_TITLE;
- }
-
- /**
- * XPath expression for extracting the feed favicon from the source page
- * @return string
- */
- protected function getExpressionIcon(){
- return static::XPATH_EXPRESSION_FEED_ICON;
- }
-
- /**
- * XPath expression for extracting the feed items from the source page
- * @return string
- */
- protected function getExpressionItem(){
- return static::XPATH_EXPRESSION_ITEM;
- }
-
- /**
- * XPath expression for extracting an item title from the item context
- * @return string
- */
- protected function getExpressionItemTitle(){
- return static::XPATH_EXPRESSION_ITEM_TITLE;
- }
-
- /**
- * XPath expression for extracting an item's content from the item context
- * @return string
- */
- protected function getExpressionItemContent(){
- return static::XPATH_EXPRESSION_ITEM_CONTENT;
- }
-
- /**
- * XPath expression for extracting an item link from the item context
- * @return string
- */
- protected function getExpressionItemUri(){
- return static::XPATH_EXPRESSION_ITEM_URI;
- }
-
- /**
- * XPath expression for extracting an item author from the item context
- * @return string
- */
- protected function getExpressionItemAuthor(){
- return static::XPATH_EXPRESSION_ITEM_AUTHOR;
- }
-
- /**
- * XPath expression for extracting an item timestamp from the item context
- * @return string
- */
- protected function getExpressionItemTimestamp(){
- return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
- }
-
- /**
- * XPath expression for extracting item enclosures (media content like
- * images or movies) from the item context
- * @return string
- */
- protected function getExpressionItemEnclosures(){
- return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
- }
-
- /**
- * XPath expression for extracting an item category from the item context
- * @return string
- */
- protected function getExpressionItemCategories(){
- return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
- }
-
- /**
- * Fix encoding
- * @return string
- */
- protected function getSettingFixEncoding(){
- return static::SETTING_FIX_ENCODING;
- }
-
- /**
- * Internal helper method for quickly accessing all the user defined constants
- * in derived classes
- *
- * @param $name
- * @return bool|string
- */
- private function getParam($name){
- switch($name) {
-
- case 'url':
- return $this->getSourceUrl();
- case 'feed_title':
- return $this->getExpressionTitle();
- case 'feed_icon':
- return $this->getExpressionIcon();
- case 'item':
- return $this->getExpressionItem();
- case 'title':
- return $this->getExpressionItemTitle();
- case 'content':
- return $this->getExpressionItemContent();
- case 'uri':
- return $this->getExpressionItemUri();
- case 'author':
- return $this->getExpressionItemAuthor();
- case 'timestamp':
- return $this->getExpressionItemTimestamp();
- case 'enclosures':
- return $this->getExpressionItemEnclosures();
- case 'categories':
- return $this->getExpressionItemCategories();
- case 'fix_encoding':
- return $this->getSettingFixEncoding();
- }
- }
-
- /**
- * Should provide the source website HTML content
- * can be easily overwritten for example if special headers or auth infos are required
- * @return string
- */
- protected function provideWebsiteContent() {
- return getContents($this->feedUri);
- }
-
- /**
- * Should provide the feeds title
- *
- * @param DOMXPath $xpath
- * @return string
- */
- protected function provideFeedTitle(DOMXPath $xpath) {
- $title = $xpath->query($this->getParam('feed_title'));
- if(count($title) === 1) {
- return $this->getItemValueOrNodeValue($title);
- }
- }
-
- /**
- * Should provide the URL of the feed's favicon
- *
- * @param DOMXPath $xpath
- * @return string
- */
- protected function provideFeedIcon(DOMXPath $xpath) {
- $icon = $xpath->query($this->getParam('feed_icon'));
- if(count($icon) === 1) {
- return $this->cleanMediaUrl($this->getItemValueOrNodeValue($icon));
- }
- }
-
- /**
- * Should provide the feed's items.
- *
- * @param DOMXPath $xpath
- * @return DOMNodeList
- */
- protected function provideFeedItems(DOMXPath $xpath) {
- return @$xpath->query($this->getParam('item'));
- }
-
- public function collectData() {
-
- $this->feedUri = $this->getParam('url');
-
- $webPageHtml = new DOMDocument();
- libxml_use_internal_errors(true);
- $webPageHtml->loadHTML($this->provideWebsiteContent());
- libxml_clear_errors();
- libxml_use_internal_errors(false);
-
- $xpath = new DOMXPath($webPageHtml);
-
- $this->feedName = $this->provideFeedTitle($xpath);
- $this->feedIcon = $this->provideFeedIcon($xpath);
-
- $entries = $this->provideFeedItems($xpath);
- if($entries === false) {
- return;
- }
-
- foreach ($entries as $entry) {
- $item = new \FeedItem();
- foreach(array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories') as $param) {
-
- $expression = $this->getParam($param);
- if('' === $expression) {
- continue;
- }
-
- //can be a string or DOMNodeList, depending on the expression result
- $typedResult = @$xpath->evaluate($expression, $entry);
- if ($typedResult === false || ($typedResult instanceof DOMNodeList && count($typedResult) === 0)
- || (is_string($typedResult) && strlen(trim($typedResult)) === 0)) {
- continue;
- }
-
- $item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));
-
- }
-
- $itemId = $this->generateItemId($item);
- if(null !== $itemId) {
- $item->setUid($itemId);
- }
-
- $this->items[] = $item;
- }
-
- }
-
- /**
- * @param $param
- * @param $value
- * @return string|array
- */
- protected function formatParamValue($param, $value)
- {
- $value = $this->fixEncoding($value);
- switch ($param) {
- case 'title':
- return $this->formatItemTitle($value);
- case 'content':
- return $this->formatItemContent($value);
- case 'uri':
- return $this->formatItemUri($value);
- case 'author':
- return $this->formatItemAuthor($value);
- case 'timestamp':
- return $this->formatItemTimestamp($value);
- case 'enclosures':
- return $this->formatItemEnclosures($value);
- case 'categories':
- return $this->formatItemCategories($value);
- }
- return $value;
- }
-
- /**
- * Formats the title of a feed item. Takes extracted raw title and returns it formatted
- * as string.
- * Can be easily overwritten for in case the value needs to be transformed into something
- * else.
- * @param string $value
- * @return string
- */
- protected function formatItemTitle($value) {
- return $value;
- }
-
- /**
- * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
- * timestamp as integer.
- * Can be easily overwritten for example if a special format has to be expected on the
- * source website.
- * @param string $value
- * @return string
- */
- protected function formatItemContent($value) {
- return $value;
- }
-
- /**
- * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
- * as string.
- * Can be easily overwritten for in case the value needs to be transformed into something
- * else.
- * @param string $value
- * @return string
- */
- protected function formatItemUri($value) {
- if(strlen($value) === 0) {
- return '';
- }
- if(strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
- return $value;
- }
-
- return urljoin($this->feedUri, $value);
- }
-
- /**
- * Formats the author of a feed item. Takes extracted raw author and returns it formatted
- * as string.
- * Can be easily overwritten for in case the value needs to be transformed into something
- * else.
- * @param string $value
- * @return string
- */
- protected function formatItemAuthor($value) {
- return $value;
- }
-
- /**
- * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
- * timestamp as integer.
- * Can be easily overwritten for example if a special format has to be expected on the
- * source website.
- * @param string $value
- * @return false|int
- */
- protected function formatItemTimestamp($value) {
- return strtotime($value);
- }
-
- /**
- * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
- * formatted as array.
- * Can be easily overwritten for in case the values need to be transformed into something
- * else.
- * @param string $value
- * @return array
- */
- protected function formatItemEnclosures($value) {
- return array($this->cleanMediaUrl($value));
- }
-
- /**
- * Formats the categories of a feed item. Takes extracted raw categories and returns them
- * formatted as array.
- * Can be easily overwritten for in case the values need to be transformed into something
- * else.
- * @param string $value
- * @return array
- */
- protected function formatItemCategories($value) {
- return array($value);
- }
-
- /**
- * @param $mediaUrl
- * @return string|void
- */
- protected function cleanMediaUrl($mediaUrl)
- {
- $pattern = '~(?:http(?:s)?:)?[\/a-zA-Z0-9\-=_,\.\%]+\.(?:jpg|gif|png|jpeg|ico|mp3|webp){1}~i';
- $result = preg_match($pattern, $mediaUrl, $matches);
- if(1 !== $result) {
- return;
- }
- return urljoin($this->feedUri, $matches[0]);
- }
-
- /**
- * @param $typedResult
- * @return string
- */
- protected function getItemValueOrNodeValue($typedResult)
- {
- if($typedResult instanceof DOMNodeList) {
- $item = $typedResult->item(0);
- if ($item instanceof DOMElement) {
- return trim($item->nodeValue);
- } elseif ($item instanceof DOMAttr) {
- return trim($item->value);
- } elseif ($item instanceof DOMText) {
- return trim($item->wholeText);
- }
- } elseif(is_string($typedResult) && strlen($typedResult) > 0) {
- return trim($typedResult);
- }
- returnServerError('Unknown type of XPath expression result.');
- }
-
- /**
- * Fixes feed encoding by invoking PHP's utf8_decode function on extracted texts.
- * Useful in case of "broken" or "weird" characters in the feed where you'd normally
- * expect umlauts.
- *
- * @param $input
- * @return string
- */
- protected function fixEncoding($input)
- {
- return $this->getParam('fix_encoding') ? utf8_decode($input) : $input;
- }
-
- /**
- * Allows overriding default mechanism determining items Uid's
- *
- * @param FeedItem $item
- * @return string|null
- */
- protected function generateItemId(\FeedItem $item) {
- return null; //auto generation
- }
+abstract class XPathAbstract extends BridgeAbstract
+{
+ /**
+ * Source Web page URL (should provide either HTML or XML content)
+ * You can specify any website URL which serves data suited for display in RSS feeds
+ * (for example a news blog).
+ *
+ * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
+ */
+ const FEED_SOURCE_URL = '';
+
+ /**
+ * XPath expression for extracting the feed title from the source page.
+ * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
+ * is used instead as the feed's title.
+ *
+ * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
+ */
+ const XPATH_EXPRESSION_FEED_TITLE = './/title';
+
+ /**
+ * XPath expression for extracting the feed favicon URL from the source page.
+ * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
+ * is used instead as the feed's favicon URL.
+ *
+ * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
+ */
+ const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';
+
+ /**
+ * XPath expression for extracting the feed items from the source page
+ * Enter an XPath expression matching a list of dom nodes, each node containing one
+ * feed article item in total (usually a surrounding <div> or <span> tag). This will
+ * be the context nodes for all of the following expressions. This expression usually
+ * starts with a single forward slash.
+ *
+ * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM = '';
+
+ /**
+ * XPath expression for extracting an item title from the item context
+ * This expression should match a node contained within each article item node
+ * containing the article headline. It should start with a dot followed by two
+ * forward slashes, referring to any descendant nodes of the article item node.
+ *
+ * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_TITLE = '';
+
+ /**
+ * XPath expression for extracting an item's content from the item context
+ * This expression should match a node contained within each article item node
+ * containing the article content or description. It should start with a dot
+ * followed by two forward slashes, referring to any descendant nodes of the
+ * article item node.
+ *
+ * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_CONTENT = '';
+
+ /**
+ * XPath expression for extracting an item link from the item context
+ * This expression should match a node's attribute containing the article URL
+ * (usually the href attribute of an <a> tag). It should start with a dot
+ * followed by two forward slashes, referring to any descendant nodes of
+ * the article item node. Attributes can be selected by prepending an @ char
+ * before the attribute's name.
+ *
+ * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_URI = '';
+
+ /**
+ * XPath expression for extracting an item author from the item context
+ * This expression should match a node contained within each article item
+ * node containing the article author's name. It should start with a dot
+ * followed by two forward slashes, referring to any descendant nodes of
+ * the article item node.
+ *
+ * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_AUTHOR = '';
+
+ /**
+ * XPath expression for extracting an item timestamp from the item context
+ * This expression should match a node or node's attribute containing the
+ * article timestamp or date (parsable by PHP's strtotime function). It
+ * should start with a dot followed by two forward slashes, referring to
+ * any descendant nodes of the article item node. Attributes can be
+ * selected by prepending an @ char before the attribute's name.
+ *
+ * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';
+
+ /**
+ * XPath expression for extracting item enclosures (media content like
+ * images or movies) from the item context
+ * This expression should match a node's attribute containing an article
+ * image URL (usually the src attribute of an <img> tag or a style
+ * attribute). It should start with a dot followed by two forward slashes,
+ * referring to any descendant nodes of the article item node. Attributes
+ * can be selected by prepending an @ char before the attribute's name.
+ *
+ * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';
+
+ /**
+ * XPath expression for extracting an item category from the item context
+ * This expression should match a node or node's attribute contained
+ * within each article item node containing the article category. This
+ * could be inside <div> or <span> tags or sometimes be hidden
+ * in a data attribute. It should start with a dot followed by two
+ * forward slashes, referring to any descendant nodes of the article
+ * item node. Attributes can be selected by prepending an @ char
+ * before the attribute's name.
+ *
+ * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
+ */
+ const XPATH_EXPRESSION_ITEM_CATEGORIES = '';
+
+ /**
+ * Fix encoding
+ * Set this to true for fixing feed encoding by invoking PHP's utf8_decode
+ * function on all extracted texts. Try this in case you see "broken" or
+ * "weird" characters in your feed where you'd normally expect umlauts
+ * or any other non-ascii characters.
+ *
+ * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
+ */
+ const SETTING_FIX_ENCODING = false;
+
+ /**
+ * Internal storage for resulting feed name, automatically detected
+ * @var string
+ */
+ private $feedName;
+
+ /**
+ * Internal storage for resulting feed URI, set from the source URL
+ * @var string
+ */
+ private $feedUri;
+
+ /**
+ * Internal storage for resulting feed favicon, automatically detected
+ * @var string
+ */
+ private $feedIcon;
+
+ public function getName()
+ {
+ return $this->feedName ?: parent::getName();
+ }
+
+ public function getURI()
+ {
+ return $this->feedUri ?: parent::getURI();
+ }
+
+ public function getIcon()
+ {
+ return $this->feedIcon ?: parent::getIcon();
+ }
+
+ /**
+ * Source Web page URL (should provide either HTML or XML content)
+ * @return string
+ */
+ protected function getSourceUrl()
+ {
+ return static::FEED_SOURCE_URL;
+ }
+
+ /**
+ * XPath expression for extracting the feed title from the source page
+ * @return string
+ */
+ protected function getExpressionTitle()
+ {
+ return static::XPATH_EXPRESSION_FEED_TITLE;
+ }
+
+ /**
+ * XPath expression for extracting the feed favicon from the source page
+ * @return string
+ */
+ protected function getExpressionIcon()
+ {
+ return static::XPATH_EXPRESSION_FEED_ICON;
+ }
+
+ /**
+ * XPath expression for extracting the feed items from the source page
+ * @return string
+ */
+ protected function getExpressionItem()
+ {
+ return static::XPATH_EXPRESSION_ITEM;
+ }
+
+ /**
+ * XPath expression for extracting an item title from the item context
+ * @return string
+ */
+ protected function getExpressionItemTitle()
+ {
+ return static::XPATH_EXPRESSION_ITEM_TITLE;
+ }
+
+ /**
+ * XPath expression for extracting an item's content from the item context
+ * @return string
+ */
+ protected function getExpressionItemContent()
+ {
+ return static::XPATH_EXPRESSION_ITEM_CONTENT;
+ }
+
+ /**
+ * XPath expression for extracting an item link from the item context
+ * @return string
+ */
+ protected function getExpressionItemUri()
+ {
+ return static::XPATH_EXPRESSION_ITEM_URI;
+ }
+
+ /**
+ * XPath expression for extracting an item author from the item context
+ * @return string
+ */
+ protected function getExpressionItemAuthor()
+ {
+ return static::XPATH_EXPRESSION_ITEM_AUTHOR;
+ }
+
+ /**
+ * XPath expression for extracting an item timestamp from the item context
+ * @return string
+ */
+ protected function getExpressionItemTimestamp()
+ {
+ return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
+ }
+
+ /**
+ * XPath expression for extracting item enclosures (media content like
+ * images or movies) from the item context
+ * @return string
+ */
+ protected function getExpressionItemEnclosures()
+ {
+ return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
+ }
+
+ /**
+ * XPath expression for extracting an item category from the item context
+ * @return string
+ */
+ protected function getExpressionItemCategories()
+ {
+ return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
+ }
+
+ /**
+ * Fix encoding
+ * @return bool
+ */
+ protected function getSettingFixEncoding()
+ {
+ return static::SETTING_FIX_ENCODING;
+ }
+
+ /**
+ * Internal helper method for quickly accessing all the user defined constants
+ * in derived classes
+ *
+ * @param $name
+ * @return bool|string
+ */
+ private function getParam($name)
+ {
+ switch ($name) {
+ case 'url':
+ return $this->getSourceUrl();
+ case 'feed_title':
+ return $this->getExpressionTitle();
+ case 'feed_icon':
+ return $this->getExpressionIcon();
+ case 'item':
+ return $this->getExpressionItem();
+ case 'title':
+ return $this->getExpressionItemTitle();
+ case 'content':
+ return $this->getExpressionItemContent();
+ case 'uri':
+ return $this->getExpressionItemUri();
+ case 'author':
+ return $this->getExpressionItemAuthor();
+ case 'timestamp':
+ return $this->getExpressionItemTimestamp();
+ case 'enclosures':
+ return $this->getExpressionItemEnclosures();
+ case 'categories':
+ return $this->getExpressionItemCategories();
+ case 'fix_encoding':
+ return $this->getSettingFixEncoding();
+ }
+ }
+
+ /**
+ * Should provide the source website HTML content
+ * Can be easily overwritten, for example if special headers or auth info are required
+ * @return string
+ */
+ protected function provideWebsiteContent()
+ {
+ return getContents($this->feedUri);
+ }
+
+ /**
+ * Should provide the feed's title
+ *
+ * @param DOMXPath $xpath
+ * @return string
+ */
+ protected function provideFeedTitle(DOMXPath $xpath)
+ {
+ $title = $xpath->query($this->getParam('feed_title'));
+ if (count($title) === 1) {
+ return $this->getItemValueOrNodeValue($title);
+ }
+ }
+
+ /**
+ * Should provide the URL of the feed's favicon
+ *
+ * @param DOMXPath $xpath
+ * @return string
+ */
+ protected function provideFeedIcon(DOMXPath $xpath)
+ {
+ $icon = $xpath->query($this->getParam('feed_icon'));
+ if (count($icon) === 1) {
+ return $this->cleanMediaUrl($this->getItemValueOrNodeValue($icon));
+ }
+ }
+
+ /**
+ * Should provide the feed's items.
+ *
+ * @param DOMXPath $xpath
+ * @return DOMNodeList
+ */
+ protected function provideFeedItems(DOMXPath $xpath)
+ {
+ return @$xpath->query($this->getParam('item'));
+ }
+
+ public function collectData()
+ {
+ $this->feedUri = $this->getParam('url');
+
+ $webPageHtml = new DOMDocument();
+ libxml_use_internal_errors(true);
+ $webPageHtml->loadHTML($this->provideWebsiteContent());
+ libxml_clear_errors();
+ libxml_use_internal_errors(false);
+
+ $xpath = new DOMXPath($webPageHtml);
+
+ $this->feedName = $this->provideFeedTitle($xpath);
+ $this->feedIcon = $this->provideFeedIcon($xpath);
+
+ $entries = $this->provideFeedItems($xpath);
+ if ($entries === false) {
+ return;
+ }
+
+ foreach ($entries as $entry) {
+ $item = new \FeedItem();
+ foreach (['title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories'] as $param) {
+ $expression = $this->getParam($param);
+ if ('' === $expression) {
+ continue;
+ }
+
+ //can be a string or DOMNodeList, depending on the expression result
+ $typedResult = @$xpath->evaluate($expression, $entry);
+ if (
+ $typedResult === false || ($typedResult instanceof DOMNodeList && count($typedResult) === 0)
+ || (is_string($typedResult) && strlen(trim($typedResult)) === 0)
+ ) {
+ continue;
+ }
+
+ $item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));
+ }
+
+ $itemId = $this->generateItemId($item);
+ if (null !== $itemId) {
+ $item->setUid($itemId);
+ }
+
+ $this->items[] = $item;
+ }
+ }
+
+ /**
+ * @param $param
+ * @param $value
+ * @return string|array
+ */
+ protected function formatParamValue($param, $value)
+ {
+ $value = $this->fixEncoding($value);
+ switch ($param) {
+ case 'title':
+ return $this->formatItemTitle($value);
+ case 'content':
+ return $this->formatItemContent($value);
+ case 'uri':
+ return $this->formatItemUri($value);
+ case 'author':
+ return $this->formatItemAuthor($value);
+ case 'timestamp':
+ return $this->formatItemTimestamp($value);
+ case 'enclosures':
+ return $this->formatItemEnclosures($value);
+ case 'categories':
+ return $this->formatItemCategories($value);
+ }
+ return $value;
+ }
+
+ /**
+ * Formats the title of a feed item. Takes extracted raw title and returns it formatted
+ * as string.
+ * Can be easily overwritten in case the value needs to be transformed into something
+ * else.
+ * @param string $value
+ * @return string
+ */
+ protected function formatItemTitle($value)
+ {
+ return $value;
+ }
+
+ /**
+ * Formats the content of a feed item. Takes extracted raw content and returns it formatted
+ * as string.
+ * Can be easily overwritten in case the value needs to be transformed into something
+ * else.
+ * @param string $value
+ * @return string
+ */
+ protected function formatItemContent($value)
+ {
+ return $value;
+ }
+
+ /**
+ * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
+ * as string.
+ * Can be easily overwritten in case the value needs to be transformed into something
+ * else.
+ * @param string $value
+ * @return string
+ */
+ protected function formatItemUri($value)
+ {
+ if (strlen($value) === 0) {
+ return '';
+ }
+ if (strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
+ return $value;
+ }
+
+ return urljoin($this->feedUri, $value);
+ }
+
+ /**
+ * Formats the author of a feed item. Takes extracted raw author and returns it formatted
+ * as string.
+ * Can be easily overwritten in case the value needs to be transformed into something
+ * else.
+ * @param string $value
+ * @return string
+ */
+ protected function formatItemAuthor($value)
+ {
+ return $value;
+ }
+
+ /**
+ * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
+ * timestamp as integer.
+ * Can be easily overwritten for example if a special format has to be expected on the
+ * source website.
+ * @param string $value
+ * @return false|int
+ */
+ protected function formatItemTimestamp($value)
+ {
+ return strtotime($value);
+ }
+
+ /**
+ * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
+ * formatted as array.
+ * Can be easily overwritten in case the values need to be transformed into something
+ * else.
+ * @param string $value
+ * @return array
+ */
+ protected function formatItemEnclosures($value)
+ {
+ return [$this->cleanMediaUrl($value)];
+ }
+
+ /**
+ * Formats the categories of a feed item. Takes extracted raw categories and returns them
+ * formatted as array.
+ * Can be easily overwritten in case the values need to be transformed into something
+ * else.
+ * @param string $value
+ * @return array
+ */
+ protected function formatItemCategories($value)
+ {
+ return [$value];
+ }
+
+ /**
+ * @param $mediaUrl
+ * @return string|void
+ */
+ protected function cleanMediaUrl($mediaUrl)
+ {
+ $pattern = '~(?:http(?:s)?:)?[\/a-zA-Z0-9\-=_,\.\%]+\.(?:jpg|gif|png|jpeg|ico|mp3|webp){1}~i';
+ $result = preg_match($pattern, $mediaUrl, $matches);
+ if (1 !== $result) {
+ return;
+ }
+ return urljoin($this->feedUri, $matches[0]);
+ }
+
+ /**
+ * @param $typedResult
+ * @return string
+ */
+ protected function getItemValueOrNodeValue($typedResult)
+ {
+ if ($typedResult instanceof DOMNodeList) {
+ $item = $typedResult->item(0);
+ if ($item instanceof DOMElement) {
+ return trim($item->nodeValue);
+ } elseif ($item instanceof DOMAttr) {
+ return trim($item->value);
+ } elseif ($item instanceof DOMText) {
+ return trim($item->wholeText);
+ }
+ } elseif (is_string($typedResult) && strlen($typedResult) > 0) {
+ return trim($typedResult);
+ }
+ returnServerError('Unknown type of XPath expression result.');
+ }
+
+ /**
+ * Fixes feed encoding by invoking PHP's utf8_decode function on extracted texts.
+ * Useful in case of "broken" or "weird" characters in the feed where you'd normally
+ * expect umlauts.
+ *
+ * @param $input
+ * @return string
+ */
+ protected function fixEncoding($input)
+ {
+ return $this->getParam('fix_encoding') ? utf8_decode($input) : $input;
+ }
+
+ /**
+ * Allows overriding the default mechanism for determining an item's UID
+ *
+ * @param FeedItem $item
+ * @return string|null
+ */
+ protected function generateItemId(\FeedItem $item)
+ {
+ return null; //auto generation
+ }
}