aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar hleskien <34342248+hleskien@users.noreply.github.com> 2024-02-10 04:42:22 +0100
committerGravatar GitHub <noreply@github.com> 2024-02-10 04:42:22 +0100
commit8e8028b7860ebb09eae92dcd43b4b6916d26d4d6 (patch)
tree28e27119878294c066be11c8a89a10ecdd682d39
parentff7840d60f212dd5f395378103bc22f0c273641d (diff)
downloadrss-bridge-8e8028b7860ebb09eae92dcd43b4b6916d26d4d6.tar.gz
rss-bridge-8e8028b7860ebb09eae92dcd43b4b6916d26d4d6.tar.zst
rss-bridge-8e8028b7860ebb09eae92dcd43b4b6916d26d4d6.zip
Adopt WebDriverAbstract as a solution for active (JavaScript) websites (#3971)
* first working version --------- Co-authored-by: Dag <me@dvikan.no>
-rw-r--r--bridges/GULPProjekteBridge.php164
-rw-r--r--bridges/ScalableCapitalBlogBridge.php73
-rw-r--r--config.default.ini.php10
-rw-r--r--docs/05_Bridge_API/04_WebDriverAbstract.md83
-rw-r--r--docs/05_Bridge_API/05_XPathAbstract.md (renamed from docs/05_Bridge_API/04_XPathAbstract.md)0
-rw-r--r--docs/05_Bridge_API/index.md3
-rw-r--r--lib/WebDriverAbstract.php141
7 files changed, 473 insertions, 1 deletions
diff --git a/bridges/GULPProjekteBridge.php b/bridges/GULPProjekteBridge.php
new file mode 100644
index 00000000..e0bb8cbe
--- /dev/null
+++ b/bridges/GULPProjekteBridge.php
@@ -0,0 +1,164 @@
+<?php
+
+use Facebook\WebDriver\Exception\NoSuchElementException;
+use Facebook\WebDriver\Remote\RemoteWebElement;
+use Facebook\WebDriver\WebDriverBy;
+use Facebook\WebDriver\WebDriverExpectedCondition;
+
+class GULPProjekteBridge extends WebDriverAbstract
+{
+ const NAME = 'GULP Projekte';
+ const URI = 'https://www.gulp.de/gulp2/g/projekte';
+ const DESCRIPTION = 'Projektsuche';
+ const MAINTAINER = 'hleskien';
+
+ const MAXITEMS = 60;
+
+    /**
+     * Extends the browser options of the parent class with German as the
+     * accepted language.
+     *
+     * @return Facebook\WebDriver\Chrome\ChromeOptions The parent's options plus '--accept-lang=de'.
+     */
+    protected function getBrowserOptions()
+    {
+        $options = parent::getBrowserOptions();
+        $languageArguments = ['--accept-lang=de'];
+        $options->addArguments($languageArguments);
+
+        return $options;
+    }
+
+    /**
+     * Dismisses the cookie banner: waits until the element with the id
+     * 'onetrust-reject-all-handler' is visible, clicks it and then waits
+     * until it is no longer visible.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    protected function clickAwayCookieBanner()
+    {
+        $driver = $this->getDriver();
+        $rejectButtonLocator = WebDriverBy::id('onetrust-reject-all-handler');
+
+        $driver->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButtonLocator));
+        $driver->findElement($rejectButtonLocator)->click();
+        $driver->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButtonLocator));
+    }
+
+    /**
+     * Clicks the 'next-page' link of the paginator and waits until the
+     * paginator no longer contains a next-page link with the href it had
+     * before the click.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    protected function clickNextPage()
+    {
+        $driver = $this->getDriver();
+        $nextPageXPath = '//app-linkable-paginator//li[@id="next-page"]/a';
+
+        $nextPageLink = $driver->findElement(WebDriverBy::xpath($nextPageXPath));
+        $previousHref = $nextPageLink->getAttribute('href');
+        $nextPageLink->click();
+
+        // The link with the old href disappearing is used as the signal
+        // that the paginator has been updated.
+        $staleLinkLocator = WebDriverBy::xpath($nextPageXPath . '[@href="' . $previousHref . '"]');
+        $driver->wait()->until(
+            WebDriverExpectedCondition::not(
+                WebDriverExpectedCondition::presenceOfElementLocated($staleLinkLocator)
+            )
+        );
+    }
+
+    /**
+     * Returns the uri of the first image found inside the given item (the
+     * 'Projektanbieter' logo), or false if the item contains no image or
+     * the image has no usable 'src' attribute.
+     *
+     * A 'src' starting with 'http' is returned unchanged. Any other value is
+     * treated as a relative path and appended to self::URI with its last
+     * path segment removed.
+     *
+     * @param RemoteWebElement $item The search result element to inspect.
+     * @return string | false
+     */
+    protected function getLogo(RemoteWebElement $item)
+    {
+        try {
+            $src = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
+        } catch (NoSuchElementException $e) {
+            return false;
+        }
+
+        // getAttribute() yields null when the attribute is missing; without
+        // this guard the bare base uri would be returned as the logo.
+        if (!is_string($src) || $src === '') {
+            return false;
+        }
+
+        if (str_starts_with($src, 'http')) {
+            // different domain
+            return $src;
+        }
+
+        // relative path: keep everything of URI up to and including the last '/'
+        $baseUri = substr(self::URI, 0, strrpos(self::URI, '/') + 1);
+
+        return $baseUri . $src;
+    }
+
+    /**
+     * Converts a string like "vor einigen Minuten" into a reasonable timestamp.
+     *
+     * The second word of the string is taken as the quantity; the words
+     * 'einem', 'einer' and 'einigen' count as 1. Besides the quantity of the
+     * named unit, the current values of all smaller units are subtracted as
+     * well, so the result is not more precise than the information given
+     * (e.g. for minutes the result falls on a full minute).
+     *
+     * @param string $timeAgo Relative time containing 'Sekunde', 'Minute', 'Stunde' or 'Tag'.
+     * @return int Unix timestamp.
+     * @throws UnexpectedValueException If the string contains none of the known units.
+     * @throws Exception If the DateInterval can't be parsed.
+     */
+    protected function getTimestamp(string $timeAgo): int
+    {
+        $now = new DateTimeImmutable();
+        [$hours, $minutes, $seconds] = explode(' ', $now->format('H i s'));
+
+        // A string without a second word has no quantity; fall back to ''
+        // (=> 0) instead of raising an undefined offset warning.
+        $quantityStr = explode(' ', $timeAgo)[1] ?? '';
+        // convert possible word into a number
+        if (in_array($quantityStr, ['einem', 'einer', 'einigen'], true)) {
+            $quantity = 1;
+        } else {
+            $quantity = intval($quantityStr);
+        }
+
+        // subtract time ago + inferior units for lower precision
+        if (str_contains($timeAgo, 'Sekunde')) {
+            $intervalSpec = 'PT' . $quantity . 'S';
+        } elseif (str_contains($timeAgo, 'Minute')) {
+            $intervalSpec = 'PT' . $quantity . 'M' . $seconds . 'S';
+        } elseif (str_contains($timeAgo, 'Stunde')) {
+            $intervalSpec = 'PT' . $quantity . 'H' . $minutes . 'M' . $seconds . 'S';
+        } elseif (str_contains($timeAgo, 'Tag')) {
+            $intervalSpec = 'P' . $quantity . 'DT' . $hours . 'H' . $minutes . 'M' . $seconds . 'S';
+        } else {
+            throw new UnexpectedValueException($timeAgo);
+        }
+
+        return $now->sub(new DateInterval($intervalSpec))->getTimestamp();
+    }
+
+    /**
+     * The main loop which clicks through search result pages and puts
+     * the content into the $items array.
+     *
+     * Paging stops as soon as at least MAXITEMS items have been collected
+     * or when the paginator offers no next-page link any more. The webdriver
+     * session is always cleaned up at the end.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    public function collectData()
+    {
+        parent::collectData();
+
+        try {
+            $this->clickAwayCookieBanner();
+            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));
+
+            while (true) {
+                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
+                foreach ($items as $item) {
+                    $feedItem = new FeedItem();
+
+                    $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
+                    $feedItem->setTitle($heading->getText());
+                    $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));
+                    $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
+                    if ($logo = $this->getLogo($item)) {
+                        $feedItem->setEnclosures([$logo]);
+                    }
+                    if (str_contains($info->getText(), 'Projektanbieter:')) {
+                        $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
+                    } else {
+                        // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
+                        $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
+                    }
+                    $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());
+                    $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
+                    $feedItem->setTimestamp($this->getTimestamp($timeAgo));
+
+                    $this->items[] = $feedItem;
+                }
+
+                if (count($this->items) >= self::MAXITEMS) {
+                    break;
+                }
+
+                try {
+                    $this->clickNextPage();
+                } catch (NoSuchElementException $e) {
+                    // No next-page link: the result list is exhausted. Keep
+                    // the items collected so far instead of failing the feed.
+                    break;
+                }
+            }
+        } finally {
+            $this->cleanUp();
+        }
+    }
+}
diff --git a/bridges/ScalableCapitalBlogBridge.php b/bridges/ScalableCapitalBlogBridge.php
new file mode 100644
index 00000000..6f95efb3
--- /dev/null
+++ b/bridges/ScalableCapitalBlogBridge.php
@@ -0,0 +1,73 @@
+<?php
+
+use Facebook\WebDriver\WebDriverBy;
+use Facebook\WebDriver\WebDriverExpectedCondition;
+
+class ScalableCapitalBlogBridge extends WebDriverAbstract
+{
+ const NAME = 'Scalable Capital Blog';
+ const URI = 'https://de.scalable.capital/blog';
+ const DESCRIPTION = 'Alle Artikel';
+ const MAINTAINER = 'hleskien';
+
+    /**
+     * Takes the browser options of the parent class and appends the Chrome
+     * argument '--accept-lang=de' (German as accepted language).
+     *
+     * @return Facebook\WebDriver\Chrome\ChromeOptions
+     */
+    protected function getBrowserOptions()
+    {
+        $germanOptions = parent::getBrowserOptions();
+        $extraArguments = ['--accept-lang=de'];
+        $germanOptions->addArguments($extraArguments);
+
+        return $germanOptions;
+    }
+
+    /**
+     * Reads the articles of the first blog page and appends one FeedItem
+     * per article to the $items array. The webdriver session is cleaned up
+     * afterwards in any case.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    public function collectData()
+    {
+        parent::collectData();
+
+        try {
+            $driver = $this->getDriver();
+            $baseUrl = 'https://de.scalable.capital';
+            $articleXPath = '//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]';
+
+            // Wait until the 15th article element is visible; it is treated
+            // as the last item of the page.
+            $lastArticleLocator = WebDriverBy::xpath($articleXPath . '[15]');
+            $driver->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($lastArticleLocator));
+
+            $iconLink = $driver->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'));
+            $this->setIcon($iconLink->getAttribute('href'));
+
+            foreach ($driver->findElements(WebDriverBy::xpath($articleXPath)) as $article) {
+                $entry = new FeedItem();
+
+                $imageSrc = $article->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
+                $entry->setEnclosures([$baseUrl . $imageSrc]);
+
+                $link = $article->findElement(WebDriverBy::tagName('a'));
+                $entry->setTitle($link->getText());
+                $entry->setURI($baseUrl . $link->getAttribute('href'));
+
+                $summary = $article->findElement(WebDriverBy::xpath('.//div[@class="summary"]'))->getText();
+                $entry->setContent($summary);
+
+                $publishedOn = $article->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'))->getText();
+                $entry->setTimestamp($this->formatItemTimestamp($publishedOn));
+
+                $author = $article->findElement(WebDriverBy::xpath('.//div[@class="author"]'))->getText();
+                $entry->setAuthor($author);
+
+                $this->items[] = $entry;
+            }
+        } finally {
+            $this->cleanUp();
+        }
+    }
+
+    /**
+     * Parses the given date string with a German ('de') IntlDateFormatter
+     * (date style LONG, no time part) and returns the result.
+     *
+     * NOTE(review): the date was previously described as dd.mm.yyyy, while
+     * the formatter uses the LONG date style - confirm which format the
+     * website actually delivers.
+     *
+     * @param $value string The date as shown on the website.
+     * @return int The parsed timestamp (whatever IntlDateFormatter::parse() returns).
+     */
+    protected function formatItemTimestamp($value)
+    {
+        $germanDateParser = new IntlDateFormatter('de', IntlDateFormatter::LONG, IntlDateFormatter::NONE);
+        $timestamp = $germanDateParser->parse($value);
+
+        return $timestamp;
+    }
+} \ No newline at end of file
diff --git a/config.default.ini.php b/config.default.ini.php
index 7729afcb..8f7de832 100644
--- a/config.default.ini.php
+++ b/config.default.ini.php
@@ -99,6 +99,16 @@ name = "Hidden proxy name"
; false = disabled (default)
by_bridge = false
+[webdriver]
+
+; Sets the url of the webdriver or selenium server
+selenium_server_url = "http://localhost:4444"
+
+; Sets whether the browser should run in headless mode (no visible ui)
+; true = enabled
+; false = disabled (default)
+headless = false
+
[authentication]
; HTTP basic authentication
diff --git a/docs/05_Bridge_API/04_WebDriverAbstract.md b/docs/05_Bridge_API/04_WebDriverAbstract.md
new file mode 100644
index 00000000..60b5e99d
--- /dev/null
+++ b/docs/05_Bridge_API/04_WebDriverAbstract.md
@@ -0,0 +1,83 @@
+`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds
+from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to
+modify content.
+It depends heavily on the php-webdriver library, which offers Selenium WebDriver bindings for PHP.
+
+- https://github.com/php-webdriver/php-webdriver (Project Repository)
+- https://php-webdriver.github.io/php-webdriver/latest/ (API)
+
+Please note that this class is intended as a solution for websites _that cannot be covered
+by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive.
+
+# Configuration
+
+You need a running WebDriver to use bridges that depend on `WebDriverAbstract`.
+The easiest way is to start the Selenium server from the project of the same name:
+```
+docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest
+```
+
+- https://github.com/SeleniumHQ/docker-selenium
+
+With these parameters only one browser window can be started at a time.
+On a multi-user site, Selenium Grid should be used
+and the number of sessions should be adjusted to the number of processor cores.
+
+Finally, the `config.ini.php` file must be adjusted so that the WebDriver
+can find the Selenium server:
+```
+[webdriver]
+
+selenium_server_url = "http://localhost:4444"
+```
+
+# Development
+
+While you are developing a new bridge, it is easier to start a local WebDriver, because you can then watch what is happening and see where the errors occur. Recording the browser session as a screen video also helps to find timing problems.
+
+```
+chromedriver --port=4444
+```
+
+- https://chromedriver.chromium.org/
+
+If you start rss-bridge from a container, then Chrome driver is only accessible
+if you call it with the `--allowed-ips` option so that it binds to all network interfaces.
+
+```
+chromedriver --port=4444 --allowed-ips=192.168.1.42
+```
+
+The **most important rule** is that after an event such as loading the web page
+or pressing a button, you often have to explicitly wait for the desired elements to appear.
+
+A simple example is the bridge `ScalableCapitalBlogBridge.php`.
+A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`.
+
+# Template
+
+Use this template to create your own bridge.
+
+```PHP
+<?php
+
+class MyBridge extends WebDriverAbstract
+{
+ const NAME = 'My Bridge';
+ const URI = 'https://www.example.org';
+ const DESCRIPTION = 'Further description';
+ const MAINTAINER = 'your name';
+
+ public function collectData()
+ {
+ parent::collectData();
+
+ try {
+ // TODO
+ } finally {
+ $this->cleanUp();
+ }
+ }
+}
+
+``` \ No newline at end of file
diff --git a/docs/05_Bridge_API/04_XPathAbstract.md b/docs/05_Bridge_API/05_XPathAbstract.md
index fd697995..fd697995 100644
--- a/docs/05_Bridge_API/04_XPathAbstract.md
+++ b/docs/05_Bridge_API/05_XPathAbstract.md
diff --git a/docs/05_Bridge_API/index.md b/docs/05_Bridge_API/index.md
index 06445246..ea6fd315 100644
--- a/docs/05_Bridge_API/index.md
+++ b/docs/05_Bridge_API/index.md
@@ -8,6 +8,7 @@ Base class | Description
-----------|------------
[`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content.
[`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls
-[`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
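+[`WebDriverAbstract`](./04_WebDriverAbstract.md) | This class is intended for active websites that use XMLHttpRequest (XHR) to load content and/or JavaScript to modify content. It requires a running WebDriver (e.g. a Selenium server).
+[`XPathAbstract`](./05_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.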
For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md) \ No newline at end of file
diff --git a/lib/WebDriverAbstract.php b/lib/WebDriverAbstract.php
new file mode 100644
index 00000000..db2fb7b1
--- /dev/null
+++ b/lib/WebDriverAbstract.php
@@ -0,0 +1,141 @@
+<?php
+
+use Facebook\WebDriver\Chrome\ChromeOptions;
+use Facebook\WebDriver\Remote\DesiredCapabilities;
+use Facebook\WebDriver\Remote\RemoteWebDriver;
+use Facebook\WebDriver\WebDriverCapabilities;
+
+/**
+ * An alternative abstract class for bridges depending on webdriver
+ *
+ * This class is meant as a solution for active websites that use
+ * XMLHttpRequest (XHR) to load content and/or use JavaScript to
+ * change content. This class depends on a working webdriver setup.
+ */
+abstract class WebDriverAbstract extends BridgeAbstract
+{
+ /**
+ * Holds the remote webdriver object, including configuration and
+ * connection.
+ *
+ * @var RemoteWebDriver
+ */
+ protected RemoteWebDriver $driver;
+
+ /**
+ * Holds the uri of the feed's icon.
+ *
+ * @var string | null
+ */
+ private $feedIcon;
+
+    /**
+     * Gives access to the remote webdriver object stored in $driver.
+     *
+     * @return RemoteWebDriver
+     */
+    protected function getDriver(): RemoteWebDriver
+    {
+        $webDriver = $this->driver;
+
+        return $webDriver;
+    }
+
+    /**
+     * Returns the icon uri set via setIcon(); if none (or an empty one) has
+     * been set, the icon of the parent class is returned.
+     *
+     * @return string
+     */
+    public function getIcon()
+    {
+        if ($this->feedIcon) {
+            return $this->feedIcon;
+        }
+
+        return parent::getIcon();
+    }
+
+    /**
+     * Stores the uri of the feed's icon so that getIcon() returns it
+     * instead of the parent's icon.
+     *
+     * @param $iconurl string The icon uri to remember.
+     */
+    protected function setIcon($iconurl)
+    {
+        $this->feedIcon = $iconurl;
+    }
+
+    /**
+     * Creates the ChromeOptions object for the browser.
+     *
+     * The argument '--headless' is added when the configuration value
+     * 'headless' in section 'webdriver' is truthy. Subclasses may override
+     * this method to change or add options.
+     *
+     * @return ChromeOptions
+     */
+    protected function getBrowserOptions()
+    {
+        $options = new ChromeOptions();
+
+        $runHeadless = Configuration::getConfig('webdriver', 'headless');
+        if ($runHeadless) {
+            $options->addArguments(['--headless']); // --window-size=1024,1024
+        }
+
+        return $options;
+    }
+
+    /**
+     * Builds the capabilities for the Chrome browser and attaches the
+     * options from getBrowserOptions() to them.
+     *
+     * Subclasses may override this method to change or add capabilities.
+     *
+     * @return WebDriverCapabilities
+     */
+    protected function getDesiredCapabilities(): WebDriverCapabilities
+    {
+        $capabilities = DesiredCapabilities::chrome();
+        $browserOptions = $this->getBrowserOptions();
+        $capabilities->setCapability(ChromeOptions::CAPABILITY, $browserOptions);
+
+        return $capabilities;
+    }
+
+    /**
+     * Creates the remote webdriver and stores it in $driver.
+     *
+     * The server url is read from the configuration value
+     * 'selenium_server_url' in section 'webdriver'; the capabilities come
+     * from getDesiredCapabilities().
+     *
+     * This should be called in collectData() first.
+     */
+    protected function prepareWebDriver()
+    {
+        $this->driver = RemoteWebDriver::create(
+            Configuration::getConfig('webdriver', 'selenium_server_url'),
+            $this->getDesiredCapabilities()
+        );
+    }
+
+    /**
+     * Maximizes the remote browser window (often important for reactive
+     * sites which change their appearance depending on the window size)
+     * and then opens the uri returned by getURI().
+     */
+    protected function prepareWindow()
+    {
+        $driver = $this->getDriver();
+        $driver->manage()->window()->maximize();
+
+        $driver = $this->getDriver();
+        $driver->get($this->getURI());
+    }
+
+    /**
+     * Quits the remote webdriver, which closes the remote browser window
+     * and shuts down the connection.
+     *
+     * This must be called at the end of scraping, for example within a
+     * 'finally' block.
+     */
+    protected function cleanUp()
+    {
+        $driver = $this->getDriver();
+        $driver->quit();
+    }
+
+    /**
+     * Do your web scraping here and fill the $items array.
+     *
+     * Override this but call parent() first.
+     * Don't forget to call cleanUp() at the end.
+     *
+     * This base implementation creates the webdriver and opens the bridge's
+     * uri. If opening the window fails, the webdriver that was just created
+     * is cleaned up here before the error is passed on, because the
+     * overriding method never gets the chance to call cleanUp() in that case.
+     *
+     * @throws Throwable Whatever prepareWebDriver() or prepareWindow() throws.
+     */
+    public function collectData()
+    {
+        $this->prepareWebDriver();
+
+        try {
+            $this->prepareWindow();
+        } catch (Throwable $e) {
+            // Avoid leaving a browser session open on the webdriver server.
+            $this->cleanUp();
+            throw $e;
+        }
+    }
+} \ No newline at end of file