/**
 * Returns the URI of the feed's icon.
 *
 * Uses the value stored by setIcon(); when that value is unset or falsy
 * (for example an empty string) the parent implementation is used instead.
 *
 * @return string
 */
public function getIcon()
{
    return $this->feedIcon ?: parent::getIcon();
}

/**
 * Sets the URI of the feed's icon.
 *
 * @param string $iconurl URI that getIcon() should report from now on.
 */
protected function setIcon($iconurl)
{
    $this->feedIcon = $iconurl;
}

/**
 * Returns the ChromeOptions object.
 *
 * If the configuration parameter 'headless' (section 'webdriver') is
 * truthy, the argument '--headless' is added. Override this to change or
 * add more options, e.g. '--window-size=1024,1024'.
 *
 * @return ChromeOptions
 */
protected function getBrowserOptions()
{
    $chromeOptions = new ChromeOptions();
    if (Configuration::getConfig('webdriver', 'headless')) {
        $chromeOptions->addArguments(['--headless']);
    }
    return $chromeOptions;
}

/**
 * Returns the DesiredCapabilities object for the Chrome browser.
 *
 * The options from getBrowserOptions() are attached under
 * ChromeOptions::CAPABILITY. Override this to change or add more
 * capabilities.
 *
 * @return WebDriverCapabilities
 */
protected function getDesiredCapabilities(): WebDriverCapabilities
{
    $desiredCapabilities = DesiredCapabilities::chrome();
    $desiredCapabilities->setCapability(ChromeOptions::CAPABILITY, $this->getBrowserOptions());
    return $desiredCapabilities;
}

/**
 * Constructs the remote webdriver with the url of the remote (Selenium)
 * webdriver server and the desired capabilities.
 *
 * The server url is read from the configuration parameter
 * 'selenium_server_url' (section 'webdriver'). The created driver is
 * stored in $this->driver.
 *
 * This should be called in collectData() first.
 */
protected function prepareWebDriver()
{
    $server = Configuration::getConfig('webdriver', 'selenium_server_url');
    $this->driver = RemoteWebDriver::create($server, $this->getDesiredCapabilities());
}

/**
 * Maximizes the remote browser window (often important for reactive sites
 * which change their appearance depending on the window size) and opens
 * the uri returned by getURI().
 */
protected function prepareWindow()
{
    $this->getDriver()->manage()->window()->maximize();
    $this->getDriver()->get($this->getURI());
}

/**
 * Closes the remote browser window and shuts down the remote webdriver
 * connection.
 *
 * This must be called at the end of scraping, for example within a
 * 'finally' block. It is safe to call when no driver was ever created
 * (e.g. prepareWebDriver() failed) and safe to call more than once: in
 * both cases it does nothing, so it never hides the error that is
 * already propagating through the caller's 'finally'.
 *
 * After this method returns, $this->driver is unset; call
 * prepareWebDriver() again before using the driver.
 */
protected function cleanUp()
{
    // Never created, or already released by an earlier cleanUp() call.
    // NOTE(review): assumes getDriver() returns $this->driver — its
    // definition is not visible here; confirm.
    if (!isset($this->driver)) {
        return;
    }

    try {
        $this->getDriver()->quit();
    } finally {
        // Drop the reference even if quit() failed, so a repeated call
        // does not try to talk to a dead session again.
        unset($this->driver);
    }
}

/**
 * Do your web scraping here and fill the $items array.
 *
 * Override this but call parent() first.
 * Don't forget to call cleanUp() at the end.
 *
 * If opening the window fails, the freshly created driver is released
 * here before the error is re-thrown, because the overriding method has
 * not yet reached its own cleanUp() call at that point.
 *
 * @throws \Throwable Whatever prepareWebDriver() or prepareWindow() throws.
 */
public function collectData()
{
    $this->prepareWebDriver();

    try {
        $this->prepareWindow();
    } catch (\Throwable $e) {
        try {
            $this->cleanUp();
        } catch (\Throwable $cleanUpError) {
            // Best effort only: the original failure is the one to report.
        }
        throw $e;
    }
}