aboutsummaryrefslogtreecommitdiff
path: root/bridges/InternetArchiveBridge.php
diff options
context:
space:
mode:
Diffstat (limited to 'bridges/InternetArchiveBridge.php')
-rw-r--r--bridges/InternetArchiveBridge.php519
1 files changed, 263 insertions, 256 deletions
diff --git a/bridges/InternetArchiveBridge.php b/bridges/InternetArchiveBridge.php
index b9f9d274..7175cde8 100644
--- a/bridges/InternetArchiveBridge.php
+++ b/bridges/InternetArchiveBridge.php
@@ -1,319 +1,326 @@
<?php
-class InternetArchiveBridge extends BridgeAbstract {
- const NAME = 'Internet Archive Bridge';
- const URI = 'https://archive.org';
- const DESCRIPTION = 'Returns newest uploads, posts and more from an account';
- const MAINTAINER = 'VerifiedJoseph';
- const PARAMETERS = array(
- 'Account' => array(
- 'username' => array(
- 'name' => 'Username',
- 'type' => 'text',
- 'required' => true,
- 'exampleValue' => '@verifiedjoseph',
- ),
- 'content' => array(
- 'name' => 'Content',
- 'type' => 'list',
- 'values' => array(
- 'Uploads' => 'uploads',
- 'Posts' => 'posts',
- 'Reviews' => 'reviews',
- 'Collections' => 'collections',
- 'Web Archives' => 'web-archive',
- ),
- 'defaultValue' => 'uploads',
- ),
- 'limit' => self::LIMIT,
- )
- );
-
- const CACHE_TIMEOUT = 900; // 15 mins
-
- const TEST_DETECT_PARAMETERS = array(
- 'https://archive.org/details/@verifiedjoseph' => array(
- 'context' => 'Account', 'username' => 'verifiedjoseph', 'content' => 'uploads'
- ),
- 'https://archive.org/details/@verifiedjoseph?tab=collections' => array(
- 'context' => 'Account', 'username' => 'verifiedjoseph', 'content' => 'collections'
- ),
- );
-
- private $skipClasses = array(
- 'item-ia mobile-header hidden-tiles',
- 'item-ia account-ia'
- );
-
- private $detectParamsRegex = '/https?:\/\/archive\.org\/details\/@([\w]+)(?:\?tab=([a-z-]+))?/';
-
- public function detectParameters($url) {
- $params = array();
-
- if(preg_match($this->detectParamsRegex, $url, $matches) > 0) {
- $params['context'] = 'Account';
- $params['username'] = $matches[1];
- $params['content'] = 'uploads';
-
- if (isset($matches[2])) {
- $params['content'] = $matches[2];
- }
-
- return $params;
- }
-
- return null;
- }
-
- public function collectData() {
-
- $html = getSimpleHTMLDOM($this->getURI());
-
- $html = defaultLinkTo($html, $this->getURI());
-
- if ($this->getInput('content') !== 'posts') {
- $detailsDivNumber = 0;
-
- $results = $html->find('div.results > div[data-id]');
- foreach ($results as $index => $result) {
- $item = array();
-
- if (in_array($result->class, $this->skipClasses)) {
- continue;
- }
-
- switch($result->class) {
- case 'item-ia':
- switch($this->getInput('content')) {
- case 'reviews':
- $item = $this->processReview($result);
- break;
- case 'uploads':
- $item = $this->processUpload($result);
- break;
- }
-
- break;
- case 'item-ia url-item':
- $item = $this->processWebArchives($result);
- break;
- case 'item-ia collection-ia':
- $item = $this->processCollection($result);
- break;
- }
-
- if ($this->getInput('content') !== 'reviews') {
- $hiddenDetails = $this->processHiddenDetails($html, $detailsDivNumber, $item);
-
- $this->items[] = array_merge($item, $hiddenDetails);
- } else {
-
- $this->items[] = $item;
-
- }
-
- $detailsDivNumber++;
- $limit = $this->getInput('limit') ?? 10;
- if (count($this->items) >= $limit) {
- break;
- }
- }
- }
-
- if ($this->getInput('content') === 'posts') {
- $this->items = $this->processPosts($html);
- }
- }
-
- public function getURI() {
-
- if (!is_null($this->getInput('username')) && !is_null($this->getInput('content'))) {
- return self::URI . '/details/' . $this->processUsername() . '&tab=' . $this->getInput('content');
- }
-
- return parent::getURI();
- }
-
- public function getName() {
-
- if (!is_null($this->getInput('username')) && !is_null($this->getInput('content'))) {
- $contentValues = array_flip(self::PARAMETERS['Account']['content']['values']);
-
- return $contentValues[$this->getInput('content')] . ' - '
- . $this->processUsername() . ' - Internet Archive';
- }
-
- return parent::getName();
- }
-
- private function processUsername() {
-
- if (substr($this->getInput('username'), 0, 1) !== '@') {
- return '@' . $this->getInput('username');
- }
-
- return $this->getInput('username');
- }
-
- private function processUpload($result) {
- $item = array();
-
- $collection = $result->find('a.stealth', 0);
- $collectionLink = $collection->href;
- $collectionTitle = $collection->find('div.item-parent-ttl', 0)->plaintext;
-
- $item['title'] = trim($result->find('div.ttl', 0)->innertext);
- $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
- $item['uri'] = $result->find('div.item-ttl.C.C2 > a', 0)->href;
-
- if ($result->find('div.by.C.C4', 0)->children(2)) {
- $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
- }
-
- $item['content'] = <<<EOD
+class InternetArchiveBridge extends BridgeAbstract
+{
+ const NAME = 'Internet Archive Bridge';
+ const URI = 'https://archive.org';
+ const DESCRIPTION = 'Returns newest uploads, posts and more from an account';
+ const MAINTAINER = 'VerifiedJoseph';
+ const PARAMETERS = [
+ 'Account' => [
+ 'username' => [
+ 'name' => 'Username',
+ 'type' => 'text',
+ 'required' => true,
+ 'exampleValue' => '@verifiedjoseph',
+ ],
+ 'content' => [
+ 'name' => 'Content',
+ 'type' => 'list',
+ 'values' => [
+ 'Uploads' => 'uploads',
+ 'Posts' => 'posts',
+ 'Reviews' => 'reviews',
+ 'Collections' => 'collections',
+ 'Web Archives' => 'web-archive',
+ ],
+ 'defaultValue' => 'uploads',
+ ],
+ 'limit' => self::LIMIT,
+ ]
+ ];
+
+ const CACHE_TIMEOUT = 900; // 15 mins
+
+ const TEST_DETECT_PARAMETERS = [
+ 'https://archive.org/details/@verifiedjoseph' => [
+ 'context' => 'Account', 'username' => 'verifiedjoseph', 'content' => 'uploads'
+ ],
+ 'https://archive.org/details/@verifiedjoseph?tab=collections' => [
+ 'context' => 'Account', 'username' => 'verifiedjoseph', 'content' => 'collections'
+ ],
+ ];
+
+ private $skipClasses = [
+ 'item-ia mobile-header hidden-tiles',
+ 'item-ia account-ia'
+ ];
+
+ private $detectParamsRegex = '/https?:\/\/archive\.org\/details\/@([\w]+)(?:\?tab=([a-z-]+))?/';
+
+ public function detectParameters($url)
+ {
+ $params = [];
+
+ if (preg_match($this->detectParamsRegex, $url, $matches) > 0) {
+ $params['context'] = 'Account';
+ $params['username'] = $matches[1];
+ $params['content'] = 'uploads';
+
+ if (isset($matches[2])) {
+ $params['content'] = $matches[2];
+ }
+
+ return $params;
+ }
+
+ return null;
+ }
+
+ public function collectData()
+ {
+ $html = getSimpleHTMLDOM($this->getURI());
+
+ $html = defaultLinkTo($html, $this->getURI());
+
+ if ($this->getInput('content') !== 'posts') {
+ $detailsDivNumber = 0;
+
+ $results = $html->find('div.results > div[data-id]');
+ foreach ($results as $index => $result) {
+ $item = [];
+
+ if (in_array($result->class, $this->skipClasses)) {
+ continue;
+ }
+
+ switch ($result->class) {
+ case 'item-ia':
+ switch ($this->getInput('content')) {
+ case 'reviews':
+ $item = $this->processReview($result);
+ break;
+ case 'uploads':
+ $item = $this->processUpload($result);
+ break;
+ }
+
+ break;
+ case 'item-ia url-item':
+ $item = $this->processWebArchives($result);
+ break;
+ case 'item-ia collection-ia':
+ $item = $this->processCollection($result);
+ break;
+ }
+
+ if ($this->getInput('content') !== 'reviews') {
+ $hiddenDetails = $this->processHiddenDetails($html, $detailsDivNumber, $item);
+
+ $this->items[] = array_merge($item, $hiddenDetails);
+ } else {
+ $this->items[] = $item;
+ }
+
+ $detailsDivNumber++;
+
+ $limit = $this->getInput('limit') ?? 10;
+ if (count($this->items) >= $limit) {
+ break;
+ }
+ }
+ }
+
+ if ($this->getInput('content') === 'posts') {
+ $this->items = $this->processPosts($html);
+ }
+ }
+
+ public function getURI()
+ {
+ if (!is_null($this->getInput('username')) && !is_null($this->getInput('content'))) {
+ return self::URI . '/details/' . $this->processUsername() . '&tab=' . $this->getInput('content');
+ }
+
+ return parent::getURI();
+ }
+
+ public function getName()
+ {
+ if (!is_null($this->getInput('username')) && !is_null($this->getInput('content'))) {
+ $contentValues = array_flip(self::PARAMETERS['Account']['content']['values']);
+
+ return $contentValues[$this->getInput('content')] . ' - '
+ . $this->processUsername() . ' - Internet Archive';
+ }
+
+ return parent::getName();
+ }
+
+ private function processUsername()
+ {
+ if (substr($this->getInput('username'), 0, 1) !== '@') {
+ return '@' . $this->getInput('username');
+ }
+
+ return $this->getInput('username');
+ }
+
+ private function processUpload($result)
+ {
+ $item = [];
+
+ $collection = $result->find('a.stealth', 0);
+ $collectionLink = $collection->href;
+ $collectionTitle = $collection->find('div.item-parent-ttl', 0)->plaintext;
+
+ $item['title'] = trim($result->find('div.ttl', 0)->innertext);
+ $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
+ $item['uri'] = $result->find('div.item-ttl.C.C2 > a', 0)->href;
+
+ if ($result->find('div.by.C.C4', 0)->children(2)) {
+ $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
+ }
+
+ $item['content'] = <<<EOD
<p>Media Type: {$result->attr['data-mediatype']}<br>
Collection: <a href="{$collectionLink}">{$collectionTitle}</a></p>
EOD;
- $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
+ $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
- return $item;
- }
+ return $item;
+ }
- private function processReview($result) {
- $item = array();
+ private function processReview($result)
+ {
+ $item = [];
- $item['title'] = trim($result->find('div.ttl', 0)->innertext);
- $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
- $item['uri'] = $result->find('div.review-title', 0)->children(0)->href;
+ $item['title'] = trim($result->find('div.ttl', 0)->innertext);
+ $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
+ $item['uri'] = $result->find('div.review-title', 0)->children(0)->href;
- if ($result->find('div.by.C.C4', 0)->children(2)) {
- $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
- }
+ if ($result->find('div.by.C.C4', 0)->children(2)) {
+ $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
+ }
- $item['content'] = <<<EOD
+ $item['content'] = <<<EOD
<p><strong>Subject: {$result->find('div.review-title', 0)->plaintext}</strong></p>
<p>{$result->find('div.hidden-lists.review' , 0)->children(1)->plaintext}</p>
EOD;
- $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
+ $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
- return $item;
- }
+ return $item;
+ }
- private function processWebArchives($result) {
- $item = array();
+ private function processWebArchives($result)
+ {
+ $item = [];
- $item['title'] = trim($result->find('div.ttl', 0)->plaintext);
- $item['timestamp'] = strtotime($result->find('div.hidden-lists', 0)->children(0)->plaintext);
- $item['uri'] = $result->find('div.item-ttl.C.C2 > a', 0)->href;
+ $item['title'] = trim($result->find('div.ttl', 0)->plaintext);
+ $item['timestamp'] = strtotime($result->find('div.hidden-lists', 0)->children(0)->plaintext);
+ $item['uri'] = $result->find('div.item-ttl.C.C2 > a', 0)->href;
- $item['content'] = <<<EOD
+ $item['content'] = <<<EOD
{$this->processUsername()} archived <a href="{$item['uri']}">{$result->find('div.ttl', 0)->plaintext}</a>
EOD;
- $item['enclosures'][] = $result->find('img.item-img', 0)->source;
+ $item['enclosures'][] = $result->find('img.item-img', 0)->source;
- return $item;
- }
+ return $item;
+ }
- private function processCollection($result) {
- $item = array();
+ private function processCollection($result)
+ {
+ $item = [];
- $title = trim($result->find('div.collection-title.C.C2', 0)->children(0)->plaintext);
- $itemCount = strtolower(trim($result->find('div.num-items.topinblock', 0)->plaintext));
+ $title = trim($result->find('div.collection-title.C.C2', 0)->children(0)->plaintext);
+ $itemCount = strtolower(trim($result->find('div.num-items.topinblock', 0)->plaintext));
- $item['title'] = $title . ' (' . $itemCount . ')';
- $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
- $item['uri'] = $result->find('div.collection-title.C.C2 > a', 0)->href;
+ $item['title'] = $title . ' (' . $itemCount . ')';
+ $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
+ $item['uri'] = $result->find('div.collection-title.C.C2 > a', 0)->href;
- $item['content'] = '';
+ $item['content'] = '';
- if ($result->find('img.item-img', 0)) {
- $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
- }
+ if ($result->find('img.item-img', 0)) {
+ $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
+ }
- return $item;
- }
+ return $item;
+ }
- private function processHiddenDetails($html, $detailsDivNumber, $item) {
- $description = '';
+ private function processHiddenDetails($html, $detailsDivNumber, $item)
+ {
+ $description = '';
- if ($html->find('div.details-ia.hidden-tiles', $detailsDivNumber)) {
- $detailsDiv = $html->find('div.details-ia.hidden-tiles', $detailsDivNumber);
+ if ($html->find('div.details-ia.hidden-tiles', $detailsDivNumber)) {
+ $detailsDiv = $html->find('div.details-ia.hidden-tiles', $detailsDivNumber);
- if ($detailsDiv->find('div.C234', 0)->children(0)) {
- $description = $detailsDiv->find('div.C234', 0)->children(0)->plaintext;
+ if ($detailsDiv->find('div.C234', 0)->children(0)) {
+ $description = $detailsDiv->find('div.C234', 0)->children(0)->plaintext;
- $detailsDiv->find('div.C234', 0)->children(0)->innertext = '';
- }
+ $detailsDiv->find('div.C234', 0)->children(0)->innertext = '';
+ }
- $topics = trim($detailsDiv->find('div.C234', 0)->plaintext);
+ $topics = trim($detailsDiv->find('div.C234', 0)->plaintext);
- if (!empty($topics)) {
- $topics = trim($detailsDiv->find('div.C234', 0)->plaintext);
- $topics = trim(substr($topics, 7));
+ if (!empty($topics)) {
+ $topics = trim($detailsDiv->find('div.C234', 0)->plaintext);
+ $topics = trim(substr($topics, 7));
- $item['categories'] = explode(',', $topics);
- }
+ $item['categories'] = explode(',', $topics);
+ }
- $item['content'] = '<p>' . $description . '</p>' . $item['content'];
- }
+ $item['content'] = '<p>' . $description . '</p>' . $item['content'];
+ }
- return $item;
- }
+ return $item;
+ }
- private function processPosts($html) {
- $items = array();
+ private function processPosts($html)
+ {
+ $items = [];
- foreach ($html->find('table.forumTable > tr') as $index => $tr) {
- $item = array();
+ foreach ($html->find('table.forumTable > tr') as $index => $tr) {
+ $item = [];
- if ($index === 0) {
- continue;
- }
+ if ($index === 0) {
+ continue;
+ }
- $item['title'] = $tr->find('td', 0)->plaintext;
- $item['timestamp'] = strtotime($tr->find('td', 4)->children(0)->plaintext);
- $item['uri'] = $tr->find('td', 0)->children(0)->href;
+ $item['title'] = $tr->find('td', 0)->plaintext;
+ $item['timestamp'] = strtotime($tr->find('td', 4)->children(0)->plaintext);
+ $item['uri'] = $tr->find('td', 0)->children(0)->href;
- $formLink = <<<EOD
+ $formLink = <<<EOD
<a href="{$tr->find('td', 2)->children(0)->href}">{$tr->find('td', 2)->children(0)->plaintext}</a>
EOD;
- $postDate = $tr->find('td', 4)->children(0)->plaintext;
+ $postDate = $tr->find('td', 4)->children(0)->plaintext;
- $postPageHtml = getSimpleHTMLDOMCached($item['uri'], 3600);
+ $postPageHtml = getSimpleHTMLDOMCached($item['uri'], 3600);
- $postPageHtml = defaultLinkTo($postPageHtml, $this->getURI());
+ $postPageHtml = defaultLinkTo($postPageHtml, $this->getURI());
- $post = $postPageHtml->find('div.box.well.well-sm', 0);
+ $post = $postPageHtml->find('div.box.well.well-sm', 0);
- $parentLink = '';
- $replyLink = <<<EOD
+ $parentLink = '';
+ $replyLink = <<<EOD
<a href="{$post->find('a', 0)->href}">Reply</a>
EOD;
- if ($post->find('a', 1)->innertext = 'See parent post') {
- $parentLink = <<<EOD
+ if ($post->find('a', 1)->innertext = 'See parent post') {
+ $parentLink = <<<EOD
<a href="{$post->find('a', 1)->href}">View parent post</a>
EOD;
- }
+ }
- $post->find('h1', 0)->outertext = '';
- $post->find('h2', 0)->outertext = '';
+ $post->find('h1', 0)->outertext = '';
+ $post->find('h2', 0)->outertext = '';
- $item['content'] = <<<EOD
+ $item['content'] = <<<EOD
<p>{$post->innertext}</p>{$replyLink} - {$parentLink} - Posted in {$formLink} on {$postDate}
EOD;
- $items[] = $item;
+ $items[] = $item;
- if (count($items) >= $this->getInput('limit') ?? 10) {
- break;
- }
- }
+ if (count($items) >= $this->getInput('limit') ?? 10) {
+ break;
+ }
+ }
- return $items;
- }
+ return $items;
+ }
}