[ 'name' => 'Domain to use', 'required' => true, 'defaultValue' => self::DEFAULT_DOMAIN ], 'page' => [ 'name' => 'Initial page to load', 'required' => true, 'exampleValue' => 'sexe/news' ], 'limit' => self::LIMIT, ]];

    /**
     * Attributes whose root-relative values are rewritten to absolute URIs.
     * Key: attribute searched for in the article markup.
     * Value: attribute name written in its place.
     */
    const REPLACED_ATTRIBUTES = [
        'href' => 'href',
        'src' => 'src',
        'data-original' => 'src'
    ];

    /** Tags probed, in this order, when looking for the title next to a link. */
    const POSSIBLE_TITLES = [
        'h2',
        'h3'
    ];

    /**
     * Returns the base URL of the site to scrape.
     *
     * Falls back to DEFAULT_DOMAIN when the 'domain' input is empty, prefixes
     * 'https://' when no scheme is given, and strips trailing slashes so the
     * result can be concatenated with a '/'-prefixed path.
     *
     * @return string The base URL, without a trailing slash
     */
    private function getDomain()
    {
        $domain = $this->getInput('domain');
        if (empty($domain)) {
            $domain = self::DEFAULT_DOMAIN;
        }
        if (strpos($domain, '://') === false) {
            $domain = 'https://' . $domain;
        }
        // A trailing slash would yield "https://host//page" once a path is appended.
        return rtrim($domain, '/');
    }

    /**
     * Builds the URL of the listing page: the domain followed by the 'page' input.
     *
     * @return string The listing page URL
     */
    public function getURI()
    {
        // Leading slashes of the page are dropped to avoid a double slash.
        return $this->getDomain() . '/' . ltrim((string) $this->getInput('page'), '/');
    }

    /**
     * Looks for a title element among the descendants of the link's parent.
     *
     * @param $link A simplehtmldom element (the article link)
     * @return The plain text of the first POSSIBLE_TITLES tag found, or null when none matches
     */
    private function findTitleOf($link)
    {
        $container = $link->parent();
        foreach (self::POSSIBLE_TITLES as $tag) {
            $title = $container->find($tag, 0);
            if ($title !== null && $title->plaintext !== null) {
                return $title->plaintext;
            }
        }
        return null;
    }

    /**
     * Turns a link target found on the listing page into an absolute URI.
     *
     * @param $uri The raw href value
     * @return string The absolute URI
     */
    private function resolveUri($uri)
    {
        $uri = (string) $uri;
        // Only a real scheme marks an absolute URI; a leading 'h' alone does not
        // (a relative path such as "homme/..." also starts with 'h').
        if (preg_match('#^https?://#i', $uri) === 1) {
            return $uri;
        }
        // Protocol-relative URI: only the scheme is missing.
        if (substr($uri, 0, 2) === '//') {
            return 'https:' . $uri;
        }
        // Domain-relative URI.
        if (substr($uri, 0, 1) === '/') {
            return $this->getDomain() . $uri;
        }
        return $this->getDomain() . '/' . $uri;
    }

    /**
     * Scans every link inside the page's <main> element and turns the ones
     * whose parent contains an author (span[itemprop=name]) into feed items,
     * up to the 'limit' input (10 when unset).
     *
     * @throws \Exception When the listing page has no <main> element
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());

        // Since GQ don't want simple class scrapping, let's do it the hard way and ... discover content !
        $main = $html ? $html->find('main', 0) : null;
        if ($main === null) {
            throw new \Exception('Could not find the <main> element in ' . $this->getURI());
        }

        $limit = $this->getInput('limit') ?? 10;
        foreach ($main->find('a') as $link) {
            if (count($this->items) >= $limit) {
                break;
            }

            $container = $link->parent();

            // Links without an author next to them are not treated as articles.
            $author = $container->find('span[itemprop=name]', 0);
            if ($author === null) {
                continue;
            }

            $item = [];
            $item['author'] = $author->plaintext;
            $item['title'] = $this->findTitleOf($link);
            $item['uri'] = $this->resolveUri($link->href);

            $article = $this->loadFullArticle($item['uri']);
            if ($article) {
                $item['content'] = $this->replaceUriInHtmlElement($article);
            } else {
                $item['content'] = "Article body couldn't be loaded. \nIt must be a bug!";
            }

            // The <time> element is optional: only set a timestamp when it
            // exists and its datetime attribute can be parsed.
            $date = $container->find('time', 0);
            if ($date !== null) {
                $timestamp = strtotime((string) $date->datetime);
                if ($timestamp !== false) {
                    $item['timestamp'] = $timestamp;
                }
            }

            $this->items[] = $item;
        }
    }

    /**
     * Loads the full article and returns the contents
     * @param $uri The article URI
     * @return The first <article> element of the page, or null when the page
     *         could not be loaded or contains no such element
     */
    private function loadFullArticle($uri)
    {
        $html = getSimpleHTMLDOMCached($uri);
        if (!$html) {
            return null;
        }
        return $html->find('article', 0);
    }

    /**
     * Replaces all relative URIs with absolute ones
     * @param $element A simplehtmldom element
     * @return The $element->innertext with all URIs replaced
     */
    private function replaceUriInHtmlElement($element)
    {
        // Resolve against the configured domain, like the item URIs, so that
        // links stay valid when a non-default domain is used.
        $domain = $this->getDomain();
        $returned = $element->innertext;
        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
            $returned = str_replace($initial . '="/', $final . '="' . $domain . '/', $returned);
        }
        return $returned;
    }
}
ort-file-path Unnamed repository; edit this file 'description' to name the repository.
about | summary | refs | log | tree | commit | diff
path: root/src/string_immutable.zig (unfollow)
Age | Commit message (Expand) | Author | Files | Lines
2022-03-02reduce number of global variablesGravatar Jarred Sumner 3-21/+1752
2022-03-02Update bindings.zigGravatar Jarred Sumner 1-0/+4
2022-03-02Remove function from bindingsGravatar Jarred Sumner 1-7/+1
2022-03-02`DELETE` headerGravatar Jarred Sumner 1-0/+4
2022-03-02[bun.js] fix unicode handling in RouterGravatar Jarred Sumner 1-3/+9
2022-03-02[bun.js] Fix crash due to incorrectly creating stringGravatar Jarred Sumner 1-3/+1
2022-03-02remove a threadlocalGravatar Jarred Sumner 2-12/+908
2022-03-02cleanup error printingGravatar Jarred Sumner 2-2/+5
2022-03-02Update global.zigGravatar Jarred Sumner 1-0/+17
2022-03-02Update fs.zigGravatar Jarred Sumner 1-0/+3
2022-03-02[bun run] Set more environment variablesGravatar Jarred Sumner 2-0/+55
2022-03-02clean up error message when CLI flag is invalidGravatar Jarred Sumner 1-1/+4
2022-03-02add `bun pm cache` and `bun pm cache rm` commandsGravatar Jarred Sumner 1-0/+20
2022-03-01[bun.js] `ResolveError.prototype.toString()` `BuildError.prototype.toString()`Gravatar Jarred Sumner 1-2/+94
2022-03-01add `allowBunRuntime` and `autoImportJSX` flags to Bun.TranspilerGravatar Jarred Sumner 1-1/+32
2022-03-01cleanup code that checks if it should send an HTTP bodyGravatar Jarred Sumner 2-6/+18
2022-03-01[JS Parser] Fix bug with `super` from adding class static blocksGravatar Jarred Sumner 1-1/+12
2022-03-01Update bundler.zigGravatar Jarred Sumner 1-1/+5
2022-03-01Remove unused boolGravatar Jarred Sumner 1-3/+0
2022-03-01[bun.js] Allow disabling runtime imports so bun can build for nodeGravatar Jarred Sumner 3-5/+11
2022-03-01[JS Parser] Make auto importing JSX a flag so the API is easierGravatar Jarred Sumner 1-295/+297
2022-03-01cleanupGravatar Jarred Sumner 1-2/+2
2022-03-01Update javascript.zigGravatar Jarred Sumner 1-50/+0
2022-03-01[bun.js] shim async fsGravatar Jarred Sumner 2-36/+226
2022-03-01[bun.js] Implement `setTimeout`, `setInterval`, `clearTimeout`, `clearInterval`Gravatar Jarred Sumner 11-41/+295
2022-02-27Update transpiler.test.jsGravatar Jarred Sumner 1-0/+8
2022-02-27[TS] Make `export {type foo}` output consistent with TS parserGravatar Jarred Sumner 1-3/+45
2022-02-27WASMGravatar Jarred Sumner 83-690/+10789