diff options
Diffstat (limited to 'lib/contents.php')
-rw-r--r-- | lib/contents.php | 570 |
1 files changed, 292 insertions, 278 deletions
diff --git a/lib/contents.php b/lib/contents.php index cc80248b..a01d81e1 100644 --- a/lib/contents.php +++ b/lib/contents.php @@ -1,48 +1,50 @@ <?php -final class HttpException extends \Exception {} +final class HttpException extends \Exception +{ +} // todo: move this somewhere useful, possibly into a function const RSSBRIDGE_HTTP_STATUS_CODES = [ - '100' => 'Continue', - '101' => 'Switching Protocols', - '200' => 'OK', - '201' => 'Created', - '202' => 'Accepted', - '203' => 'Non-Authoritative Information', - '204' => 'No Content', - '205' => 'Reset Content', - '206' => 'Partial Content', - '300' => 'Multiple Choices', - '302' => 'Found', - '303' => 'See Other', - '304' => 'Not Modified', - '305' => 'Use Proxy', - '400' => 'Bad Request', - '401' => 'Unauthorized', - '402' => 'Payment Required', - '403' => 'Forbidden', - '404' => 'Not Found', - '405' => 'Method Not Allowed', - '406' => 'Not Acceptable', - '407' => 'Proxy Authentication Required', - '408' => 'Request Timeout', - '409' => 'Conflict', - '410' => 'Gone', - '411' => 'Length Required', - '412' => 'Precondition Failed', - '413' => 'Request Entity Too Large', - '414' => 'Request-URI Too Long', - '415' => 'Unsupported Media Type', - '416' => 'Requested Range Not Satisfiable', - '417' => 'Expectation Failed', - '429' => 'Too Many Requests', - '500' => 'Internal Server Error', - '501' => 'Not Implemented', - '502' => 'Bad Gateway', - '503' => 'Service Unavailable', - '504' => 'Gateway Timeout', - '505' => 'HTTP Version Not Supported' + '100' => 'Continue', + '101' => 'Switching Protocols', + '200' => 'OK', + '201' => 'Created', + '202' => 'Accepted', + '203' => 'Non-Authoritative Information', + '204' => 'No Content', + '205' => 'Reset Content', + '206' => 'Partial Content', + '300' => 'Multiple Choices', + '302' => 'Found', + '303' => 'See Other', + '304' => 'Not Modified', + '305' => 'Use Proxy', + '400' => 'Bad Request', + '401' => 'Unauthorized', + '402' => 'Payment Required', + '403' => 'Forbidden', + '404' => 'Not Found', + '405' => 'Method Not Allowed', + '406' => 'Not Acceptable', + '407' => 'Proxy Authentication Required', + '408' => 'Request Timeout', + '409' => 'Conflict', + '410' => 'Gone', + '411' => 'Length Required', + '412' => 'Precondition Failed', + '413' => 'Request Entity Too Large', + '414' => 'Request-URI Too Long', + '415' => 'Unsupported Media Type', + '416' => 'Requested Range Not Satisfiable', + '417' => 'Expectation Failed', + '429' => 'Too Many Requests', + '500' => 'Internal Server Error', + '501' => 'Not Implemented', + '502' => 'Bad Gateway', + '503' => 'Service Unavailable', + '504' => 'Gateway Timeout', + '505' => 'HTTP Version Not Supported' ]; /** @@ -61,70 +63,70 @@ const RSSBRIDGE_HTTP_STATUS_CODES = [ * @return string|array */ function getContents( - string $url, - array $httpHeaders = [], - array $curlOptions = [], - bool $returnFull = false + string $url, + array $httpHeaders = [], + array $curlOptions = [], + bool $returnFull = false ) { - $cacheFactory = new CacheFactory(); + $cacheFactory = new CacheFactory(); - $cache = $cacheFactory->create(Configuration::getConfig('cache', 'type')); - $cache->setScope('server'); - $cache->purgeCache(86400); // 24 hours (forced) - $cache->setKey([$url]); + $cache = $cacheFactory->create(Configuration::getConfig('cache', 'type')); + $cache->setScope('server'); + $cache->purgeCache(86400); // 24 hours (forced) + $cache->setKey([$url]); - $config = [ - 'headers' => $httpHeaders, - 'curl_options' => $curlOptions, - ]; - if (defined('PROXY_URL') && !defined('NOPROXY')) { - $config['proxy'] = PROXY_URL; - } - if(!Debug::isEnabled() && $cache->getTime()) { - $config['if_not_modified_since'] = $cache->getTime(); - } + $config = [ + 'headers' => $httpHeaders, + 'curl_options' => $curlOptions, + ]; + if (defined('PROXY_URL') && !defined('NOPROXY')) { + $config['proxy'] = PROXY_URL; + } + if (!Debug::isEnabled() && $cache->getTime()) { + $config['if_not_modified_since'] = $cache->getTime(); + } - $result = _http_request($url, $config); - $response = [ - 'code' => $result['code'], - 'status_lines' => $result['status_lines'], - 'header' => $result['headers'], - 'content' => $result['body'], - ]; + $result = _http_request($url, $config); + $response = [ + 'code' => $result['code'], + 'status_lines' => $result['status_lines'], + 'header' => $result['headers'], + 'content' => $result['body'], + ]; - switch($result['code']) { - case 200: - case 201: - case 202: - if(isset($result['headers']['cache-control'])) { - $cachecontrol = $result['headers']['cache-control']; - $lastValue = array_pop($cachecontrol); - $directives = explode(',', $lastValue); - $directives = array_map('trim', $directives); - if(in_array('no-cache', $directives) || in_array('no-store', $directives)) { - // Don't cache as instructed by the server - break; - } - } - $cache->saveData($result['body']); - break; - case 304: // Not Modified - $response['content'] = $cache->loadData(); - break; - default: - throw new HttpException( - sprintf( - '%s %s', - $result['code'], - RSSBRIDGE_HTTP_STATUS_CODES[$result['code']] ?? '' - ), - $result['code'] - ); - } - if ($returnFull === true) { - return $response; - } - return $response['content']; + switch ($result['code']) { + case 200: + case 201: + case 202: + if (isset($result['headers']['cache-control'])) { + $cachecontrol = $result['headers']['cache-control']; + $lastValue = array_pop($cachecontrol); + $directives = explode(',', $lastValue); + $directives = array_map('trim', $directives); + if (in_array('no-cache', $directives) || in_array('no-store', $directives)) { + // Don't cache as instructed by the server + break; + } + } + $cache->saveData($result['body']); + break; + case 304: // Not Modified + $response['content'] = $cache->loadData(); + break; + default: + throw new HttpException( + sprintf( + '%s %s', + $result['code'], + RSSBRIDGE_HTTP_STATUS_CODES[$result['code']] ?? '' + ), + $result['code'] + ); + } + if ($returnFull === true) { + return $response; + } + return $response['content']; } /** @@ -136,85 +138,85 @@ function getContents( */ function _http_request(string $url, array $config = []): array { - $defaults = [ - 'useragent' => Configuration::getConfig('http', 'useragent'), - 'timeout' => Configuration::getConfig('http', 'timeout'), - 'headers' => [], - 'proxy' => null, - 'curl_options' => [], - 'if_not_modified_since' => null, - 'retries' => 3, - ]; - $config = array_merge($defaults, $config); + $defaults = [ + 'useragent' => Configuration::getConfig('http', 'useragent'), + 'timeout' => Configuration::getConfig('http', 'timeout'), + 'headers' => [], + 'proxy' => null, + 'curl_options' => [], + 'if_not_modified_since' => null, + 'retries' => 3, + ]; + $config = array_merge($defaults, $config); - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_MAXREDIRS, 5); - curl_setopt($ch, CURLOPT_HEADER, false); - curl_setopt($ch, CURLOPT_HTTPHEADER, $config['headers']); - curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']); - curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']); - curl_setopt($ch, CURLOPT_ENCODING, ''); - curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS); - if($config['proxy']) { - curl_setopt($ch, CURLOPT_PROXY, $config['proxy']); - } - if (curl_setopt_array($ch, $config['curl_options']) === false) { - throw new \Exception('Tried to set an illegal curl option'); - } + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_MAXREDIRS, 5); + curl_setopt($ch, CURLOPT_HEADER, false); + curl_setopt($ch, CURLOPT_HTTPHEADER, $config['headers']); + curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']); + curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']); + curl_setopt($ch, CURLOPT_ENCODING, ''); + curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS); + if ($config['proxy']) { + curl_setopt($ch, CURLOPT_PROXY, $config['proxy']); + } + if (curl_setopt_array($ch, $config['curl_options']) === false) { + throw new \Exception('Tried to set an illegal curl option'); + } - if ($config['if_not_modified_since']) { - curl_setopt($ch, CURLOPT_TIMEVALUE, $config['if_not_modified_since']); - curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE); - } + if ($config['if_not_modified_since']) { + curl_setopt($ch, CURLOPT_TIMEVALUE, $config['if_not_modified_since']); + curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE); + } - $responseStatusLines = []; - $responseHeaders = []; - curl_setopt($ch, CURLOPT_HEADERFUNCTION, function ($ch, $rawHeader) use (&$responseHeaders, &$responseStatusLines) { - $len = strlen($rawHeader); - if ($rawHeader === "\r\n") { - return $len; - } - if (preg_match('#^HTTP/(2|1.1|1.0)#', $rawHeader)) { - $responseStatusLines[] = $rawHeader; - return $len; - } - $header = explode(':', $rawHeader); - if (count($header) === 1) { - return $len; - } - $name = mb_strtolower(trim($header[0])); - $value = trim(implode(':', array_slice($header, 1))); - if (!isset($responseHeaders[$name])) { - $responseHeaders[$name] = []; - } - $responseHeaders[$name][] = $value; - return $len; - }); + $responseStatusLines = []; + $responseHeaders = []; + curl_setopt($ch, CURLOPT_HEADERFUNCTION, function ($ch, $rawHeader) use (&$responseHeaders, &$responseStatusLines) { + $len = strlen($rawHeader); + if ($rawHeader === "\r\n") { + return $len; + } + if (preg_match('#^HTTP/(2|1.1|1.0)#', $rawHeader)) { + $responseStatusLines[] = $rawHeader; + return $len; + } + $header = explode(':', $rawHeader); + if (count($header) === 1) { + return $len; + } + $name = mb_strtolower(trim($header[0])); + $value = trim(implode(':', array_slice($header, 1))); + if (!isset($responseHeaders[$name])) { + $responseHeaders[$name] = []; + } + $responseHeaders[$name][] = $value; + return $len; + }); - $attempts = 0; - while(true) { - $attempts++; - $data = curl_exec($ch); - if ($data !== false) { - // The network call was successful, so break out of the loop - break; - } - if ($attempts > $config['retries']) { - // Finally give up - throw new HttpException(sprintf('%s (%s)', curl_error($ch), curl_errno($ch))); - } - } + $attempts = 0; + while (true) { + $attempts++; + $data = curl_exec($ch); + if ($data !== false) { + // The network call was successful, so break out of the loop + break; + } + if ($attempts > $config['retries']) { + // Finally give up + throw new HttpException(sprintf('%s (%s)', curl_error($ch), curl_errno($ch))); + } + } - $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); - curl_close($ch); - return [ - 'code' => $statusCode, - 'status_lines' => $responseStatusLines, - 'headers' => $responseHeaders, - 'body' => $data, - ]; + $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + return [ + 'code' => $statusCode, + 'status_lines' => $responseStatusLines, + 'headers' => $responseHeaders, + 'body' => $data, + ]; } /** @@ -243,28 +245,31 @@ function _http_request(string $url, array $config = []): array * tags when returning plaintext. * @return false|simple_html_dom Contents as simplehtmldom object. */ -function getSimpleHTMLDOM($url, - $header = array(), - $opts = array(), - $lowercase = true, - $forceTagsClosed = true, - $target_charset = DEFAULT_TARGET_CHARSET, - $stripRN = true, - $defaultBRText = DEFAULT_BR_TEXT, - $defaultSpanText = DEFAULT_SPAN_TEXT){ - - $content = getContents( - $url, - $header ?? [], - $opts ?? [] - ); - return str_get_html($content, - $lowercase, - $forceTagsClosed, - $target_charset, - $stripRN, - $defaultBRText, - $defaultSpanText); +function getSimpleHTMLDOM( + $url, + $header = [], + $opts = [], + $lowercase = true, + $forceTagsClosed = true, + $target_charset = DEFAULT_TARGET_CHARSET, + $stripRN = true, + $defaultBRText = DEFAULT_BR_TEXT, + $defaultSpanText = DEFAULT_SPAN_TEXT +) { + $content = getContents( + $url, + $header ?? [], + $opts ?? [] + ); + return str_get_html( + $content, + $lowercase, + $forceTagsClosed, + $target_charset, + $stripRN, + $defaultBRText, + $defaultSpanText + ); } /** @@ -297,53 +302,58 @@ function getSimpleHTMLDOM($url, * tags when returning plaintext. * @return false|simple_html_dom Contents as simplehtmldom object. */ -function getSimpleHTMLDOMCached($url, - $duration = 86400, - $header = array(), - $opts = array(), - $lowercase = true, - $forceTagsClosed = true, - $target_charset = DEFAULT_TARGET_CHARSET, - $stripRN = true, - $defaultBRText = DEFAULT_BR_TEXT, - $defaultSpanText = DEFAULT_SPAN_TEXT){ - - Debug::log('Caching url ' . $url . ', duration ' . $duration); +function getSimpleHTMLDOMCached( + $url, + $duration = 86400, + $header = [], + $opts = [], + $lowercase = true, + $forceTagsClosed = true, + $target_charset = DEFAULT_TARGET_CHARSET, + $stripRN = true, + $defaultBRText = DEFAULT_BR_TEXT, + $defaultSpanText = DEFAULT_SPAN_TEXT +) { + Debug::log('Caching url ' . $url . ', duration ' . $duration); - // Initialize cache - $cacheFac = new CacheFactory(); + // Initialize cache + $cacheFac = new CacheFactory(); - $cache = $cacheFac->create(Configuration::getConfig('cache', 'type')); - $cache->setScope('pages'); - $cache->purgeCache(86400); // 24 hours (forced) + $cache = $cacheFac->create(Configuration::getConfig('cache', 'type')); + $cache->setScope('pages'); + $cache->purgeCache(86400); // 24 hours (forced) - $params = array($url); - $cache->setKey($params); + $params = [$url]; + $cache->setKey($params); - // Determine if cached file is within duration - $time = $cache->getTime(); - if($time !== false - && (time() - $duration < $time) - && !Debug::isEnabled()) { // Contents within duration - $content = $cache->loadData(); - } else { // Content not within duration - $content = getContents( - $url, - $header ?? [], - $opts ?? [] - ); - if($content !== false) { - $cache->saveData($content); - } - } + // Determine if cached file is within duration + $time = $cache->getTime(); + if ( + $time !== false + && (time() - $duration < $time) + && !Debug::isEnabled() + ) { // Contents within duration + $content = $cache->loadData(); + } else { // Content not within duration + $content = getContents( + $url, + $header ?? [], + $opts ?? [] + ); + if ($content !== false) { + $cache->saveData($content); + } + } - return str_get_html($content, - $lowercase, - $forceTagsClosed, - $target_charset, - $stripRN, - $defaultBRText, - $defaultSpanText); + return str_get_html( + $content, + $lowercase, + $forceTagsClosed, + $target_charset, + $stripRN, + $defaultBRText, + $defaultSpanText + ); } /** @@ -360,49 +370,53 @@ function getSimpleHTMLDOMCached($url, * @param string $url The URL or path to the file. * @return string The MIME type of the file. */ -function getMimeType($url) { - static $mime = null; +function getMimeType($url) +{ + static $mime = null; - if (is_null($mime)) { - // Default values, overriden by /etc/mime.types when present - $mime = array( - 'jpg' => 'image/jpeg', - 'gif' => 'image/gif', - 'png' => 'image/png', - 'image' => 'image/*', - 'mp3' => 'audio/mpeg', - ); - // '@' is used to mute open_basedir warning, see issue #818 - if (@is_readable('/etc/mime.types')) { - $file = fopen('/etc/mime.types', 'r'); - while(($line = fgets($file)) !== false) { - $line = trim(preg_replace('/#.*/', '', $line)); - if(!$line) - continue; - $parts = preg_split('/\s+/', $line); - if(count($parts) == 1) - continue; - $type = array_shift($parts); - foreach($parts as $part) - $mime[$part] = $type; - } - fclose($file); - } - } + if (is_null($mime)) { + // Default values, overriden by /etc/mime.types when present + $mime = [ + 'jpg' => 'image/jpeg', + 'gif' => 'image/gif', + 'png' => 'image/png', + 'image' => 'image/*', + 'mp3' => 'audio/mpeg', + ]; + // '@' is used to mute open_basedir warning, see issue #818 + if (@is_readable('/etc/mime.types')) { + $file = fopen('/etc/mime.types', 'r'); + while (($line = fgets($file)) !== false) { + $line = trim(preg_replace('/#.*/', '', $line)); + if (!$line) { + continue; + } + $parts = preg_split('/\s+/', $line); + if (count($parts) == 1) { + continue; + } + $type = array_shift($parts); + foreach ($parts as $part) { + $mime[$part] = $type; + } + } + fclose($file); + } + } - if (strpos($url, '?') !== false) { - $url_temp = substr($url, 0, strpos($url, '?')); - if (strpos($url, '#') !== false) { - $anchor = substr($url, strpos($url, '#')); - $url_temp .= $anchor; - } - $url = $url_temp; - } + if (strpos($url, '?') !== false) { + $url_temp = substr($url, 0, strpos($url, '?')); + if (strpos($url, '#') !== false) { + $anchor = substr($url, strpos($url, '#')); + $url_temp .= $anchor; + } + $url = $url_temp; + } - $ext = strtolower(pathinfo($url, PATHINFO_EXTENSION)); - if (!empty($mime[$ext])) { - return $mime[$ext]; - } + $ext = strtolower(pathinfo($url, PATHINFO_EXTENSION)); + if (!empty($mime[$ext])) { + return $mime[$ext]; + } - return 'application/octet-stream'; + return 'application/octet-stream'; } |