aboutsummaryrefslogtreecommitdiff
path: root/lib/html.php
diff options
context:
space:
mode:
authorGravatar ORelio <ORelio@users.noreply.github.com> 2022-11-20 12:41:59 +0100
committerGravatar GitHub <noreply@github.com> 2022-11-20 12:41:59 +0100
commitd592e2cb152f144a2e323e836d359df7591a6ac0 (patch)
tree38c6e88a6610deb7af2f14b0914e8dee6f21d6b5 /lib/html.php
parent2f7f13d9fe95da680900d8cdc597e01b7fbbb2f9 (diff)
downloadrss-bridge-d592e2cb152f144a2e323e836d359df7591a6ac0.tar.gz
rss-bridge-d592e2cb152f144a2e323e836d359df7591a6ac0.tar.zst
rss-bridge-d592e2cb152f144a2e323e836d359df7591a6ac0.zip
[Core] Add html/convertLazyLoading (+ document stripRecursiveHTMLSection) (#3157)
* [core] Add html/convertLazyLoading($dom) Looks for lazy-loading attributes such as 'data-src' and converts them back to regular ones such as 'src', easier for RSS readers. It also converts <picture> elements to plain <img> elements.
* [core] Document html/stripRecursiveHTMLSection() Add documentation for that function (no code changes).
* [WordPressBridge] Use convertLazyLoading()
* [WordPressBridge] Unwrap image figures <img> inside <figure> may not display on RSS readers. This converts them back to <img>, without losing caption if present.
* [ZDNet] Convert lazy loading images
* [code] html/stripRecursiveHTMLSection: Fix typo
Diffstat (limited to 'lib/html.php')
-rw-r--r--lib/html.php85
1 files changed, 84 insertions, 1 deletions
diff --git a/lib/html.php b/lib/html.php
index 873620bd..8ec30069 100644
--- a/lib/html.php
+++ b/lib/html.php
@@ -201,6 +201,69 @@ function defaultLinkTo($dom, $url)
}
/**
+ * Convert lazy-loading images and frames (video embeds) into static elements
+ *
+ * This function looks for lazy-loading attributes such as 'data-src' and converts
+ * them back to regular ones such as 'src', making them loadable in RSS readers.
+ * It also converts <picture> elements to plain <img> elements.
+ *
+ * @param string|object $dom The HTML content, either an HTML string or an already-parsed DOM object
+ * @return string|object Content with fixed image/frame URLs (same type as input; objects are re-parsed copies).
+ */
+function convertLazyLoading($dom)
+{
+ $string_convert = false;
+ if (is_string($dom)) {
+ $string_convert = true;
+ $dom = str_get_html($dom);
+ }
+
+ // Process standalone images, embeds and picture sources
+ foreach ($dom->find('img, iframe, source') as $img) {
+ if (!empty($img->getAttribute('data-src'))) {
+ $img->src = $img->getAttribute('data-src');
+ } elseif (!empty($img->getAttribute('data-srcset'))) {
+ $img->src = explode(' ', $img->getAttribute('data-srcset'))[0];
+ } elseif (!empty($img->getAttribute('data-lazy-src'))) {
+ $img->src = $img->getAttribute('data-lazy-src');
+ } elseif (!empty($img->getAttribute('srcset'))) {
+ $img->src = explode(' ', $img->getAttribute('srcset'))[0];
+ } else {
+ continue; // Proceed to next element without removing attributes
+ }
+ foreach (['loading', 'decoding', 'srcset', 'data-src', 'data-srcset'] as $attr) {
+ if ($img->hasAttribute($attr)) {
+ $img->removeAttribute($attr);
+ }
+ }
+ }
+
+ // Convert complex HTML5 pictures to plain, standalone images
+ // <img> and <source> tags already have their "src" attribute set at this point,
+ // so we replace the whole <picture> with a standalone <img> from within the <picture>
+ foreach ($dom->find('picture') as $picture) {
+ $img = $picture->find('img, source', 0);
+ if (!empty($img)) {
+ if ($img->tag == 'source') {
+ $img->tag = 'img';
+ }
+ // Adding/removing a node would change its position inside the parent element,
+ // so instead we rewrite the node in-place through the outertext attribute
+ $picture->outertext = $img->outertext;
+ }
+ }
+
+ // If the expected return type is object, reload the DOM to make sure
+ // all $picture->outertext rewritten above are converted back to objects
+ $dom = $dom->outertext;
+ if (!$string_convert) {
+ $dom = str_get_html($dom);
+ }
+
+ return $dom;
+}
+
+/**
* Extract the first part of a string matching the specified start and end delimiters
*
* @param string $string Input string, e.g. `<div>Post author: John Doe</div>`
@@ -245,27 +308,47 @@ function stripWithDelimiters($string, $start, $end)
* @param string $tag_start Start of the HTML tag to remove, e.g. `<div class="ads">`
* @return string Cleaned String, e.g. `foobar`
*
- * @todo This function needs more documentation to make it maintainable.
+ * This function works by locating the desired tag start, then finding the appropriate
+ * end by counting opening and ending tags until the amount of open tags reaches zero:
+ *
+ * ```
+ * Amount of open tags:
+ * 1 2 1 0
+ * |---------------||---| |----| |----|
+ * <div class="ads"><div>ads</div>ads</div>bar
+ * | <-------- Section to remove -------> |
+ * ```
*/
function stripRecursiveHTMLSection($string, $tag_name, $tag_start)
{
$open_tag = '<' . $tag_name;
$close_tag = '</' . $tag_name . '>';
$close_tag_length = strlen($close_tag);
+
+ // Make sure the provided $tag_start argument matches the provided $tag_name argument
if (strpos($tag_start, $open_tag) === 0) {
+ // While tag_start is present, there is at least one remaining section to remove
while (strpos($string, $tag_start) !== false) {
+ // In order to locate the end of the section, we attempt each closing tag until we find the right one
+ // We know we found the right one when the amount of "<tag" is the same as amount of "</tag"
+ // When the attempted "</tag" is not the correct one, we increase $search_offset to skip it
+ // and retry unless $max_recursion is reached (prevents infinite loop on malformed HTML)
$max_recursion = 100;
$section_to_remove = null;
$section_start = strpos($string, $tag_start);
$search_offset = $section_start;
do {
$max_recursion--;
+ // Move on to the next occurrence of "</tag"
$section_end = strpos($string, $close_tag, $search_offset);
$search_offset = $section_end + $close_tag_length;
+ // If the next "</tag" is the correct one, then this is the section we must remove:
$section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
+ // Count amount of "<tag" and "</tag" in the section to remove
$open_tag_count = substr_count($section_to_remove, $open_tag);
$close_tag_count = substr_count($section_to_remove, $close_tag);
} while ($open_tag_count > $close_tag_count && $max_recursion > 0);
+ // We exited the loop, let's remove the section
$string = str_replace($section_to_remove, '', $string);
}
}