about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: JohnnyJayJay <johnnyjayjay02@gmail.com> 2024-06-19 20:38:24 +0200
committer: Frédéric Guillot <f@miniflux.net> 2024-06-21 14:00:40 -0700
commit: ee5e18ea9f06c69dc0e37ca077f82b5f315505e8 (patch)
tree: 24176c9e4e3192ae272a73c8cad78e803a40bff4
parent: 3ef2522c62d251a464aab5ad0e2b6fb6bafd6469 (diff)
download: v2-ee5e18ea9f06c69dc0e37ca077f82b5f315505e8.tar.gz
download: v2-ee5e18ea9f06c69dc0e37ca077f82b5f315505e8.tar.zst
download: v2-ee5e18ea9f06c69dc0e37ca077f82b5f315505e8.zip
sanitizer: add support for HTML `hidden` attribute
This commit adjusts the `Sanitize` function to skip tags with the `hidden` attribute, similar to how it skips blocked tags and their contents.
-rw-r--r-- internal/reader/sanitizer/sanitizer.go 19
-rw-r--r-- internal/reader/sanitizer/sanitizer_test.go 10
2 files changed, 20 insertions, 9 deletions
diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go
index 7f063f04..962111b2 100644
--- a/internal/reader/sanitizer/sanitizer.go
+++ b/internal/reader/sanitizer/sanitizer.go
@@ -82,7 +82,7 @@ func Sanitize(baseURL, input string) string {
var buffer strings.Builder
var tagStack []string
var parentTag string
- blacklistedTagDepth := 0
+ var blockedStack []string
tokenizer := html.NewTokenizer(strings.NewReader(input))
for {
@@ -98,7 +98,7 @@ func Sanitize(baseURL, input string) string {
token := tokenizer.Token()
switch token.Type {
case html.TextToken:
- if blacklistedTagDepth > 0 {
+ if len(blockedStack) > 0 {
continue
}
@@ -116,7 +116,10 @@ func Sanitize(baseURL, input string) string {
if isPixelTracker(tagName, token.Attr) {
continue
}
- if isValidTag(tagName) {
+
+ if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
+ blockedStack = append(blockedStack, tagName)
+ } else if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) {
@@ -128,22 +131,20 @@ func Sanitize(baseURL, input string) string {
tagStack = append(tagStack, tagName)
}
- } else if isBlockedTag(tagName) {
- blacklistedTagDepth++
}
case html.EndTagToken:
tagName := token.DataAtom.String()
- if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
+ if len(blockedStack) > 0 && blockedStack[len(blockedStack)-1] == tagName {
+ blockedStack = blockedStack[:len(blockedStack)-1]
+ } else if len(blockedStack) == 0 && isValidTag(tagName) && slices.Contains(tagStack, tagName) {
buffer.WriteString("</" + tagName + ">")
- } else if isBlockedTag(tagName) {
- blacklistedTagDepth--
}
case html.SelfClosingTagToken:
tagName := token.DataAtom.String()
if isPixelTracker(tagName, token.Attr) {
continue
}
- if isValidTag(tagName) {
+ if isValidTag(tagName) && len(blockedStack) == 0 {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 {
diff --git a/internal/reader/sanitizer/sanitizer_test.go b/internal/reader/sanitizer/sanitizer_test.go
index 07207a44..8289e602 100644
--- a/internal/reader/sanitizer/sanitizer_test.go
+++ b/internal/reader/sanitizer/sanitizer_test.go
@@ -630,3 +630,13 @@ func TestReplaceStyle(t *testing.T) {
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
}
}
+
+func TestHiddenParagraph(t *testing.T) {
+ input := `<p>Before paragraph.</p><p hidden>This should <em>not</em> appear in the <strong>output</strong></p><p>After paragraph.</p>`
+ expected := `<p>Before paragraph.</p><p>After paragraph.</p>`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}