diff options
author | 2023-09-08 16:50:06 -0700 | |
---|---|---|
committer | 2023-09-08 17:39:49 -0700 | |
commit | 36f013670efa627883f8de3d03ff93b7b119baff (patch) | |
tree | 8e403c592f906a290ebeb756bf699bbbb9e5694f /internal | |
parent | 344a237af87e07c51ca73e3b6f1c23598613996d (diff) | |
download | v2-36f013670efa627883f8de3d03ff93b7b119baff.tar.gz v2-36f013670efa627883f8de3d03ff93b7b119baff.tar.zst v2-36f013670efa627883f8de3d03ff93b7b119baff.zip |
Strip HTML tags from DublinCore Creator tags
Diffstat (limited to 'internal')
-rw-r--r-- | internal/reader/dublincore/dublincore.go (renamed from internal/reader/rdf/dublincore.go) | 20 | ||||
-rw-r--r-- | internal/reader/rdf/parser_test.go | 28 | ||||
-rw-r--r-- | internal/reader/rdf/rdf.go | 9 | ||||
-rw-r--r-- | internal/reader/rss/dublincore.go | 11 | ||||
-rw-r--r-- | internal/reader/rss/rss.go | 5 |
5 files changed, 53 insertions, 20 deletions
diff --git a/internal/reader/rdf/dublincore.go b/internal/reader/dublincore/dublincore.go index 6d0112c3..e2e2607c 100644 --- a/internal/reader/rdf/dublincore.go +++ b/internal/reader/dublincore/dublincore.go @@ -1,16 +1,30 @@ // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -package rdf // import "miniflux.app/v2/internal/reader/rdf" +package dublincore // import "miniflux.app/v2/internal/reader/dublincore" + +import ( + "strings" + + "miniflux.app/v2/internal/reader/sanitizer" +) // DublinCoreFeedElement represents Dublin Core feed XML elements. type DublinCoreFeedElement struct { DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ channel>creator"` } -// DublinCoreEntryElement represents Dublin Core entry XML elements. -type DublinCoreEntryElement struct { +func (feed *DublinCoreFeedElement) GetSanitizedCreator() string { + return strings.TrimSpace(sanitizer.StripTags(feed.DublinCoreCreator)) +} + +// DublinCoreItemElement represents Dublin Core entry XML elements. 
+type DublinCoreItemElement struct { DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` } + +func (item *DublinCoreItemElement) GetSanitizedCreator() string { + return strings.TrimSpace(sanitizer.StripTags(item.DublinCoreCreator)) +} diff --git a/internal/reader/rdf/parser_test.go b/internal/reader/rdf/parser_test.go index 52565956..67b8c569 100644 --- a/internal/reader/rdf/parser_test.go +++ b/internal/reader/rdf/parser_test.go @@ -349,6 +349,34 @@ func TestParseItemWithDublicCoreDate(t *testing.T) { } } +func TestParseItemWithEncodedHTMLInDCCreatorField(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/"> + <channel> + <title>Example</title> + <link>http://example.org</link> + </channel> + + <item> + <title>Title</title> + <description>Test</description> + <link>http://example.org/test.html</link> + <dc:creator>&lt;a href="http://example.org/author1"&gt;Author 1&lt;/a&gt; (University 1), &lt;a href="http://example.org/author2"&gt;Author 2&lt;/a&gt; (University 2)</dc:creator> + <dc:date>2018-04-10T05:00:00+00:00</dc:date> + </item> + </rdf:RDF>` + + feed, err := Parse("http://example.org", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expectedAuthor := "Author 1 (University 1), Author 2 (University 2)" + if feed.Entries[0].Author != expectedAuthor { + t.Errorf("Incorrect entry author, got: %s, want: %s", feed.Entries[0].Author, expectedAuthor) + } +} + func TestParseItemWithoutDate(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"> diff --git a/internal/reader/rdf/rdf.go
b/internal/reader/rdf/rdf.go index 6118ec20..935d0c0c 100644 --- a/internal/reader/rdf/rdf.go +++ b/internal/reader/rdf/rdf.go @@ -13,6 +13,7 @@ import ( "miniflux.app/v2/internal/logger" "miniflux.app/v2/internal/model" "miniflux.app/v2/internal/reader/date" + "miniflux.app/v2/internal/reader/dublincore" "miniflux.app/v2/internal/reader/sanitizer" "miniflux.app/v2/internal/urllib" ) @@ -22,7 +23,7 @@ type rdfFeed struct { Title string `xml:"channel>title"` Link string `xml:"channel>link"` Items []rdfItem `xml:"item"` - DublinCoreFeedElement + dublincore.DublinCoreFeedElement } func (r *rdfFeed) Transform(baseURL string) *model.Feed { @@ -38,7 +39,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed { for _, item := range r.Items { entry := item.Transform() if entry.Author == "" && r.DublinCoreCreator != "" { - entry.Author = strings.TrimSpace(r.DublinCoreCreator) + entry.Author = r.GetSanitizedCreator() } if entry.URL == "" { @@ -60,7 +61,7 @@ type rdfItem struct { Title string `xml:"title"` Link string `xml:"link"` Description string `xml:"description"` - DublinCoreEntryElement + dublincore.DublinCoreItemElement } func (r *rdfItem) Transform() *model.Entry { @@ -88,7 +89,7 @@ func (r *rdfItem) entryContent() string { } func (r *rdfItem) entryAuthor() string { - return strings.TrimSpace(r.DublinCoreCreator) + return r.GetSanitizedCreator() } func (r *rdfItem) entryURL() string { diff --git a/internal/reader/rss/dublincore.go b/internal/reader/rss/dublincore.go deleted file mode 100644 index e8a8d243..00000000 --- a/internal/reader/rss/dublincore.go +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package rss // import "miniflux.app/v2/internal/reader/rss" - -// DublinCoreElement represents Dublin Core XML elements. 
-type DublinCoreElement struct { - DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` - DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` - DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` -} diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 93584bf0..323c6041 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -15,6 +15,7 @@ import ( "miniflux.app/v2/internal/logger" "miniflux.app/v2/internal/model" "miniflux.app/v2/internal/reader/date" + "miniflux.app/v2/internal/reader/dublincore" "miniflux.app/v2/internal/reader/media" "miniflux.app/v2/internal/reader/sanitizer" "miniflux.app/v2/internal/urllib" @@ -182,7 +183,7 @@ type rssItem struct { CommentLinks []rssCommentLink `xml:"comments"` EnclosureLinks []rssEnclosure `xml:"enclosure"` Categories []rssCategory `xml:"category"` - DublinCoreElement + dublincore.DublinCoreItemElement FeedBurnerElement PodcastEntryElement media.Element @@ -250,7 +251,7 @@ func (r *rssItem) entryAuthor() string { } if author == "" { - author = r.DublinCoreCreator + author = r.GetSanitizedCreator() } return sanitizer.StripTags(strings.TrimSpace(author)) |