diff options
author | 2024-03-15 18:04:24 -0700 | |
---|---|---|
committer | 2024-03-15 18:40:06 -0700 | |
commit | 4834e934f2cf57b106923bd37d62d6c5f6f39f1f (patch) | |
tree | 0674e4bfb3210aef67d3afd4a51a9cd07fb31fc6 | |
parent | dd4fb660c19fd1f6ce5716f9f5783eb7565fed2d (diff) | |
download | v2-4834e934f2cf57b106923bd37d62d6c5f6f39f1f.tar.gz v2-4834e934f2cf57b106923bd37d62d6c5f6f39f1f.tar.zst v2-4834e934f2cf57b106923bd37d62d6c5f6f39f1f.zip |
Remove some duplicated code in RSS parser
-rw-r--r-- | internal/reader/rss/adapter.go | 10 | ||||
-rw-r--r-- | internal/reader/rss/atom.go | 37 | ||||
-rw-r--r-- | internal/reader/rss/parser_test.go | 100 | ||||
-rw-r--r-- | internal/reader/rss/rss.go | 144 |
4 files changed, 227 insertions, 64 deletions
diff --git a/internal/reader/rss/adapter.go b/internal/reader/rss/adapter.go index 2909fc6b..531cc53f 100644 --- a/internal/reader/rss/adapter.go +++ b/internal/reader/rss/adapter.go @@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed { // Try to find the feed URL from the Atom links. for _, atomLink := range r.rss.Channel.AtomLinks.Links { - atomLinkHref := strings.TrimSpace(atomLink.URL) + atomLinkHref := strings.TrimSpace(atomLink.Href) if atomLinkHref != "" && atomLink.Rel == "self" { if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil { feed.FeedURL = absoluteFeedURL @@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string { } for _, atomLink := range rssItem.AtomLinks.Links { - if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { - return strings.TrimSpace(atomLink.URL) + if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { + return strings.TrimSpace(atomLink.Href) } } @@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string { author = rssItem.ItunesAuthor case rssItem.DublinCoreCreator != "": author = rssItem.DublinCoreCreator - case rssItem.AtomAuthor.String() != "": - author = rssItem.AtomAuthor.String() + case rssItem.AtomAuthor.PersonName() != "": + author = rssItem.AtomAuthor.PersonName() case strings.Contains(rssItem.Author.Inner, "<![CDATA["): author = rssItem.Author.Data default: diff --git a/internal/reader/rss/atom.go b/internal/reader/rss/atom.go index e0d66910..27dade47 100644 --- a/internal/reader/rss/atom.go +++ b/internal/reader/rss/atom.go @@ -3,41 +3,18 @@ package rss // import "miniflux.app/v2/internal/reader/rss" -import "strings" +import ( + "miniflux.app/v2/internal/reader/atom" +) type AtomAuthor struct { - Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"` + Author atom.AtomPerson `xml:"http://www.w3.org/2005/Atom author"` } -func (a *AtomAuthor) String() string { - return 
a.Author.String() -} - -type AtomPerson struct { - Name string `xml:"name"` - Email string `xml:"email"` -} - -func (a *AtomPerson) String() string { - var name string - - switch { - case a.Name != "": - name = a.Name - case a.Email != "": - name = a.Email - } - - return strings.TrimSpace(name) -} - -type AtomLink struct { - URL string `xml:"href,attr"` - Type string `xml:"type,attr"` - Rel string `xml:"rel,attr"` - Length string `xml:"length,attr"` +func (a *AtomAuthor) PersonName() string { + return a.Author.PersonName() } type AtomLinks struct { - Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"` + Links []*atom.AtomLink `xml:"http://www.w3.org/2005/Atom link"` } diff --git a/internal/reader/rss/parser_test.go b/internal/reader/rss/parser_test.go index 7e9413b0..d9dacac6 100644 --- a/internal/reader/rss/parser_test.go +++ b/internal/reader/rss/parser_test.go @@ -746,6 +746,106 @@ func TestParseEntryWithContentEncoded(t *testing.T) { } } +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryDescriptionWithEncodedHTMLTags(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description>this is &lt;b&gt;bold&lt;/b&gt;</description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `this is <b>bold</b>` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/"> + <channel> + <title>Example</title> +
<link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description><![CDATA[this is <b>bold</b>]]></description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `this is <b>bold</b>` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description>5 &amp;lt; 8, ticker symbol &amp;lt;BIGCO&amp;gt;</description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description><![CDATA[5 &lt; 8, ticker symbol &lt;BIGCO&gt;]]></description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + }
+} + func TestParseEntryWithFeedBurnerLink(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 7935166d..bc99b461 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -16,29 +16,75 @@ import ( // Specs: https://www.rssboard.org/rss-specification type RSS struct { - Version string `xml:"rss version,attr"` + // Version is the version of the RSS specification. + Version string `xml:"rss version,attr"` + + // Channel is the main container for the RSS feed. Channel RSSChannel `xml:"rss channel"` } type RSSChannel struct { - Title string `xml:"rss title"` - Link string `xml:"rss link"` - Description string `xml:"rss description"` - Language string `xml:"rss language"` - Copyright string `xml:"rss copyRight"` - ManagingEditor string `xml:"rss managingEditor"` - Webmaster string `xml:"rss webMaster"` - PubDate string `xml:"rss pubDate"` - LastBuildDate string `xml:"rss lastBuildDate"` - Categories []string `xml:"rss category"` - Generator string `xml:"rss generator"` - Docs string `xml:"rss docs"` - Cloud *RSSCloud `xml:"rss cloud"` - Image *RSSImage `xml:"rss image"` - TTL string `xml:"rss ttl"` - SkipHours []string `xml:"rss skipHours>hour"` - SkipDays []string `xml:"rss skipDays>day"` - Items []RSSItem `xml:"rss item"` + // Title is the name of the channel. + Title string `xml:"rss title"` + + // Link is the URL to the HTML website corresponding to the channel. + Link string `xml:"rss link"` + + // Description is a phrase or sentence describing the channel. + Description string `xml:"rss description"` + + // Language is the language the channel is written in. + // A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes. 
+ // You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes. + Language string `xml:"rss language"` + + // Copyright is a string indicating the copyright. + Copyright string `xml:"rss copyRight"` + + // ManagingEditor is the email address for the person responsible for editorial content. + ManagingEditor string `xml:"rss managingEditor"` + + // Webmaster is the email address for the person responsible for technical issues relating to the channel. + Webmaster string `xml:"rss webMaster"` + + // PubDate is the publication date for the content in the channel. + // All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred). + PubDate string `xml:"rss pubDate"` + + // LastBuildDate is the last time the content of the channel changed. + LastBuildDate string `xml:"rss lastBuildDate"` + + // Categories is a collection of categories to which the channel belongs. + Categories []string `xml:"rss category"` + + // Generator is a string indicating the program used to generate the channel. + Generator string `xml:"rss generator"` + + // Docs is a URL that points to the documentation for the format used in the RSS file. + DocumentationURL string `xml:"rss docs"` + + // Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1. + Cloud *RSSCloud `xml:"rss cloud"` + + // Image specifies a GIF, JPEG or PNG image that can be displayed with the channel. + Image *RSSImage `xml:"rss image"` + + // TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source. + TTL string `xml:"rss ttl"` + + // SkipHours is a hint for aggregators telling them which hours they can skip. 
+ // An XML element that contains up to 24 <hour> sub-elements whose value is a number between 0 and 23, + // representing a time in GMT, when aggregators, + // if they support the feature, may not read the channel on hours listed in the skipHours element. + SkipHours []string `xml:"rss skipHours>hour"` + + // SkipDays is a hint for aggregators telling them which days they can skip. + // An XML element that contains up to seven <day> sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday. + SkipDays []string `xml:"rss skipDays>day"` + + // Items is a collection of items. + Items []RSSItem `xml:"rss item"` + AtomLinks itunes.ItunesChannelElement googleplay.GooglePlayChannelElement @@ -64,16 +110,56 @@ type RSSImage struct { } type RSSItem struct { - Title string `xml:"rss title"` - Link string `xml:"rss link"` - Description string `xml:"rss description"` - Author RSSAuthor `xml:"rss author"` - Categories []string `xml:"rss category"` - CommentsURL string `xml:"rss comments"` - Enclosures []RSSEnclosure `xml:"rss enclosure"` - GUID RSSGUID `xml:"rss guid"` - PubDate string `xml:"rss pubDate"` - Source RSSSource `xml:"rss source"` + // Title is the title of the item. + Title string `xml:"rss title"` + + // Link is the URL of the item. + Link string `xml:"rss link"` + + // Description is the item synopsis. + Description string `xml:"rss description"` + + // Author is the email address of the author of the item. + Author RSSAuthor `xml:"rss author"` + + // <category> is an optional sub-element of <item>. + // It has one optional attribute, domain, a string that identifies a categorization taxonomy. + Categories []string `xml:"rss category"` + + // <comments> is an optional sub-element of <item>. + // If present, it contains the URL of the comments page for the item. + CommentsURL string `xml:"rss comments"` + + // <enclosure> is an optional sub-element of <item>. + // It has three required attributes. 
url says where the enclosure is located, + // length says how big it is in bytes, and type says what its type is, a standard MIME type. + Enclosures []RSSEnclosure `xml:"rss enclosure"` + + // <guid> is an optional sub-element of <item>. + // It's a string that uniquely identifies the item. + // When present, an aggregator may choose to use this string to determine if an item is new. + // + // There are no rules for the syntax of a guid. + // Aggregators must view them as a string. + // It's up to the source of the feed to establish the uniqueness of the string. + // + // If the guid element has an attribute named isPermaLink with a value of true, + // the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser, + // that points to the full item described by the <item> element. + // + // isPermaLink is optional, its default value is true. + // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular. + GUID RSSGUID `xml:"rss guid"` + + // <pubDate> is the publication date of the item. + // Its value is a string in RFC 822 format. + PubDate string `xml:"rss pubDate"` + + // <source> is an optional sub-element of <item>. + // Its value is the name of the RSS channel that the item came from, derived from its <title>. + // It has one required attribute, url, which contains the URL of the RSS channel. + Source RSSSource `xml:"rss source"` + dublincore.DublinCoreItemElement FeedBurnerItemElement media.MediaItemElement |