diff options
-rw-r--r-- | internal/reader/atom/atom_10.go | 2 | ||||
-rw-r--r-- | internal/reader/googleplay/googleplay.go | 2 | ||||
-rw-r--r-- | internal/reader/itunes/itunes.go | 4 | ||||
-rw-r--r-- | internal/reader/media/media.go | 11 | ||||
-rw-r--r-- | internal/reader/rdf/adapter.go | 5 | ||||
-rw-r--r-- | internal/reader/rss/adapter.go | 310 | ||||
-rw-r--r-- | internal/reader/rss/feedburner.go | 4 | ||||
-rw-r--r-- | internal/reader/rss/parser.go | 6 | ||||
-rw-r--r-- | internal/reader/rss/parser_test.go | 107 | ||||
-rw-r--r-- | internal/reader/rss/podcast.go | 3 | ||||
-rw-r--r-- | internal/reader/rss/rss.go | 399 |
11 files changed, 493 insertions, 360 deletions
diff --git a/internal/reader/atom/atom_10.go b/internal/reader/atom/atom_10.go index 5b67e073..798a8748 100644 --- a/internal/reader/atom/atom_10.go +++ b/internal/reader/atom/atom_10.go @@ -91,7 +91,7 @@ type atom10Entry struct { Content atom10Text `xml:"http://www.w3.org/2005/Atom content"` Authors atomAuthors `xml:"author"` Categories []atom10Category `xml:"category"` - media.Element + media.MediaItemElement } func (a *atom10Entry) Transform() *model.Entry { diff --git a/internal/reader/googleplay/googleplay.go b/internal/reader/googleplay/googleplay.go index 38dcc71f..79404efb 100644 --- a/internal/reader/googleplay/googleplay.go +++ b/internal/reader/googleplay/googleplay.go @@ -6,7 +6,7 @@ package googleplay // import "miniflux.app/v2/internal/reader/googleplay" // Specs: // https://support.google.com/googleplay/podcasts/answer/6260341 // https://www.google.com/schemas/play-podcasts/1.0/play-podcasts.xsd -type GooglePlayFeedElement struct { +type GooglePlayChannelElement struct { GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"` GooglePlayEmail string `xml:"http://www.google.com/schemas/play-podcasts/1.0 email"` GooglePlayImage GooglePlayImageElement `xml:"http://www.google.com/schemas/play-podcasts/1.0 image"` diff --git a/internal/reader/itunes/itunes.go b/internal/reader/itunes/itunes.go index 1673f306..87a02f0d 100644 --- a/internal/reader/itunes/itunes.go +++ b/internal/reader/itunes/itunes.go @@ -6,7 +6,7 @@ package itunes // import "miniflux.app/v2/internal/reader/itunes" import "strings" // Specs: https://help.apple.com/itc/podcasts_connect/#/itcb54353390 -type ItunesFeedElement struct { +type ItunesChannelElement struct { ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` ItunesBlock string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd block"` ItunesCategories []ItunesCategoryElement `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd category"` @@ -22,7 +22,7 @@ type ItunesFeedElement struct { ItunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"` } -func (i *ItunesFeedElement) GetItunesCategories() []string { +func (i *ItunesChannelElement) GetItunesCategories() []string { var categories []string for _, category := range i.ItunesCategories { categories = append(categories, category.Text) diff --git a/internal/reader/media/media.go b/internal/reader/media/media.go index df84bf03..7fe4684d 100644 --- a/internal/reader/media/media.go +++ b/internal/reader/media/media.go @@ -11,9 +11,8 @@ import ( var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) -// Element represents XML media elements. // Specs: https://www.rssboard.org/media-rss -type Element struct { +type MediaItemElement struct { MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"` MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"` MediaThumbnails []Thumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"` @@ -22,7 +21,7 @@ type Element struct { } // AllMediaThumbnails returns all thumbnail elements merged together. -func (e *Element) AllMediaThumbnails() []Thumbnail { +func (e *MediaItemElement) AllMediaThumbnails() []Thumbnail { var items []Thumbnail items = append(items, e.MediaThumbnails...) for _, mediaGroup := range e.MediaGroups { @@ -32,7 +31,7 @@ func (e *Element) AllMediaThumbnails() []Thumbnail { } // AllMediaContents returns all content elements merged together. -func (e *Element) AllMediaContents() []Content { +func (e *MediaItemElement) AllMediaContents() []Content { var items []Content items = append(items, e.MediaContents...) for _, mediaGroup := range e.MediaGroups { @@ -42,7 +41,7 @@ func (e *Element) AllMediaContents() []Content { } // AllMediaPeerLinks returns all peer link elements merged together. -func (e *Element) AllMediaPeerLinks() []PeerLink { +func (e *MediaItemElement) AllMediaPeerLinks() []PeerLink { var items []PeerLink items = append(items, e.MediaPeerLinks...) for _, mediaGroup := range e.MediaGroups { @@ -52,7 +51,7 @@ func (e *Element) AllMediaPeerLinks() []PeerLink { } // FirstMediaDescription returns the first description element. -func (e *Element) FirstMediaDescription() string { +func (e *MediaItemElement) FirstMediaDescription() string { description := e.MediaDescriptions.First() if description != "" { return description diff --git a/internal/reader/rdf/adapter.go b/internal/reader/rdf/adapter.go index 812badbc..bc8c76ed 100644 --- a/internal/reader/rdf/adapter.go +++ b/internal/reader/rdf/adapter.go @@ -28,15 +28,14 @@ func (r *RDFAdapter) BuildFeed(feedURL string) *model.Feed { feed := &model.Feed{ Title: stripTags(r.rdf.Channel.Title), FeedURL: feedURL, + SiteURL: r.rdf.Channel.Link, } if feed.Title == "" { feed.Title = feedURL } - if siteURL, err := urllib.AbsoluteURL(feedURL, r.rdf.Channel.Link); err != nil { - feed.SiteURL = r.rdf.Channel.Link - } else { + if siteURL, err := urllib.AbsoluteURL(feedURL, r.rdf.Channel.Link); err == nil { feed.SiteURL = siteURL } diff --git a/internal/reader/rss/adapter.go b/internal/reader/rss/adapter.go new file mode 100644 index 00000000..5c1785a9 --- /dev/null +++ b/internal/reader/rss/adapter.go @@ -0,0 +1,310 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package rss // import "miniflux.app/v2/internal/reader/rss" + +import ( + "html" + "log/slog" + "path" + "strconv" + "strings" + "time" + + "miniflux.app/v2/internal/crypto" + "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/date" + "miniflux.app/v2/internal/reader/sanitizer" + "miniflux.app/v2/internal/urllib" +) + +type RSSAdapter struct { + rss *RSS +} + +func NewRSSAdapter(rss *RSS) *RSSAdapter { + return &RSSAdapter{rss} +} + +func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed { + feed := &model.Feed{ + Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)), + FeedURL: feedURL, + SiteURL: r.rss.Channel.Link, + } + + if siteURL, err := urllib.AbsoluteURL(feedURL, r.rss.Channel.Link); err == nil { + feed.SiteURL = siteURL + } + + // Try to find the feed URL from the Atom links. + for _, atomLink := range r.rss.Channel.AtomLinks.Links { + atomLinkHref := strings.TrimSpace(atomLink.URL) + if atomLinkHref != "" && atomLink.Rel == "self" { + if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil { + feed.FeedURL = absoluteFeedURL + break + } + } + } + + // Fallback to the site URL if the title is empty. + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + // Get TTL if defined. + if r.rss.Channel.TTL != "" { + if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil { + feed.TTL = ttl + } + } + + // Get the feed icon URL if defined. + if r.rss.Channel.Image != nil { + if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil { + feed.IconURL = absoluteIconURL + } + } + + for _, item := range r.rss.Channel.Items { + entry := model.NewEntry() + entry.Author = findEntryAuthor(&item) + entry.Date = findEntryDate(&item) + entry.Content = findEntryContent(&item) + entry.Enclosures = findEntryEnclosures(&item) + + // Populate the entry URL. + entryURL := findEntryURL(&item) + if entryURL == "" { + entry.URL = feed.SiteURL + } else { + if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil { + entry.URL = absoluteEntryURL + } else { + entry.URL = entryURL + } + } + + // Populate the entry title. + entry.Title = findEntryTitle(&item) + if entry.Title == "" { + entry.Title = sanitizer.TruncateHTML(entry.Content, 100) + } + + if entry.Title == "" { + entry.Title = entry.URL + } + + if entry.Author == "" { + entry.Author = findFeedAuthor(&r.rss.Channel) + } + + // Generate the entry hash. + for _, value := range []string{item.GUID.Data, entryURL} { + if value != "" { + entry.Hash = crypto.Hash(value) + break + } + } + + // Find CommentsURL if defined. + if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) { + entry.CommentsURL = absoluteCommentsURL + } + + // Set podcast listening time. + if item.ItunesDuration != "" { + if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil { + entry.ReadingTime = duration + } + } + + // Populate entry categories. + entry.Tags = append(entry.Tags, item.Categories...) + entry.Tags = append(entry.Tags, r.rss.Channel.Categories...) + entry.Tags = append(entry.Tags, r.rss.Channel.GetItunesCategories()...) + + if r.rss.Channel.GooglePlayCategory.Text != "" { + entry.Tags = append(entry.Tags, r.rss.Channel.GooglePlayCategory.Text) + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +func findFeedAuthor(rssChannel *RSSChannel) string { + var author string + switch { + case rssChannel.ItunesAuthor != "": + author = rssChannel.ItunesAuthor + case rssChannel.GooglePlayAuthor != "": + author = rssChannel.GooglePlayAuthor + case rssChannel.ItunesOwner.String() != "": + author = rssChannel.ItunesOwner.String() + case rssChannel.ManagingEditor != "": + author = rssChannel.ManagingEditor + case rssChannel.Webmaster != "": + author = rssChannel.Webmaster + } + return sanitizer.StripTags(strings.TrimSpace(author)) +} + +func findEntryTitle(rssItem *RSSItem) string { + title := rssItem.Title + + if rssItem.DublinCoreTitle != "" { + title = rssItem.DublinCoreTitle + } + + return html.UnescapeString(strings.TrimSpace(title)) +} + +func findEntryURL(rssItem *RSSItem) string { + for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} { + if link != "" { + return strings.TrimSpace(link) + } + } + + for _, atomLink := range rssItem.AtomLinks.Links { + if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { + return strings.TrimSpace(atomLink.URL) + } + } + + // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt + // isPermaLink is optional, its default value is true. + // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular. + if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" { + return strings.TrimSpace(rssItem.GUID.Data) + } + + return "" +} + +func findEntryContent(rssItem *RSSItem) string { + for _, value := range []string{ + rssItem.DublinCoreContent, + rssItem.Description, + rssItem.GooglePlayDescription, + rssItem.ItunesSummary, + rssItem.ItunesSubtitle, + } { + if value != "" { + return value + } + } + return "" +} + +func findEntryDate(rssItem *RSSItem) time.Time { + value := rssItem.PubDate + if rssItem.DublinCoreDate != "" { + value = rssItem.DublinCoreDate + } + + if value != "" { + result, err := date.Parse(value) + if err != nil { + slog.Debug("Unable to parse date from RSS feed", + slog.String("date", value), + slog.String("guid", rssItem.GUID.Data), + slog.Any("error", err), + ) + return time.Now() + } + + return result + } + + return time.Now() +} + +func findEntryAuthor(rssItem *RSSItem) string { + var author string + + switch { + case rssItem.GooglePlayAuthor != "": + author = rssItem.GooglePlayAuthor + case rssItem.ItunesAuthor != "": + author = rssItem.ItunesAuthor + case rssItem.DublinCoreCreator != "": + author = rssItem.DublinCoreCreator + case rssItem.AtomAuthor.String() != "": + author = rssItem.AtomAuthor.String() + case strings.Contains(rssItem.Author.Inner, "<![CDATA["): + author = rssItem.Author.Data + default: + author = rssItem.Author.Inner + } + + return strings.TrimSpace(sanitizer.StripTags(author)) +} + +func findEntryEnclosures(rssItem *RSSItem) model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + duplicates := make(map[string]bool) + + for _, mediaThumbnail := range rssItem.AllMediaThumbnails() { + if _, found := duplicates[mediaThumbnail.URL]; !found { + duplicates[mediaThumbnail.URL] = true + enclosures = append(enclosures, &model.Enclosure{ + URL: mediaThumbnail.URL, + MimeType: mediaThumbnail.MimeType(), + Size: mediaThumbnail.Size(), + }) + } + } + + for _, enclosure := range rssItem.Enclosures { + enclosureURL := enclosure.URL + + if rssItem.FeedBurnerEnclosureLink != "" { + filename := path.Base(rssItem.FeedBurnerEnclosureLink) + if strings.Contains(enclosureURL, filename) { + enclosureURL = rssItem.FeedBurnerEnclosureLink + } + } + + if enclosureURL == "" { + continue + } + + if _, found := duplicates[enclosureURL]; !found { + duplicates[enclosureURL] = true + + enclosures = append(enclosures, &model.Enclosure{ + URL: enclosureURL, + MimeType: enclosure.Type, + Size: enclosure.Size(), + }) + } + } + + for _, mediaContent := range rssItem.AllMediaContents() { + if _, found := duplicates[mediaContent.URL]; !found { + duplicates[mediaContent.URL] = true + enclosures = append(enclosures, &model.Enclosure{ + URL: mediaContent.URL, + MimeType: mediaContent.MimeType(), + Size: mediaContent.Size(), + }) + } + } + + for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() { + if _, found := duplicates[mediaPeerLink.URL]; !found { + duplicates[mediaPeerLink.URL] = true + enclosures = append(enclosures, &model.Enclosure{ + URL: mediaPeerLink.URL, + MimeType: mediaPeerLink.MimeType(), + Size: mediaPeerLink.Size(), + }) + } + } + + return enclosures +} diff --git a/internal/reader/rss/feedburner.go b/internal/reader/rss/feedburner.go index 7b112dfa..e4a342da 100644 --- a/internal/reader/rss/feedburner.go +++ b/internal/reader/rss/feedburner.go @@ -3,8 +3,8 @@ package rss // import "miniflux.app/v2/internal/reader/rss" -// FeedBurnerElement represents FeedBurner XML elements. -type FeedBurnerElement struct { +// FeedBurnerItemElement represents FeedBurner XML elements. +type FeedBurnerItemElement struct { FeedBurnerLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` FeedBurnerEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` } diff --git a/internal/reader/rss/parser.go b/internal/reader/rss/parser.go index 55122ea4..92f64f92 100644 --- a/internal/reader/rss/parser.go +++ b/internal/reader/rss/parser.go @@ -13,11 +13,11 @@ import ( // Parse returns a normalized feed struct from a RSS feed. func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) { - feed := new(rssFeed) + rssFeed := new(RSS) decoder := xml.NewXMLDecoder(data) decoder.DefaultSpace = "rss" - if err := decoder.Decode(feed); err != nil { + if err := decoder.Decode(rssFeed); err != nil { return nil, fmt.Errorf("rss: unable to parse feed: %w", err) } - return feed.Transform(baseURL), nil + return NewRSSAdapter(rssFeed).BuildFeed(baseURL), nil } diff --git a/internal/reader/rss/parser_test.go b/internal/reader/rss/parser_test.go index e4ff09ed..41b36e8e 100644 --- a/internal/reader/rss/parser_test.go +++ b/internal/reader/rss/parser_test.go @@ -846,6 +846,59 @@ func TestParseEntryWithEnclosures(t *testing.T) { } } +func TestParseEntryWithIncorrectEnclosureLength(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <title>My Podcast Feed</title> + <link>http://example.org</link> + <author>some.email@example.org</author> + <item> + <title>Podcasting with RSS</title> + <link>http://www.example.org/entries/1</link> + <description>An overview of RSS podcasting</description> + <pubDate>Fri, 15 Jul 2005 00:00:00 -0500</pubDate> + <guid isPermaLink="true">http://www.example.org/entries/1</guid> + <enclosure url="http://www.example.org/myaudiofile.mp3" length="invalid" type="audio/mpeg" /> + <enclosure url="http://www.example.org/myaudiofile.wav" length=" " type="audio" /> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 2 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://www.example.org/myaudiofile.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 0 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } + + if feed.Entries[0].Enclosures[1].Size != 0 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + func TestParseEntryWithEmptyEnclosureURL(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <rss version="2.0"> @@ -1306,6 +1359,60 @@ func TestParseEntryWithMediaPeerLink(t *testing.T) { } } +func TestParseItunesDuration(t *testing.T) { + data := `<?xml version="1.0" encoding="UTF-8"?> + <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"> + <channel> + <title>Podcast Example</title> + <link>http://www.example.com/index.html</link> + <item> + <title>Podcast Episode</title> + <guid>http://example.com/episode.m4a</guid> + <pubDate>Tue, 08 Mar 2016 12:00:00 GMT</pubDate> + <itunes:duration>1:23:45</itunes:duration> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + expected := 83 + result := feed.Entries[0].ReadingTime + if expected != result { + t.Errorf(`Unexpected podcast duration, got %d instead of %d`, result, expected) + } +} + +func TestParseIncorrectItunesDuration(t *testing.T) { + data := `<?xml version="1.0" encoding="UTF-8"?> + <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"> + <channel> + <title>Podcast Example</title> + <link>http://www.example.com/index.html</link> + <item> + <title>Podcast Episode</title> + <guid>http://example.com/episode.m4a</guid> + <pubDate>Tue, 08 Mar 2016 12:00:00 GMT</pubDate> + <itunes:duration>invalid</itunes:duration> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + expected := 0 + result := feed.Entries[0].ReadingTime + if expected != result { + t.Errorf(`Unexpected podcast duration, got %d instead of %d`, result, expected) + } +} + func TestEntryDescriptionFromItunesSummary(t *testing.T) { data := `<?xml version="1.0" encoding="UTF-8"?> <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"> diff --git a/internal/reader/rss/podcast.go b/internal/reader/rss/podcast.go index 9a1f365b..7fd93f4a 100644 --- a/internal/reader/rss/podcast.go +++ b/internal/reader/rss/podcast.go @@ -12,8 +12,7 @@ import ( var ErrInvalidDurationFormat = errors.New("rss: invalid duration format") -// normalizeDuration returns the duration tag value as a number of minutes -func normalizeDuration(rawDuration string) (int, error) { +func getDurationInMinutes(rawDuration string) (int, error) { var sumSeconds int durationParts := strings.Split(rawDuration, ":") diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index be53c4b0..7935166d 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -5,391 +5,110 @@ package rss // import "miniflux.app/v2/internal/reader/rss" import ( "encoding/xml" - "html" - "log/slog" - "path" "strconv" "strings" - "time" - "miniflux.app/v2/internal/crypto" - "miniflux.app/v2/internal/model" - "miniflux.app/v2/internal/reader/date" "miniflux.app/v2/internal/reader/dublincore" "miniflux.app/v2/internal/reader/googleplay" "miniflux.app/v2/internal/reader/itunes" "miniflux.app/v2/internal/reader/media" - "miniflux.app/v2/internal/reader/sanitizer" - "miniflux.app/v2/internal/urllib" ) // Specs: https://www.rssboard.org/rss-specification -type rssFeed struct { - XMLName xml.Name `xml:"rss"` +type RSS struct { Version string `xml:"rss version,attr"` - Channel rssChannel `xml:"rss channel"` + Channel RSSChannel `xml:"rss channel"` } -type rssChannel struct { - Categories []string `xml:"rss category"` +type RSSChannel struct { Title string `xml:"rss title"` Link string `xml:"rss link"` - ImageURL string `xml:"rss image>url"` - Language string `xml:"rss language"` Description string `xml:"rss description"` - PubDate string `xml:"rss pubDate"` + Language string `xml:"rss language"` + Copyright string `xml:"rss copyRight"` ManagingEditor string `xml:"rss managingEditor"` Webmaster string `xml:"rss webMaster"` - TimeToLive rssTTL `xml:"rss ttl"` - Items []rssItem `xml:"rss item"` + PubDate string `xml:"rss pubDate"` + LastBuildDate string `xml:"rss lastBuildDate"` + Categories []string `xml:"rss category"` + Generator string `xml:"rss generator"` + Docs string `xml:"rss docs"` + Cloud *RSSCloud `xml:"rss cloud"` + Image *RSSImage `xml:"rss image"` + TTL string `xml:"rss ttl"` + SkipHours []string `xml:"rss skipHours>hour"` + SkipDays []string `xml:"rss skipDays>day"` + Items []RSSItem `xml:"rss item"` AtomLinks - itunes.ItunesFeedElement - googleplay.GooglePlayFeedElement -} - -type rssTTL struct { - Data string `xml:",chardata"` -} - -func (r *rssTTL) Value() int { - if r.Data == "" { - return 0 - } - - value, err := strconv.Atoi(r.Data) - if err != nil { - return 0 - } - - return value + itunes.ItunesChannelElement + googleplay.GooglePlayChannelElement } -func (r *rssFeed) Transform(baseURL string) *model.Feed { - var err error - - feed := new(model.Feed) - - siteURL := r.siteURL() - feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL) - if err != nil { - feed.SiteURL = siteURL - } - - feedURL := r.feedURL() - feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL) - if err != nil { - feed.FeedURL = feedURL - } - - feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title)) - if feed.Title == "" { - feed.Title = feed.SiteURL - } - - feed.IconURL = strings.TrimSpace(r.Channel.ImageURL) - feed.TTL = r.Channel.TimeToLive.Value() - - for _, item := range r.Channel.Items { - entry := item.Transform() - if entry.Author == "" { - entry.Author = r.feedAuthor() - } - - if entry.URL == "" { - entry.URL = feed.SiteURL - } else { - entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL) - if err == nil { - entry.URL = entryURL - } - } - - if entry.Title == "" { - entry.Title = sanitizer.TruncateHTML(entry.Content, 100) - } - - if entry.Title == "" { - entry.Title = entry.URL - } - - entry.Tags = append(entry.Tags, r.Channel.Categories...) - entry.Tags = append(entry.Tags, r.Channel.GetItunesCategories()...) - - if r.Channel.GooglePlayCategory.Text != "" { - entry.Tags = append(entry.Tags, r.Channel.GooglePlayCategory.Text) - } - - feed.Entries = append(feed.Entries, entry) - } - - return feed +type RSSCloud struct { + Domain string `xml:"domain,attr"` + Port string `xml:"port,attr"` + Path string `xml:"path,attr"` + RegisterProcedure string `xml:"registerProcedure,attr"` + Protocol string `xml:"protocol,attr"` } -func (r *rssFeed) siteURL() string { - return strings.TrimSpace(r.Channel.Link) -} +type RSSImage struct { + // URL is the URL of a GIF, JPEG or PNG image that represents the channel. + URL string `xml:"url"` -func (r *rssFeed) feedURL() string { - for _, atomLink := range r.Channel.AtomLinks.Links { - if atomLink.Rel == "self" { - return strings.TrimSpace(atomLink.URL) - } - } - return "" -} + // Title describes the image, it's used in the ALT attribute of the HTML <img> tag when the channel is rendered in HTML. + Title string `xml:"title"` -func (r rssFeed) feedAuthor() string { - var author string - switch { - case r.Channel.ItunesAuthor != "": - author = r.Channel.ItunesAuthor - case r.Channel.GooglePlayAuthor != "": - author = r.Channel.GooglePlayAuthor - case r.Channel.ItunesOwner.String() != "": - author = r.Channel.ItunesOwner.String() - case r.Channel.ManagingEditor != "": - author = r.Channel.ManagingEditor - case r.Channel.Webmaster != "": - author = r.Channel.Webmaster - } - return sanitizer.StripTags(strings.TrimSpace(author)) + // Link is the URL of the site, when the channel is rendered, the image is a link to the site. + Link string `xml:"link"` } -type rssGUID struct { - XMLName xml.Name - Data string `xml:",chardata"` - IsPermaLink string `xml:"isPermaLink,attr"` +type RSSItem struct { + Title string `xml:"rss title"` + Link string `xml:"rss link"` + Description string `xml:"rss description"` + Author RSSAuthor `xml:"rss author"` + Categories []string `xml:"rss category"` + CommentsURL string `xml:"rss comments"` + Enclosures []RSSEnclosure `xml:"rss enclosure"` + GUID RSSGUID `xml:"rss guid"` + PubDate string `xml:"rss pubDate"` + Source RSSSource `xml:"rss source"` + dublincore.DublinCoreItemElement + FeedBurnerItemElement + media.MediaItemElement + AtomAuthor + AtomLinks + itunes.ItunesItemElement + googleplay.GooglePlayItemElement } -type rssAuthor struct { +type RSSAuthor struct { XMLName xml.Name Data string `xml:",chardata"` Inner string `xml:",innerxml"` } -type rssEnclosure struct { +type RSSEnclosure struct { URL string `xml:"url,attr"` Type string `xml:"type,attr"` Length string `xml:"length,attr"` } -func (enclosure *rssEnclosure) Size() int64 { - if enclosure.Length == "" { +func (enclosure *RSSEnclosure) Size() int64 { + if strings.TrimSpace(enclosure.Length) == "" { return 0 } size, _ := strconv.ParseInt(enclosure.Length, 10, 0) return size } -type rssItem struct { - GUID rssGUID `xml:"rss guid"` - Title string `xml:"rss title"` - Link string `xml:"rss link"` - Description string `xml:"rss description"` - PubDate string `xml:"rss pubDate"` - Author rssAuthor `xml:"rss author"` - Comments string `xml:"rss comments"` - EnclosureLinks []rssEnclosure `xml:"rss enclosure"` - Categories []string `xml:"rss category"` - dublincore.DublinCoreItemElement - FeedBurnerElement - media.Element - AtomAuthor - AtomLinks - itunes.ItunesItemElement - googleplay.GooglePlayItemElement -} - -func (r *rssItem) Transform() *model.Entry { - entry := model.NewEntry() - entry.URL = r.entryURL() - entry.CommentsURL = r.entryCommentsURL() - entry.Date = r.entryDate() - entry.Author = r.entryAuthor() - entry.Hash = r.entryHash() - entry.Content = r.entryContent() - entry.Title = r.entryTitle() - entry.Enclosures = r.entryEnclosures() - entry.Tags = r.Categories - if duration, err := normalizeDuration(r.ItunesDuration); err == nil { - entry.ReadingTime = duration - } - - return entry -} - -func (r *rssItem) entryDate() time.Time { - value := r.PubDate - if r.DublinCoreDate != "" { - value = r.DublinCoreDate - } - - if value != "" { - result, err := date.Parse(value) - if err != nil { - slog.Debug("Unable to parse date from RSS feed", - slog.String("date", value), - slog.String("guid", r.GUID.Data), - slog.Any("error", err), - ) - return time.Now() - } - - return result - } - - return time.Now() -} - -func (r *rssItem) entryAuthor() string { - var author string - - switch { - case r.GooglePlayAuthor != "": - author = r.GooglePlayAuthor - case r.ItunesAuthor != "": - author = r.ItunesAuthor - case r.DublinCoreCreator != "": - author = r.DublinCoreCreator - case r.AtomAuthor.String() != "": - author = r.AtomAuthor.String() - case strings.Contains(r.Author.Inner, "<![CDATA["): - author = r.Author.Data - default: - author = r.Author.Inner - } - - return strings.TrimSpace(sanitizer.StripTags(author)) -} - -func (r *rssItem) entryHash() string { - for _, value := range []string{r.GUID.Data, r.entryURL()} { - if value != "" { - return crypto.Hash(value) - } - } - - return "" -} - -func (r *rssItem) entryTitle() string { - title := r.Title - - if r.DublinCoreTitle != "" { - title = r.DublinCoreTitle - } - - return html.UnescapeString(strings.TrimSpace(title)) -} - -func (r *rssItem) entryContent() string { - for _, value := range []string{ - r.DublinCoreContent, - r.Description, - r.GooglePlayDescription, - r.ItunesSummary, - r.ItunesSubtitle, - } { - if value != "" { - return value - } - } - return "" -} - -func (r *rssItem) entryURL() string { - for _, link := range []string{r.FeedBurnerLink, r.Link} { - if link != "" { - return strings.TrimSpace(link) - } - } - - for _, atomLink := range r.AtomLinks.Links { - if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { - return strings.TrimSpace(atomLink.URL) - } - } - - // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt - // isPermaLink is optional, its default value is true. - // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular. - if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" { - return strings.TrimSpace(r.GUID.Data) - } - - return "" -} - -func (r *rssItem) entryEnclosures() model.EnclosureList { - enclosures := make(model.EnclosureList, 0) - duplicates := make(map[string]bool) - - for _, mediaThumbnail := range r.AllMediaThumbnails() { - if _, found := duplicates[mediaThumbnail.URL]; !found { - duplicates[mediaThumbnail.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaThumbnail.URL, - MimeType: mediaThumbnail.MimeType(), - Size: mediaThumbnail.Size(), - }) - } - } - - for _, enclosure := range r.EnclosureLinks { - enclosureURL := enclosure.URL - - if r.FeedBurnerEnclosureLink != "" { - filename := path.Base(r.FeedBurnerEnclosureLink) - if strings.Contains(enclosureURL, filename) { - enclosureURL = r.FeedBurnerEnclosureLink - } - } - - if enclosureURL == "" { - continue - } - - if _, found := duplicates[enclosureURL]; !found { - duplicates[enclosureURL] = true - - enclosures = append(enclosures, &model.Enclosure{ - URL: enclosureURL, - MimeType: enclosure.Type, - Size: enclosure.Size(), - }) - } - } - - for _, mediaContent := range r.AllMediaContents() { - if _, found := duplicates[mediaContent.URL]; !found { - duplicates[mediaContent.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaContent.URL, - MimeType: mediaContent.MimeType(), - Size: mediaContent.Size(), - }) - } - } - - for _, mediaPeerLink := range r.AllMediaPeerLinks() { - if _, found := duplicates[mediaPeerLink.URL]; !found { - duplicates[mediaPeerLink.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaPeerLink.URL, - MimeType: mediaPeerLink.MimeType(), - Size: mediaPeerLink.Size(), - }) - } - } - - return enclosures +type RSSGUID struct { + Data string `xml:",chardata"` + IsPermaLink string `xml:"isPermaLink,attr"` } -func (r *rssItem) entryCommentsURL() string { - commentsURL := strings.TrimSpace(r.Comments) - if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) { - return commentsURL - } - - return "" +type RSSSource struct { + URL string `xml:"url,attr"` + Name string `xml:",chardata"` } |