aboutsummaryrefslogtreecommitdiff
path: root/internal/reader/rss/adapter.go
diff options
context:
space:
mode:
Diffstat (limited to 'internal/reader/rss/adapter.go')
-rw-r--r--internal/reader/rss/adapter.go310
1 files changed, 310 insertions, 0 deletions
diff --git a/internal/reader/rss/adapter.go b/internal/reader/rss/adapter.go
new file mode 100644
index 00000000..5c1785a9
--- /dev/null
+++ b/internal/reader/rss/adapter.go
@@ -0,0 +1,310 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package rss // import "miniflux.app/v2/internal/reader/rss"
+
+import (
+ "html"
+ "log/slog"
+ "path"
+ "strconv"
+ "strings"
+ "time"
+
+ "miniflux.app/v2/internal/crypto"
+ "miniflux.app/v2/internal/model"
+ "miniflux.app/v2/internal/reader/date"
+ "miniflux.app/v2/internal/reader/sanitizer"
+ "miniflux.app/v2/internal/urllib"
+)
+
+type RSSAdapter struct {
+ rss *RSS
+}
+
+func NewRSSAdapter(rss *RSS) *RSSAdapter {
+ return &RSSAdapter{rss}
+}
+
+func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
+ feed := &model.Feed{
+ Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
+ FeedURL: feedURL,
+ SiteURL: r.rss.Channel.Link,
+ }
+
+ if siteURL, err := urllib.AbsoluteURL(feedURL, r.rss.Channel.Link); err == nil {
+ feed.SiteURL = siteURL
+ }
+
+ // Try to find the feed URL from the Atom links.
+ for _, atomLink := range r.rss.Channel.AtomLinks.Links {
+ atomLinkHref := strings.TrimSpace(atomLink.URL)
+ if atomLinkHref != "" && atomLink.Rel == "self" {
+ if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
+ feed.FeedURL = absoluteFeedURL
+ break
+ }
+ }
+ }
+
+ // Fallback to the site URL if the title is empty.
+ if feed.Title == "" {
+ feed.Title = feed.SiteURL
+ }
+
+ // Get TTL if defined.
+ if r.rss.Channel.TTL != "" {
+ if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
+ feed.TTL = ttl
+ }
+ }
+
+ // Get the feed icon URL if defined.
+ if r.rss.Channel.Image != nil {
+ if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
+ feed.IconURL = absoluteIconURL
+ }
+ }
+
+ for _, item := range r.rss.Channel.Items {
+ entry := model.NewEntry()
+ entry.Author = findEntryAuthor(&item)
+ entry.Date = findEntryDate(&item)
+ entry.Content = findEntryContent(&item)
+ entry.Enclosures = findEntryEnclosures(&item)
+
+ // Populate the entry URL.
+ entryURL := findEntryURL(&item)
+ if entryURL == "" {
+ entry.URL = feed.SiteURL
+ } else {
+ if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil {
+ entry.URL = absoluteEntryURL
+ } else {
+ entry.URL = entryURL
+ }
+ }
+
+ // Populate the entry title.
+ entry.Title = findEntryTitle(&item)
+ if entry.Title == "" {
+ entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
+ }
+
+ if entry.Title == "" {
+ entry.Title = entry.URL
+ }
+
+ if entry.Author == "" {
+ entry.Author = findFeedAuthor(&r.rss.Channel)
+ }
+
+ // Generate the entry hash.
+ for _, value := range []string{item.GUID.Data, entryURL} {
+ if value != "" {
+ entry.Hash = crypto.Hash(value)
+ break
+ }
+ }
+
+ // Find CommentsURL if defined.
+ if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
+ entry.CommentsURL = absoluteCommentsURL
+ }
+
+ // Set podcast listening time.
+ if item.ItunesDuration != "" {
+ if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
+ entry.ReadingTime = duration
+ }
+ }
+
+ // Populate entry categories.
+ entry.Tags = append(entry.Tags, item.Categories...)
+ entry.Tags = append(entry.Tags, r.rss.Channel.Categories...)
+ entry.Tags = append(entry.Tags, r.rss.Channel.GetItunesCategories()...)
+
+ if r.rss.Channel.GooglePlayCategory.Text != "" {
+ entry.Tags = append(entry.Tags, r.rss.Channel.GooglePlayCategory.Text)
+ }
+
+ feed.Entries = append(feed.Entries, entry)
+ }
+
+ return feed
+}
+
+func findFeedAuthor(rssChannel *RSSChannel) string {
+ var author string
+ switch {
+ case rssChannel.ItunesAuthor != "":
+ author = rssChannel.ItunesAuthor
+ case rssChannel.GooglePlayAuthor != "":
+ author = rssChannel.GooglePlayAuthor
+ case rssChannel.ItunesOwner.String() != "":
+ author = rssChannel.ItunesOwner.String()
+ case rssChannel.ManagingEditor != "":
+ author = rssChannel.ManagingEditor
+ case rssChannel.Webmaster != "":
+ author = rssChannel.Webmaster
+ }
+ return sanitizer.StripTags(strings.TrimSpace(author))
+}
+
+func findEntryTitle(rssItem *RSSItem) string {
+ title := rssItem.Title
+
+ if rssItem.DublinCoreTitle != "" {
+ title = rssItem.DublinCoreTitle
+ }
+
+ return html.UnescapeString(strings.TrimSpace(title))
+}
+
+func findEntryURL(rssItem *RSSItem) string {
+ for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
+ if link != "" {
+ return strings.TrimSpace(link)
+ }
+ }
+
+ for _, atomLink := range rssItem.AtomLinks.Links {
+ if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
+ return strings.TrimSpace(atomLink.URL)
+ }
+ }
+
+ // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
+ // isPermaLink is optional, its default value is true.
+ // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
+ if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
+ return strings.TrimSpace(rssItem.GUID.Data)
+ }
+
+ return ""
+}
+
+func findEntryContent(rssItem *RSSItem) string {
+ for _, value := range []string{
+ rssItem.DublinCoreContent,
+ rssItem.Description,
+ rssItem.GooglePlayDescription,
+ rssItem.ItunesSummary,
+ rssItem.ItunesSubtitle,
+ } {
+ if value != "" {
+ return value
+ }
+ }
+ return ""
+}
+
+func findEntryDate(rssItem *RSSItem) time.Time {
+ value := rssItem.PubDate
+ if rssItem.DublinCoreDate != "" {
+ value = rssItem.DublinCoreDate
+ }
+
+ if value != "" {
+ result, err := date.Parse(value)
+ if err != nil {
+ slog.Debug("Unable to parse date from RSS feed",
+ slog.String("date", value),
+ slog.String("guid", rssItem.GUID.Data),
+ slog.Any("error", err),
+ )
+ return time.Now()
+ }
+
+ return result
+ }
+
+ return time.Now()
+}
+
+func findEntryAuthor(rssItem *RSSItem) string {
+ var author string
+
+ switch {
+ case rssItem.GooglePlayAuthor != "":
+ author = rssItem.GooglePlayAuthor
+ case rssItem.ItunesAuthor != "":
+ author = rssItem.ItunesAuthor
+ case rssItem.DublinCoreCreator != "":
+ author = rssItem.DublinCoreCreator
+ case rssItem.AtomAuthor.String() != "":
+ author = rssItem.AtomAuthor.String()
+ case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
+ author = rssItem.Author.Data
+ default:
+ author = rssItem.Author.Inner
+ }
+
+ return strings.TrimSpace(sanitizer.StripTags(author))
+}
+
+func findEntryEnclosures(rssItem *RSSItem) model.EnclosureList {
+ enclosures := make(model.EnclosureList, 0)
+ duplicates := make(map[string]bool)
+
+ for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
+ if _, found := duplicates[mediaThumbnail.URL]; !found {
+ duplicates[mediaThumbnail.URL] = true
+ enclosures = append(enclosures, &model.Enclosure{
+ URL: mediaThumbnail.URL,
+ MimeType: mediaThumbnail.MimeType(),
+ Size: mediaThumbnail.Size(),
+ })
+ }
+ }
+
+ for _, enclosure := range rssItem.Enclosures {
+ enclosureURL := enclosure.URL
+
+ if rssItem.FeedBurnerEnclosureLink != "" {
+ filename := path.Base(rssItem.FeedBurnerEnclosureLink)
+ if strings.Contains(enclosureURL, filename) {
+ enclosureURL = rssItem.FeedBurnerEnclosureLink
+ }
+ }
+
+ if enclosureURL == "" {
+ continue
+ }
+
+ if _, found := duplicates[enclosureURL]; !found {
+ duplicates[enclosureURL] = true
+
+ enclosures = append(enclosures, &model.Enclosure{
+ URL: enclosureURL,
+ MimeType: enclosure.Type,
+ Size: enclosure.Size(),
+ })
+ }
+ }
+
+ for _, mediaContent := range rssItem.AllMediaContents() {
+ if _, found := duplicates[mediaContent.URL]; !found {
+ duplicates[mediaContent.URL] = true
+ enclosures = append(enclosures, &model.Enclosure{
+ URL: mediaContent.URL,
+ MimeType: mediaContent.MimeType(),
+ Size: mediaContent.Size(),
+ })
+ }
+ }
+
+ for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
+ if _, found := duplicates[mediaPeerLink.URL]; !found {
+ duplicates[mediaPeerLink.URL] = true
+ enclosures = append(enclosures, &model.Enclosure{
+ URL: mediaPeerLink.URL,
+ MimeType: mediaPeerLink.MimeType(),
+ Size: mediaPeerLink.Size(),
+ })
+ }
+ }
+
+ return enclosures
+}