diff options
Diffstat (limited to 'internal/reader/atom/atom_10.go')
-rw-r--r-- | internal/reader/atom/atom_10.go | 413 |
1 files changed, 163 insertions, 250 deletions
diff --git a/internal/reader/atom/atom_10.go b/internal/reader/atom/atom_10.go index 798a8748..201d00d1 100644 --- a/internal/reader/atom/atom_10.go +++ b/internal/reader/atom/atom_10.go @@ -6,286 +6,199 @@ package atom // import "miniflux.app/v2/internal/reader/atom" import ( "encoding/xml" "html" - "log/slog" - "strconv" "strings" - "time" - "miniflux.app/v2/internal/crypto" - "miniflux.app/v2/internal/model" - "miniflux.app/v2/internal/reader/date" "miniflux.app/v2/internal/reader/media" "miniflux.app/v2/internal/reader/sanitizer" - "miniflux.app/v2/internal/urllib" ) +// The "atom:feed" element is the document (i.e., top-level) element of +// an Atom Feed Document, acting as a container for metadata and data +// associated with the feed. Its element children consist of metadata +// elements followed by zero or more atom:entry child elements. +// // Specs: // https://tools.ietf.org/html/rfc4287 // https://validator.w3.org/feed/docs/atom.html -type atom10Feed struct { - XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"` - ID string `xml:"id"` - Title atom10Text `xml:"title"` - Authors atomAuthors `xml:"author"` - Icon string `xml:"icon"` - Links atomLinks `xml:"link"` - Entries []atom10Entry `xml:"entry"` +type Atom10Feed struct { + XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"` + + // The "atom:id" element conveys a permanent, universally unique + // identifier for an entry or feed. + // + // Its content MUST be an IRI, as defined by [RFC3987]. Note that the + // definition of "IRI" excludes relative references. Though the IRI + // might use a dereferencable scheme, Atom Processors MUST NOT assume it + // can be dereferenced. + // + // atom:feed elements MUST contain exactly one atom:id element. + ID string `xml:"http://www.w3.org/2005/Atom id"` + + // The "atom:title" element is a Text construct that conveys a human- + // readable title for an entry or feed. + // + // atom:feed elements MUST contain exactly one atom:title element. + Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"` + + // The "atom:author" element is a Person construct that indicates the + // author of the entry or feed. + // + // atom:feed elements MUST contain one or more atom:author elements, + // unless all of the atom:feed element's child atom:entry elements + // contain at least one atom:author element. + Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"` + + // The "atom:icon" element's content is an IRI reference [RFC3987] that + // identifies an image that provides iconic visual identification for a + // feed. + // + // atom:feed elements MUST NOT contain more than one atom:icon element. + Icon string `xml:"http://www.w3.org/2005/Atom icon"` + + // The "atom:logo" element's content is an IRI reference [RFC3987] that + // identifies an image that provides visual identification for a feed. + // + // atom:feed elements MUST NOT contain more than one atom:logo element. + Logo string `xml:"http://www.w3.org/2005/Atom logo"` + + // atom:feed elements SHOULD contain one atom:link element with a rel + // attribute value of "self". This is the preferred URI for + // retrieving Atom Feed Documents representing this Atom feed. + // + // atom:feed elements MUST NOT contain more than one atom:link + // element with a rel attribute value of "alternate" that has the + // same combination of type and hreflang attribute values. + Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"` + + // The "atom:category" element conveys information about a category + // associated with an entry or feed. This specification assigns no + // meaning to the content (if any) of this element. + // + // atom:feed elements MAY contain any number of atom:category + // elements. + Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"` + + Entries []Atom10Entry `xml:"http://www.w3.org/2005/Atom entry"` } -func (a *atom10Feed) Transform(baseURL string) *model.Feed { - var err error +type Atom10Entry struct { + // The "atom:id" element conveys a permanent, universally unique + // identifier for an entry or feed. + // + // Its content MUST be an IRI, as defined by [RFC3987]. Note that the + // definition of "IRI" excludes relative references. Though the IRI + // might use a dereferencable scheme, Atom Processors MUST NOT assume it + // can be dereferenced. + // + // atom:entry elements MUST contain exactly one atom:id element. + ID string `xml:"http://www.w3.org/2005/Atom id"` + + // The "atom:title" element is a Text construct that conveys a human- + // readable title for an entry or feed. + // + // atom:entry elements MUST contain exactly one atom:title element. + Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"` + + // The "atom:published" element is a Date construct indicating an + // instant in time associated with an event early in the life cycle of + // the entry. + Published string `xml:"http://www.w3.org/2005/Atom published"` + + // The "atom:updated" element is a Date construct indicating the most + // recent instant in time when an entry or feed was modified in a way + // the publisher considers significant. Therefore, not all + // modifications necessarily result in a changed atom:updated value. + // + // atom:entry elements MUST contain exactly one atom:updated element. + Updated string `xml:"http://www.w3.org/2005/Atom updated"` + + // atom:entry elements MUST NOT contain more than one atom:link + // element with a rel attribute value of "alternate" that has the + // same combination of type and hreflang attribute values. + Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"` + + // atom:entry elements MUST contain an atom:summary element in either + // of the following cases: + // * the atom:entry contains an atom:content that has a "src" + // attribute (and is thus empty). + // * the atom:entry contains content that is encoded in Base64; + // i.e., the "type" attribute of atom:content is a MIME media type + // [MIMEREG], but is not an XML media type [RFC3023], does not + // begin with "text/", and does not end with "/xml" or "+xml". + // + // atom:entry elements MUST NOT contain more than one atom:summary + // element. + Summary Atom10Text `xml:"http://www.w3.org/2005/Atom summary"` + + // atom:entry elements MUST NOT contain more than one atom:content + // element. + Content Atom10Text `xml:"http://www.w3.org/2005/Atom content"` + + // The "atom:author" element is a Person construct that indicates the + // author of the entry or feed. + // + // atom:entry elements MUST contain one or more atom:author elements + Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"` + + // The "atom:category" element conveys information about a category + // associated with an entry or feed. This specification assigns no + // meaning to the content (if any) of this element. + // + // atom:entry elements MAY contain any number of atom:category + // elements. + Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"` - feed := new(model.Feed) - - feedURL := a.Links.firstLinkWithRelation("self") - feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL) - if err != nil { - feed.FeedURL = feedURL - } - - siteURL := a.Links.originalLink() - feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL) - if err != nil { - feed.SiteURL = siteURL - } - - feed.Title = html.UnescapeString(a.Title.String()) - if feed.Title == "" { - feed.Title = feed.SiteURL - } - - feed.IconURL = strings.TrimSpace(a.Icon) - - for _, entry := range a.Entries { - item := entry.Transform() - entryURL, err := urllib.AbsoluteURL(feed.SiteURL, item.URL) - if err == nil { - item.URL = entryURL - } - - if item.Author == "" { - item.Author = a.Authors.String() - } - - if item.Title == "" { - item.Title = sanitizer.TruncateHTML(item.Content, 100) - } - - if item.Title == "" { - item.Title = item.URL - } - - feed.Entries = append(feed.Entries, item) - } - - return feed -} - -type atom10Entry struct { - ID string `xml:"id"` - Title atom10Text `xml:"title"` - Published string `xml:"published"` - Updated string `xml:"updated"` - Links atomLinks `xml:"link"` - Summary atom10Text `xml:"summary"` - Content atom10Text `xml:"http://www.w3.org/2005/Atom content"` - Authors atomAuthors `xml:"author"` - Categories []atom10Category `xml:"category"` media.MediaItemElement } -func (a *atom10Entry) Transform() *model.Entry { - entry := model.NewEntry() - entry.URL = a.Links.originalLink() - entry.Date = a.entryDate() - entry.Author = a.Authors.String() - entry.Hash = a.entryHash() - entry.Content = a.entryContent() - entry.Title = a.entryTitle() - entry.Enclosures = a.entryEnclosures() - entry.CommentsURL = a.entryCommentsURL() - entry.Tags = a.entryCategories() - return entry -} - -func (a *atom10Entry) entryTitle() string { - return html.UnescapeString(a.Title.String()) -} - -func (a *atom10Entry) entryContent() string { - content := a.Content.String() - if content != "" { - return content - } - - summary := a.Summary.String() - if summary != "" { - return summary - } - - mediaDescription := a.FirstMediaDescription() - if mediaDescription != "" { - return mediaDescription - } - - return "" -} - -// Note: The published date represents the original creation date for YouTube feeds. -// Example: -// <published>2019-01-26T08:02:28+00:00</published> -// <updated>2019-01-29T07:27:27+00:00</updated> -func (a *atom10Entry) entryDate() time.Time { - dateText := a.Published - if dateText == "" { - dateText = a.Updated - } - - if dateText != "" { - result, err := date.Parse(dateText) - if err != nil { - slog.Debug("Unable to parse date from Atom 0.3 feed", - slog.String("date", dateText), - slog.String("id", a.ID), - slog.Any("error", err), - ) - return time.Now() - } - - return result - } - - return time.Now() -} - -func (a *atom10Entry) entryHash() string { - for _, value := range []string{a.ID, a.Links.originalLink()} { - if value != "" { - return crypto.Hash(value) - } - } - - return "" +// A Text construct contains human-readable text, usually in small +// quantities. The content of Text constructs is Language-Sensitive. +// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1 +// Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1 +// HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2 +// XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3 +type Atom10Text struct { + Type string `xml:"type,attr"` + CharData string `xml:",chardata"` + InnerXML string `xml:",innerxml"` + XHTMLRootElement AtomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"` } -func (a *atom10Entry) entryEnclosures() model.EnclosureList { - enclosures := make(model.EnclosureList, 0) - duplicates := make(map[string]bool) - - for _, mediaThumbnail := range a.AllMediaThumbnails() { - if _, found := duplicates[mediaThumbnail.URL]; !found { - duplicates[mediaThumbnail.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaThumbnail.URL, - MimeType: mediaThumbnail.MimeType(), - Size: mediaThumbnail.Size(), - }) - } - } - - for _, link := range a.Links { - if strings.EqualFold(link.Rel, "enclosure") { - if link.URL == "" { - continue - } - - if _, found := duplicates[link.URL]; !found { - duplicates[link.URL] = true - length, _ := strconv.ParseInt(link.Length, 10, 0) - enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length}) - } - } - } - - for _, mediaContent := range a.AllMediaContents() { - if _, found := duplicates[mediaContent.URL]; !found { - duplicates[mediaContent.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaContent.URL, - MimeType: mediaContent.MimeType(), - Size: mediaContent.Size(), - }) - } - } +func (a *Atom10Text) Body() string { + var content string - for _, mediaPeerLink := range a.AllMediaPeerLinks() { - if _, found := duplicates[mediaPeerLink.URL]; !found { - duplicates[mediaPeerLink.URL] = true - enclosures = append(enclosures, &model.Enclosure{ - URL: mediaPeerLink.URL, - MimeType: mediaPeerLink.MimeType(), - Size: mediaPeerLink.Size(), - }) - } + if strings.EqualFold(a.Type, "xhtml") { + content = a.xhtmlContent() + } else { + content = a.CharData } - return enclosures + return strings.TrimSpace(content) } -func (r *atom10Entry) entryCategories() []string { - categoryList := make([]string, 0) - - for _, atomCategory := range r.Categories { - if strings.TrimSpace(atomCategory.Label) != "" { - categoryList = append(categoryList, strings.TrimSpace(atomCategory.Label)) - } else { - categoryList = append(categoryList, strings.TrimSpace(atomCategory.Term)) - } - } - - return categoryList -} +func (a *Atom10Text) Title() string { + var content string -// See https://tools.ietf.org/html/rfc4685#section-4 -// If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml". -// We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS. -func (a *atom10Entry) entryCommentsURL() string { - commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml") - if urllib.IsAbsoluteURL(commentsURL) { - return commentsURL + if strings.EqualFold(a.Type, "xhtml") { + content = a.xhtmlContent() + } else if strings.Contains(a.InnerXML, "<![CDATA[") { + content = html.UnescapeString(a.CharData) + } else { + content = a.CharData } - return "" -} - -type atom10Text struct { - Type string `xml:"type,attr"` - CharData string `xml:",chardata"` - InnerXML string `xml:",innerxml"` - XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"` -} -type atom10Category struct { - Term string `xml:"term,attr"` - Label string `xml:"label,attr"` + content = sanitizer.StripTags(content) + return strings.TrimSpace(content) } -// Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1 -// HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2 -// XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3 -func (a *atom10Text) String() string { - var content string - switch { - case a.Type == "", a.Type == "text", a.Type == "text/plain": - if strings.HasPrefix(strings.TrimSpace(a.InnerXML), `<![CDATA[`) { - content = html.EscapeString(a.CharData) - } else { - content = a.InnerXML - } - case a.Type == "xhtml": - var root = a.XHTMLRootElement - if root.XMLName.Local == "div" { - content = root.InnerXML - } else { - content = a.InnerXML - } - default: - content = a.CharData +func (a *Atom10Text) xhtmlContent() string { + if a.XHTMLRootElement.XMLName.Local == "div" { + return a.XHTMLRootElement.InnerXML } - - return strings.TrimSpace(content) + return a.InnerXML } -type atomXHTMLRootElement struct { +type AtomXHTMLRootElement struct { XMLName xml.Name `xml:"div"` InnerXML string `xml:",innerxml"` } |