Diffstat (limited to 'internal/reader/scraper/scraper.go')
-rw-r--r--  internal/reader/scraper/scraper.go  13
1 file changed, 11 insertions, 2 deletions
diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go
index 8508761f..4aabff48 100644
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 
 	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/reader/readability"
 	"miniflux.app/v2/internal/urllib"
@@ -41,17 +42,25 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules str
 	var content string
 	var err error
 
+	htmlDocumentReader, err := encoding.CharsetReaderFromContentType(
+		responseHandler.ContentType(),
+		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
+	)
+	if err != nil {
+		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+	}
+
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
 			"url", websiteURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
+		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
 			"url", websiteURL,
 		)
-		content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
+		content, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
 	if err != nil {
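
The change above routes both extraction paths (custom rules and readability) through one charset-aware reader instead of handing each the raw response body, so non-UTF-8 pages are decoded before content extraction. Below is a minimal sketch of the same technique using golang.org/x/net/html/charset directly; the helper name fetchDecodedHTML and the overall structure are illustrative only, not miniflux's internal encoding API.

package main

import (
	"fmt"
	"io"
	"net/http"

	"golang.org/x/net/html/charset"
)

// fetchDecodedHTML is a hypothetical helper: it downloads a page and wraps the
// response body in a reader that transcodes the document to UTF-8, picking the
// decoder from the Content-Type header or, failing that, from sniffed bytes
// (BOM, <meta charset> tag).
func fetchDecodedHTML(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	utf8Reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return "", fmt.Errorf("unable to read HTML document: %v", err)
	}

	// Downstream consumers (custom scraping rules, readability extraction)
	// now see UTF-8 text regardless of the document's original encoding.
	html, err := io.ReadAll(utf8Reader)
	if err != nil {
		return "", err
	}
	return string(html), nil
}

func main() {
	doc, err := fetchDecodedHTML("https://example.org/")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(len(doc), "bytes of UTF-8 HTML")
}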