summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Alexander Batischev <eual.jp@gmail.com> 2024-03-22 22:32:13 +0300
committerGravatar GitHub <noreply@github.com> 2024-03-22 22:32:13 +0300
commit1e4a28928f8e6ef6b6809d30232824b80c787e9b (patch)
tree1d0ac3681e6e84d978448329fb0fc189c49805c4
parent70c0aa54f32ad611bc636430117e12b962449c8c (diff)
parent24724915d44f5d2e72719b96bd7807cc17a43cc2 (diff)
downloadnewsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.tar.gz
newsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.tar.zst
newsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.zip
Merge pull request #2706 from danieloh0714/optimize-rssignores-matches
optimize RssIgnores::matches function
-rw-r--r--include/rssignores.h6
-rw-r--r--src/matcher.cpp4
-rw-r--r--src/rssignores.cpp66
-rw-r--r--test/rssignores.cpp31
4 files changed, 74 insertions, 33 deletions
diff --git a/include/rssignores.h b/include/rssignores.h
index f38aa90c..dce190e5 100644
--- a/include/rssignores.h
+++ b/include/rssignores.h
@@ -2,6 +2,7 @@
#define NEWSBOAT_RSSIGNORES_H_
#include <string>
+#include <unordered_map>
#include <vector>
#include "configactionhandler.h"
@@ -24,7 +25,10 @@ public:
bool matches_resetunread(const std::string& url);
private:
- std::vector<FeedUrlExprPair> ignores;
+ bool matches_expr(std::shared_ptr<Matcher> expr, RssItem* item);
+
+ std::vector<FeedUrlExprPair> regex_ignores;
+ std::unordered_multimap<std::string, std::shared_ptr<Matcher>> non_regex_ignores;
std::vector<std::string> ignores_lastmodified;
std::vector<std::string> resetflag;
diff --git a/src/matcher.cpp b/src/matcher.cpp
index ff162787..7f9bae74 100644
--- a/src/matcher.cpp
+++ b/src/matcher.cpp
@@ -63,7 +63,7 @@ bool Matcher::matches(Matchable* item)
* you only have to implement the method attribute_value().
*
* The whole matching code is speed-critical, as the matching happens on
- * a lot of different occassions, and slow matching can be easily
+ * a lot of different occasions, and slow matching can be easily
* measured (and felt by the user) on slow computers with a lot of items
* to match.
*/
@@ -194,7 +194,7 @@ bool Matcher::matches_r(expression* e, Matchable* item)
return matches_r(e->l, item) ||
matches_r(e->r, item); // same here
- /* while the other operator connect an attribute with a value */
+ /* while the other operators connect an attribute with a value */
case MATCHOP_EQ:
return matchop_eq(e, item);
diff --git a/src/rssignores.cpp b/src/rssignores.cpp
index 14c7068a..4bc77f95 100644
--- a/src/rssignores.cpp
+++ b/src/rssignores.cpp
@@ -58,9 +58,11 @@ void RssIgnores::handle_action(const std::string& action,
_("`%s' is not a valid regular expression: %s"),
pattern, errorMessage));
}
- }
- ignores.push_back(FeedUrlExprPair(ignore_rssurl, m));
+ regex_ignores.push_back(FeedUrlExprPair(pattern, m));
+ } else {
+ non_regex_ignores.insert({ignore_rssurl, m});
+ }
} else if (action == "always-download") {
if (params.empty()) {
throw ConfigHandlerException(ActionHandlerStatus::TOO_FEW_PARAMS);
@@ -84,7 +86,7 @@ void RssIgnores::handle_action(const std::string& action,
void RssIgnores::dump_config(std::vector<std::string>& config_output) const
{
- for (const auto& ign : ignores) {
+ for (const auto& ign : non_regex_ignores) {
std::string configline = "ignore-article ";
if (ign.first == "*") {
configline.append("*");
@@ -95,6 +97,13 @@ void RssIgnores::dump_config(std::vector<std::string>& config_output) const
configline.append(utils::quote(ign.second->get_expression()));
config_output.push_back(configline);
}
+ for (const auto& ign : regex_ignores) {
+ std::string configline = "ignore-article ";
+ configline.append(utils::quote(REGEX_PREFIX + ign.first));
+ configline.append(" ");
+ configline.append(utils::quote(ign.second->get_expression()));
+ config_output.push_back(configline);
+ }
for (const auto& ign_lm : ignores_lastmodified) {
config_output.push_back(strprintf::fmt(
"always-download %s", utils::quote(ign_lm)));
@@ -105,32 +114,49 @@ void RssIgnores::dump_config(std::vector<std::string>& config_output) const
}
}
+bool RssIgnores::matches_expr(std::shared_ptr<Matcher> expr, RssItem* item)
+{
+ if (expr->matches(item)) {
+ LOG(Level::DEBUG,
+ "RssIgnores::matches_expr: found match");
+ return true;
+ }
+
+ return false;
+}
+
bool RssIgnores::matches(RssItem* item)
{
- const int prefix_len = REGEX_PREFIX.length();
- for (const auto& ign : ignores) {
- bool matched = false;
+ auto search = non_regex_ignores.equal_range(item->feedurl());
+ for (auto itr = search.first; itr != search.second; itr++) {
+ if (matches_expr(itr->second, item)) {
+ return true;
+ }
+ }
+
+ search = non_regex_ignores.equal_range("*");
+ for (auto itr = search.first; itr != search.second; itr++) {
+ if (matches_expr(itr->second, item)) {
+ return true;
+ }
+ }
+
+ for (const auto& ign : regex_ignores) {
+ const std::string pattern = ign.first;
+
LOG(Level::DEBUG,
"RssIgnores::matches: ign.first = `%s' item->feedurl = `%s'",
- ign.first,
+ pattern,
item->feedurl());
- if (!ign.first.compare(0, prefix_len, REGEX_PREFIX)) {
- const std::string pattern = ign.first.substr(prefix_len, ign.first.length() - prefix_len);
- std::string errorMessage;
- const auto regex = Regex::compile(pattern, REG_EXTENDED | REG_ICASE, errorMessage);
- const auto matches = regex->matches(item->feedurl(), 1, 0);
- matched = !matches.empty();
- } else {
- matched = ign.first == "*" || item->feedurl() == ign.first;
- }
-
- if (matched && ign.second->matches(item)) {
- LOG(Level::DEBUG,
- "RssIgnores::matches: found match");
+ std::string errorMessage;
+ const auto regex = Regex::compile(pattern, REG_EXTENDED | REG_ICASE, errorMessage);
+ const auto matches = regex->matches(item->feedurl(), 1, 0);
+ if (!matches.empty() && matches_expr(ign.second, item)) {
return true;
}
}
+
return false;
}
diff --git a/test/rssignores.cpp b/test/rssignores.cpp
index f9d33583..a58f3c8e 100644
--- a/test/rssignores.cpp
+++ b/test/rssignores.cpp
@@ -1,5 +1,7 @@
#include "rssignores.h"
+#include <set>
+
#include "3rd-party/catch.hpp"
#include "cache.h"
@@ -135,6 +137,7 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings "
ignores.handle_action(action, {"https://example.com/feed.xml", "author =~ \"Joe\""});
ignores.handle_action(action, {"*", "title # \"interesting\""});
ignores.handle_action(action, {"https://blog.example.com/joe/posts.xml", "guid # 123"});
+ ignores.handle_action(action, {"regex:^https://.*", "author = \"John Doe\""});
std::vector<std::string> config;
const auto comment =
@@ -143,13 +146,18 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings "
ignores.dump_config(config);
- REQUIRE(config.size() == 4); // three actions plus one comment
- REQUIRE(config[0] == comment);
- REQUIRE(config[1] ==
- R"#(ignore-article "https://example.com/feed.xml" "author =~ \"Joe\"")#");
- REQUIRE(config[2] == R"#(ignore-article * "title # \"interesting\"")#");
- REQUIRE(config[3] ==
- R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#");
+ std::set<std::string> config_set(config.begin(), config.end());
+
+ REQUIRE(config.size() == 5); // four actions plus one comment
+ REQUIRE(config_set.count(comment) == 1);
+ REQUIRE(config_set.count(
+ R"#(ignore-article "https://example.com/feed.xml" "author =~ \"Joe\"")#") == 1);
+ REQUIRE(config_set.count(
+ R"#(ignore-article * "title # \"interesting\"")#") == 1);
+ REQUIRE(config_set.count(
+ R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#") == 1);
+ REQUIRE(config_set.count(
+ R"#(ignore-article "regex:^https://.*" "author = \"John Doe\"")#") == 1);
}
SECTION("`always-download`") {
@@ -209,11 +217,14 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings "
ignores.dump_config(config);
+ std::set<std::string> config_set(config.begin(), config.end());
+
REQUIRE(config.size() == 10);
REQUIRE(config[0] == comment);
- REQUIRE(config[1] == R"#(ignore-article * "title # \"interesting\"")#");
- REQUIRE(config[2] ==
- R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#");
+ REQUIRE(config_set.count(
+ R"#(ignore-article * "title # \"interesting\"")#") == 1);
+ REQUIRE(config_set.count(
+ R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#") == 1);
REQUIRE(config[3] == R"#(always-download "url1")#");
REQUIRE(config[4] == R"#(always-download "url2")#");
REQUIRE(config[5] == R"#(always-download "url3")#");