diff options
author | 2024-03-22 22:32:13 +0300 | |
---|---|---|
committer | 2024-03-22 22:32:13 +0300 | |
commit | 1e4a28928f8e6ef6b6809d30232824b80c787e9b (patch) | |
tree | 1d0ac3681e6e84d978448329fb0fc189c49805c4 | |
parent | 70c0aa54f32ad611bc636430117e12b962449c8c (diff) | |
parent | 24724915d44f5d2e72719b96bd7807cc17a43cc2 (diff) | |
download | newsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.tar.gz newsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.tar.zst newsboat-1e4a28928f8e6ef6b6809d30232824b80c787e9b.zip |
Merge pull request #2706 from danieloh0714/optimize-rssignores-matches
optimize RssIgnores::matches function
-rw-r--r-- | include/rssignores.h | 6 | ||||
-rw-r--r-- | src/matcher.cpp | 4 | ||||
-rw-r--r-- | src/rssignores.cpp | 66 | ||||
-rw-r--r-- | test/rssignores.cpp | 31 |
4 files changed, 74 insertions, 33 deletions
diff --git a/include/rssignores.h b/include/rssignores.h index f38aa90c..dce190e5 100644 --- a/include/rssignores.h +++ b/include/rssignores.h @@ -2,6 +2,7 @@ #define NEWSBOAT_RSSIGNORES_H_ #include <string> +#include <unordered_map> #include <vector> #include "configactionhandler.h" @@ -24,7 +25,10 @@ public: bool matches_resetunread(const std::string& url); private: - std::vector<FeedUrlExprPair> ignores; + bool matches_expr(std::shared_ptr<Matcher> expr, RssItem* item); + + std::vector<FeedUrlExprPair> regex_ignores; + std::unordered_multimap<std::string, std::shared_ptr<Matcher>> non_regex_ignores; std::vector<std::string> ignores_lastmodified; std::vector<std::string> resetflag; diff --git a/src/matcher.cpp b/src/matcher.cpp index ff162787..7f9bae74 100644 --- a/src/matcher.cpp +++ b/src/matcher.cpp @@ -63,7 +63,7 @@ bool Matcher::matches(Matchable* item) * you only have to implement the method attribute_value(). * * The whole matching code is speed-critical, as the matching happens on - * a lot of different occassions, and slow matching can be easily + * a lot of different occasions, and slow matching can be easily * measured (and felt by the user) on slow computers with a lot of items * to match. */ @@ -194,7 +194,7 @@ bool Matcher::matches_r(expression* e, Matchable* item) return matches_r(e->l, item) || matches_r(e->r, item); // same here - /* while the other operator connect an attribute with a value */ + /* while the other operators connect an attribute with a value */ case MATCHOP_EQ: return matchop_eq(e, item); diff --git a/src/rssignores.cpp b/src/rssignores.cpp index 14c7068a..4bc77f95 100644 --- a/src/rssignores.cpp +++ b/src/rssignores.cpp @@ -58,9 +58,11 @@ void RssIgnores::handle_action(const std::string& action, _("`%s' is not a valid regular expression: %s"), pattern, errorMessage)); } - } - ignores.push_back(FeedUrlExprPair(ignore_rssurl, m)); + regex_ignores.push_back(FeedUrlExprPair(pattern, m)); + } else { + non_regex_ignores.insert({ignore_rssurl, m}); + } } else if (action == "always-download") { if (params.empty()) { throw ConfigHandlerException(ActionHandlerStatus::TOO_FEW_PARAMS); @@ -84,7 +86,7 @@ void RssIgnores::handle_action(const std::string& action, void RssIgnores::dump_config(std::vector<std::string>& config_output) const { - for (const auto& ign : ignores) { + for (const auto& ign : non_regex_ignores) { std::string configline = "ignore-article "; if (ign.first == "*") { configline.append("*"); @@ -95,6 +97,13 @@ void RssIgnores::dump_config(std::vector<std::string>& config_output) const configline.append(utils::quote(ign.second->get_expression())); config_output.push_back(configline); } + for (const auto& ign : regex_ignores) { + std::string configline = "ignore-article "; + configline.append(utils::quote(REGEX_PREFIX + ign.first)); + configline.append(" "); + configline.append(utils::quote(ign.second->get_expression())); + config_output.push_back(configline); + } for (const auto& ign_lm : ignores_lastmodified) { config_output.push_back(strprintf::fmt( "always-download %s", utils::quote(ign_lm))); @@ -105,32 +114,49 @@ void RssIgnores::dump_config(std::vector<std::string>& config_output) const } } +bool RssIgnores::matches_expr(std::shared_ptr<Matcher> expr, RssItem* item) +{ + if (expr->matches(item)) { + LOG(Level::DEBUG, + "RssIgnores::matches_expr: found match"); + return true; + } + + return false; +} + bool RssIgnores::matches(RssItem* item) { - const int prefix_len = REGEX_PREFIX.length(); - for (const auto& ign : ignores) { - bool matched = false; + auto search = non_regex_ignores.equal_range(item->feedurl()); + for (auto itr = search.first; itr != search.second; itr++) { + if (matches_expr(itr->second, item)) { + return true; + } + } + + search = non_regex_ignores.equal_range("*"); + for (auto itr = search.first; itr != search.second; itr++) { + if (matches_expr(itr->second, item)) { + return true; + } + } + + for (const auto& ign : regex_ignores) { + const std::string pattern = ign.first; + LOG(Level::DEBUG, "RssIgnores::matches: ign.first = `%s' item->feedurl = `%s'", - ign.first, + pattern, item->feedurl()); - if (!ign.first.compare(0, prefix_len, REGEX_PREFIX)) { - const std::string pattern = ign.first.substr(prefix_len, ign.first.length() - prefix_len); - std::string errorMessage; - const auto regex = Regex::compile(pattern, REG_EXTENDED | REG_ICASE, errorMessage); - const auto matches = regex->matches(item->feedurl(), 1, 0); - matched = !matches.empty(); - } else { - matched = ign.first == "*" || item->feedurl() == ign.first; - } - - if (matched && ign.second->matches(item)) { - LOG(Level::DEBUG, - "RssIgnores::matches: found match"); + std::string errorMessage; + const auto regex = Regex::compile(pattern, REG_EXTENDED | REG_ICASE, errorMessage); + const auto matches = regex->matches(item->feedurl(), 1, 0); + if (!matches.empty() && matches_expr(ign.second, item)) { return true; } } + return false; } diff --git a/test/rssignores.cpp b/test/rssignores.cpp index f9d33583..a58f3c8e 100644 --- a/test/rssignores.cpp +++ b/test/rssignores.cpp @@ -1,5 +1,7 @@ #include "rssignores.h" +#include <set> + #include "3rd-party/catch.hpp" #include "cache.h" @@ -135,6 +137,7 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings " ignores.handle_action(action, {"https://example.com/feed.xml", "author =~ \"Joe\""}); ignores.handle_action(action, {"*", "title # \"interesting\""}); ignores.handle_action(action, {"https://blog.example.com/joe/posts.xml", "guid # 123"}); + ignores.handle_action(action, {"regex:^https://.*", "author = \"John Doe\""}); std::vector<std::string> config; const auto comment = @@ -143,13 +146,18 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings " ignores.dump_config(config); - REQUIRE(config.size() == 4); // three actions plus one comment - REQUIRE(config[0] == comment); - REQUIRE(config[1] == - R"#(ignore-article "https://example.com/feed.xml" "author =~ \"Joe\"")#"); - REQUIRE(config[2] == R"#(ignore-article * "title # \"interesting\"")#"); - REQUIRE(config[3] == - R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#"); + std::set<std::string> config_set(config.begin(), config.end()); + + REQUIRE(config.size() == 5); // four actions plus one comment + REQUIRE(config_set.count(comment) == 1); + REQUIRE(config_set.count( + R"#(ignore-article "https://example.com/feed.xml" "author =~ \"Joe\"")#") == 1); + REQUIRE(config_set.count( + R"#(ignore-article * "title # \"interesting\"")#") == 1); + REQUIRE(config_set.count( + R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#") == 1); + REQUIRE(config_set.count( + R"#(ignore-article "regex:^https://.*" "author = \"John Doe\"")#") == 1); } SECTION("`always-download`") { @@ -209,11 +217,14 @@ TEST_CASE("RssIgnores::dump_config() writes out all configured settings " ignores.dump_config(config); + std::set<std::string> config_set(config.begin(), config.end()); + REQUIRE(config.size() == 10); REQUIRE(config[0] == comment); - REQUIRE(config[1] == R"#(ignore-article * "title # \"interesting\"")#"); - REQUIRE(config[2] == - R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#"); + REQUIRE(config_set.count( + R"#(ignore-article * "title # \"interesting\"")#") == 1); + REQUIRE(config_set.count( + R"#(ignore-article "https://blog.example.com/joe/posts.xml" "guid # 123")#") == 1); REQUIRE(config[3] == R"#(always-download "url1")#"); REQUIRE(config[4] == R"#(always-download "url2")#"); REQUIRE(config[5] == R"#(always-download "url3")#"); |