From 95e7426283cf2e592589fe19f58377670b6a6ee6 Mon Sep 17 00:00:00 2001 From: nerix Date: Sun, 23 Apr 2023 00:58:37 +0200 Subject: [PATCH] Remove Redundant Parsing of Links (#4507) Co-authored-by: pajlada --- CHANGELOG.md | 1 + benchmarks/src/LinkParser.cpp | 7 +- src/common/LinkParser.cpp | 34 +++-- src/common/LinkParser.hpp | 20 ++- .../commands/CommandController.cpp | 11 +- src/messages/MessageBuilder.cpp | 93 ++++-------- src/messages/MessageBuilder.hpp | 5 +- src/messages/search/LinkPredicate.cpp | 4 +- src/providers/twitch/TwitchMessageBuilder.cpp | 7 +- tests/src/LinkParser.cpp | 133 +++++++++++------- 10 files changed, 168 insertions(+), 147 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50d0f99aa..6db61c51a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ - Dev: Add scripting capabilities with Lua (#4341, #4504) - Dev: Conan 2.0 is now used instead of Conan 1.0. (#4417) - Dev: Added tests and benchmarks for `LinkParser`. (#4436) +- Dev: Removed redundant parsing of links. (#4507) - Dev: Experimental builds with Qt 6 are now provided. (#4522, #4551, #4553, #4554, #4555, #4556) - Dev: Removed `CHATTERINO_TEST` definitions. (#4526) - Dev: Builds for macOS now have `macos` in their name (previously: `osx`). (#4550) diff --git a/benchmarks/src/LinkParser.cpp b/benchmarks/src/LinkParser.cpp index 9178e63ee..3a3312428 100644 --- a/benchmarks/src/LinkParser.cpp +++ b/benchmarks/src/LinkParser.cpp @@ -24,7 +24,7 @@ static void BM_LinkParsing(benchmark::State &state) // Make sure the TLDs are loaded { - benchmark::DoNotOptimize(LinkParser("xd.com").getCaptured()); + benchmark::DoNotOptimize(LinkParser("xd.com").result()); } for (auto _ : state) @@ -32,10 +32,7 @@ static void BM_LinkParsing(benchmark::State &state) for (auto word : words) { LinkParser parser(word); - if (parser.hasMatch()) - { - benchmark::DoNotOptimize(parser.getCaptured()); - } + benchmark::DoNotOptimize(parser.result()); } } } diff --git a/src/common/LinkParser.cpp b/src/common/LinkParser.cpp index 7e82359b9..5587189c8 100644 --- a/src/common/LinkParser.cpp +++ b/src/common/LinkParser.cpp @@ -67,7 +67,7 @@ namespace { LinkParser::LinkParser(const QString &unparsedString) { - this->match_ = unparsedString; + ParsedLink result; // This is not implemented with a regex to increase performance. // We keep removing parts of the url until there's either nothing left or we fail. @@ -79,11 +79,13 @@ LinkParser::LinkParser(const QString &unparsedString) if (l.startsWith("https://", Qt::CaseInsensitive)) { hasHttp = true; + result.protocol = l.mid(0, 8); l = l.mid(8); } else if (l.startsWith("http://", Qt::CaseInsensitive)) { hasHttp = true; + result.protocol = l.mid(0, 7); l = l.mid(7); } @@ -92,8 +94,10 @@ LinkParser::LinkParser(const QString &unparsedString) // Host `a.b.c.com` QStringRef host = l; + ParsedLink::StringView rest; bool lastWasDot = true; bool inIpv6 = false; + bool hasMatch = false; for (int i = 0; i < l.size(); i++) { @@ -111,24 +115,28 @@ LinkParser::LinkParser(const QString &unparsedString) if (l[i] == ':' && !inIpv6) { host = l.mid(0, i); + rest = l.mid(i); l = l.mid(i + 1); goto parsePort; } else if (l[i] == '/') { host = l.mid(0, i); + rest = l.mid(i); l = l.mid(i + 1); goto parsePath; } else if (l[i] == '?') { host = l.mid(0, i); + rest = l.mid(i); l = l.mid(i + 1); goto parseQuery; } else if (l[i] == '#') { host = l.mid(0, i); + rest = l.mid(i); l = l.mid(i + 1); goto parseAnchor; } @@ -176,32 +184,28 @@ parseAnchor: done: // check host - this->hasMatch_ = isValidHostname(host) || isValidIpv4(host) + hasMatch = isValidHostname(host) || isValidIpv4(host) #ifdef C_MATCH_IPV6_LINK - || (hasHttp && isValidIpv6(host)) + || (hasHttp && isValidIpv6(host)) #endif ; - if (this->hasMatch_) + if (hasMatch) { - this->match_ = unparsedString; + result.host = host; + result.rest = rest; + result.source = unparsedString; + this->result_ = std::move(result); } - return; - error: - hasMatch_ = false; + return; } -bool LinkParser::hasMatch() const +const std::optional &LinkParser::result() const { - return this->hasMatch_; -} - -QString LinkParser::getCaptured() const -{ - return this->match_; + return this->result_; } } // namespace chatterino diff --git a/src/common/LinkParser.hpp b/src/common/LinkParser.hpp index 8350f4668..16bfe235e 100644 --- a/src/common/LinkParser.hpp +++ b/src/common/LinkParser.hpp @@ -2,19 +2,31 @@ #include +#include + namespace chatterino { +struct ParsedLink { +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) + using StringView = QStringView; +#else + using StringView = QStringRef; +#endif + StringView protocol; + StringView host; + StringView rest; + QString source; +}; + class LinkParser { public: explicit LinkParser(const QString &unparsedString); - bool hasMatch() const; - QString getCaptured() const; + const std::optional &result() const; private: - bool hasMatch_{false}; - QString match_; + std::optional result_{}; }; } // namespace chatterino diff --git a/src/controllers/commands/CommandController.cpp b/src/controllers/commands/CommandController.cpp index 302dfd9d3..39b9293f7 100644 --- a/src/controllers/commands/CommandController.cpp +++ b/src/controllers/commands/CommandController.cpp @@ -2,6 +2,7 @@ #include "Application.hpp" #include "common/Env.hpp" +#include "common/LinkParser.hpp" #include "common/NetworkResult.hpp" #include "common/QLogging.hpp" #include "common/SignalVector.hpp" @@ -148,15 +149,15 @@ bool appendWhisperMessageWordsLocally(const QStringList &words) void operator()(const QString &string, MessageBuilder &b) const { - auto linkString = b.matchLink(string); - if (linkString.isEmpty()) + LinkParser parser(string); + if (parser.result()) { - b.emplace(string, - MessageElementFlag::Text); + b.addLink(*parser.result()); } else { - b.addLink(string, linkString); + b.emplace(string, + MessageElementFlag::Text); } } } visitor; diff --git a/src/messages/MessageBuilder.cpp b/src/messages/MessageBuilder.cpp index bc0cfdbab..0eb80017c 100644 --- a/src/messages/MessageBuilder.cpp +++ b/src/messages/MessageBuilder.cpp @@ -240,10 +240,10 @@ MessageBuilder::MessageBuilder(SystemMessageTag, const QString &text, text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts); for (const auto &word : textFragments) { - const auto linkString = this->matchLink(word); - if (!linkString.isEmpty()) + LinkParser parser(word); + if (parser.result()) { - this->addLink(word, linkString); + this->addLink(*parser.result()); continue; } @@ -707,52 +707,25 @@ std::unique_ptr MessageBuilder::releaseBack() return ptr; } -QString MessageBuilder::matchLink(const QString &string) +void MessageBuilder::addLink(const ParsedLink &parsedLink) { - LinkParser linkParser(string); - - static QRegularExpression httpRegex( - "\\bhttps?://", QRegularExpression::CaseInsensitiveOption); - static QRegularExpression ftpRegex( - "\\bftps?://", QRegularExpression::CaseInsensitiveOption); - static QRegularExpression spotifyRegex( - "\\bspotify:", QRegularExpression::CaseInsensitiveOption); - - if (!linkParser.hasMatch()) - { - return QString(); - } - - QString captured = linkParser.getCaptured(); - - if (!captured.contains(httpRegex) && !captured.contains(ftpRegex) && - !captured.contains(spotifyRegex)) - { - captured.insert(0, "http://"); - } - - return captured; -} - -void MessageBuilder::addLink(const QString &origLink, - const QString &matchedLink) -{ - static QRegularExpression domainRegex( - R"(^(?:(?:ftp|http)s?:\/\/)?([^\/]+)(?:\/.*)?$)", - QRegularExpression::CaseInsensitiveOption); - QString lowercaseLinkString; - auto match = domainRegex.match(origLink); - if (match.isValid()) + QString origLink = parsedLink.source; + QString matchedLink; + + if (parsedLink.protocol.isNull()) { - lowercaseLinkString = origLink.mid(0, match.capturedStart(1)) + - match.captured(1).toLower() + - origLink.mid(match.capturedEnd(1)); + matchedLink = QStringLiteral("http://") + parsedLink.source; } else { - lowercaseLinkString = origLink; + lowercaseLinkString += parsedLink.protocol; + matchedLink = parsedLink.source; } + + lowercaseLinkString += parsedLink.host.toString().toLower(); + lowercaseLinkString += parsedLink.rest; + auto linkElement = Link(Link::Url, matchedLink); auto textColor = MessageColor(MessageColor::Link); @@ -816,12 +789,10 @@ void MessageBuilder::addIrcMessageText(const QString &text) auto string = QString(word); // Actually just text - auto linkString = this->matchLink(string); - auto link = Link(); - - if (!linkString.isEmpty()) + LinkParser parser(string); + if (parser.result()) { - this->addLink(string, linkString); + this->addLink(*parser.result()); continue; } @@ -909,28 +880,24 @@ void MessageBuilder::addTextOrEmoji(const QString &string_) auto string = QString(string_); // Actually just text - auto linkString = this->matchLink(string); - auto link = Link(); + LinkParser linkParser(string); + if (linkParser.result()) + { + this->addLink(*linkParser.result()); + return; + } auto &&textColor = this->textColor_; - if (linkString.isEmpty()) + if (string.startsWith('@')) { - if (string.startsWith('@')) - { - this->emplace(string, MessageElementFlag::BoldUsername, - textColor, FontStyle::ChatMediumBold); - this->emplace( - string, MessageElementFlag::NonBoldUsername, textColor); - } - else - { - this->emplace(string, MessageElementFlag::Text, - textColor); - } + this->emplace(string, MessageElementFlag::BoldUsername, + textColor, FontStyle::ChatMediumBold); + this->emplace(string, MessageElementFlag::NonBoldUsername, + textColor); } else { - this->addLink(string, linkString); + this->emplace(string, MessageElementFlag::Text, textColor); } } diff --git a/src/messages/MessageBuilder.hpp b/src/messages/MessageBuilder.hpp index 9a7d68643..5814b44a4 100644 --- a/src/messages/MessageBuilder.hpp +++ b/src/messages/MessageBuilder.hpp @@ -23,6 +23,8 @@ class TextElement; struct Emote; using EmotePtr = std::shared_ptr; +struct ParsedLink; + struct SystemMessageTag { }; struct TimeoutMessageTag { @@ -94,8 +96,7 @@ public: std::weak_ptr weakOf(); void append(std::unique_ptr element); - QString matchLink(const QString &string); - void addLink(const QString &origLink, const QString &matchedLink); + void addLink(const ParsedLink &parsedLink); /** * Adds the text, applies irc colors, adds links, diff --git a/src/messages/search/LinkPredicate.cpp b/src/messages/search/LinkPredicate.cpp index be400feb7..69442069c 100644 --- a/src/messages/search/LinkPredicate.cpp +++ b/src/messages/search/LinkPredicate.cpp @@ -15,8 +15,10 @@ bool LinkPredicate::appliesToImpl(const Message &message) { for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts)) { - if (LinkParser(word).hasMatch()) + if (LinkParser(word).result()) + { return true; + } } return false; diff --git a/src/providers/twitch/TwitchMessageBuilder.cpp b/src/providers/twitch/TwitchMessageBuilder.cpp index ab95cebc0..98888fa48 100644 --- a/src/providers/twitch/TwitchMessageBuilder.cpp +++ b/src/providers/twitch/TwitchMessageBuilder.cpp @@ -1,6 +1,7 @@ #include "providers/twitch/TwitchMessageBuilder.hpp" #include "Application.hpp" +#include "common/LinkParser.hpp" #include "common/QLogging.hpp" #include "controllers/accounts/AccountController.hpp" #include "controllers/ignores/IgnoreController.hpp" @@ -465,12 +466,12 @@ void TwitchMessageBuilder::addTextOrEmoji(const QString &string_) } // Actually just text - auto linkString = this->matchLink(string); + LinkParser parsed(string); auto textColor = this->textColor_; - if (!linkString.isEmpty()) + if (parsed.result()) { - this->addLink(string, linkString); + this->addLink(*parsed.result()); return; } diff --git a/tests/src/LinkParser.cpp b/tests/src/LinkParser.cpp index 33c3a7adc..aaf35e957 100644 --- a/tests/src/LinkParser.cpp +++ b/tests/src/LinkParser.cpp @@ -6,66 +6,96 @@ using namespace chatterino; +struct Case { + QString protocol{}; + QString host{}; + QString rest{}; + + void check() const + { + auto input = this->protocol + this->host + this->rest; + LinkParser p(input); + ASSERT_TRUE(p.result().has_value()) << input.toStdString(); + + const auto &r = *p.result(); + ASSERT_EQ(r.source, input); + ASSERT_EQ(r.protocol, this->protocol) << this->protocol.toStdString(); + ASSERT_EQ(r.host, this->host) << this->host.toStdString(); + ASSERT_EQ(r.rest, this->rest) << this->rest.toStdString(); + } +}; + TEST(LinkParser, parseDomainLinks) { - const QStringList inputs = { - "https://chatterino.com", - "http://chatterino.com", - "chatterino.com", - "wiki.chatterino.com", - "https://wiki.chatterino.com", - "http://chatterino.co.uk", - "http://a.io", - "chatterino.com:80", - "wiki.chatterino.com:80", - "a.b.c.chatterino.com", - "https://a.b.c.chatterino.com/foo", - "http://chatterino.com?foo", - "http://xd.chatterino.com/#?foo", - "chatterino.com#foo", - "1.com", - "127.0.0.1.com", - "https://127.0.0.1.com", + const QList cases = { + {"https://", "chatterino.com"}, + {"http://", "chatterino.com"}, + {"", "chatterino.com"}, + {"", "wiki.chatterino.com"}, + {"https://", "wiki.chatterino.com"}, + {"http://", "chatterino.co.uk"}, + {"http://", "a.io"}, + {"", "chatterino.com", ":80"}, + {"", "wiki.chatterino.com", ":80"}, + {"", "wiki.chatterino.com", ":80/foo/bar"}, + {"", "wiki.chatterino.com", "/:80?foo/bar"}, + {"", "wiki.chatterino.com", "/127.0.0.1"}, + {"", "a.b.c.chatterino.com"}, + {"https://", "a.b.c.chatterino.com", "/foo"}, + {"http://", "chatterino.com", "?foo"}, + {"http://", "xd.chatterino.com", "/#?foo"}, + {"", "chatterino.com", "#foo"}, + {"", "1.com"}, + {"", "127.0.0.1.com"}, + {"https://", "127.0.0.1.com"}, + // test case-insensitiveness + {"HtTpS://", "127.0.0.1.CoM"}, + {"HTTP://", "XD.CHATTERINO.COM", "/#?FOO"}, + {"HTTPS://", "wikI.chatterino.com"}, + {"", "chatterino.Org", "#foo"}, + {"", "CHATTERINO.com", ""}, }; - for (const auto &input : inputs) + for (const auto &c : cases) { - LinkParser p(input); - ASSERT_TRUE(p.hasMatch()) << input.toStdString(); - ASSERT_EQ(p.getCaptured(), input); + c.check(); } } TEST(LinkParser, parseIpv4Links) { - const QStringList inputs = { - "https://127.0.0.1", - "http://127.0.0.1", - "127.0.0.1", - "127.0.0.1:8080", - "255.255.255.255", - "0.0.0.0", - "1.1.1.1", - "001.001.01.1", - "123.246.87.0", - "196.168.0.1:", - "196.168.4.2/foo", - "196.168.4.2?foo", - "http://196.168.4.0#foo", - "196.168.4.0/?#foo", - "196.168.4.0#?/foo", - "256.255.255.255", - "http://256.255.255.255", - "255.256.255.255", - "255.255.256.255", - "255.255.255.256", + const QList cases = { + {"https://", "127.0.0.1"}, + {"http://", "127.0.0.1"}, + {"", "127.0.0.1"}, + {"", "127.0.0.1", ":8080"}, + {"", "255.255.255.255"}, + {"", "0.0.0.0"}, + {"", "1.1.1.1"}, + {"", "001.001.01.1"}, + {"", "123.246.87.0"}, + {"", "196.168.0.1", ":"}, + {"", "196.168.4.2", "/foo"}, + {"", "196.168.4.2", "?foo"}, + {"http://", "196.168.4.0", "#foo"}, + {"", "196.168.4.0", "/?#foo"}, + {"", "196.168.4.0", "#?/foo"}, + {"", "256.255.255.255"}, + {"http://", "256.255.255.255"}, + {"", "255.256.255.255"}, + {"", "255.255.256.255"}, + {"", "255.255.255.256"}, + // test case-insensitiveness + {"HTTP://", "196.168.4.0", "#Foo"}, + {"HTTPS://", "196.168.4.0", "#Foo"}, + {"htTp://", "127.0.0.1"}, + {"httpS://", "127.0.0.1"}, + }; - for (const auto &input : inputs) + for (const auto &c : cases) { - LinkParser p(input); - ASSERT_TRUE(p.hasMatch()) << input.toStdString(); - ASSERT_EQ(p.getCaptured(), input); + c.check(); } } @@ -80,12 +110,14 @@ TEST(LinkParser, doesntParseInvalidIpv4Links) "1.2", "1", "1.2.3", + "htt://256.255.255.255", + "aliens://256.255.255.255", }; for (const auto &input : inputs) { LinkParser p(input); - ASSERT_FALSE(p.hasMatch()) << input.toStdString(); + ASSERT_FALSE(p.result().has_value()) << input.toStdString(); } } @@ -99,6 +131,9 @@ TEST(LinkParser, doesntParseInvalidLinks) "spotify://chatterino.com", "httpsx://chatterino.com", "https:chatterino.com", + "https:/chatterino.com", + "http:/chatterino.com", + "htp://chatterino.com", "/chatterino.com", "word", ".", @@ -114,6 +149,6 @@ TEST(LinkParser, doesntParseInvalidLinks) for (const auto &input : inputs) { LinkParser p(input); - ASSERT_FALSE(p.hasMatch()) << input.toStdString(); + ASSERT_FALSE(p.result().has_value()) << input.toStdString(); } }