Remove Redundant Parsing of Links (#4507)

Co-authored-by: pajlada <rasmus.karlsson@pajlada.com>
This commit is contained in:
nerix 2023-04-23 00:58:37 +02:00 committed by GitHub
parent f2938995c1
commit 95e7426283
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 168 additions and 147 deletions

View file

@ -35,6 +35,7 @@
- Dev: Add scripting capabilities with Lua (#4341, #4504) - Dev: Add scripting capabilities with Lua (#4341, #4504)
- Dev: Conan 2.0 is now used instead of Conan 1.0. (#4417) - Dev: Conan 2.0 is now used instead of Conan 1.0. (#4417)
- Dev: Added tests and benchmarks for `LinkParser`. (#4436) - Dev: Added tests and benchmarks for `LinkParser`. (#4436)
- Dev: Removed redundant parsing of links. (#4507)
- Dev: Experimental builds with Qt 6 are now provided. (#4522, #4551, #4553, #4554, #4555, #4556) - Dev: Experimental builds with Qt 6 are now provided. (#4522, #4551, #4553, #4554, #4555, #4556)
- Dev: Removed `CHATTERINO_TEST` definitions. (#4526) - Dev: Removed `CHATTERINO_TEST` definitions. (#4526)
- Dev: Builds for macOS now have `macos` in their name (previously: `osx`). (#4550) - Dev: Builds for macOS now have `macos` in their name (previously: `osx`). (#4550)

View file

@ -24,7 +24,7 @@ static void BM_LinkParsing(benchmark::State &state)
// Make sure the TLDs are loaded // Make sure the TLDs are loaded
{ {
benchmark::DoNotOptimize(LinkParser("xd.com").getCaptured()); benchmark::DoNotOptimize(LinkParser("xd.com").result());
} }
for (auto _ : state) for (auto _ : state)
@ -32,10 +32,7 @@ static void BM_LinkParsing(benchmark::State &state)
for (auto word : words) for (auto word : words)
{ {
LinkParser parser(word); LinkParser parser(word);
if (parser.hasMatch()) benchmark::DoNotOptimize(parser.result());
{
benchmark::DoNotOptimize(parser.getCaptured());
}
} }
} }
} }

View file

@ -67,7 +67,7 @@ namespace {
LinkParser::LinkParser(const QString &unparsedString) LinkParser::LinkParser(const QString &unparsedString)
{ {
this->match_ = unparsedString; ParsedLink result;
// This is not implemented with a regex to increase performance. // This is not implemented with a regex to increase performance.
// We keep removing parts of the url until there's either nothing left or we fail. // We keep removing parts of the url until there's either nothing left or we fail.
@ -79,11 +79,13 @@ LinkParser::LinkParser(const QString &unparsedString)
if (l.startsWith("https://", Qt::CaseInsensitive)) if (l.startsWith("https://", Qt::CaseInsensitive))
{ {
hasHttp = true; hasHttp = true;
result.protocol = l.mid(0, 8);
l = l.mid(8); l = l.mid(8);
} }
else if (l.startsWith("http://", Qt::CaseInsensitive)) else if (l.startsWith("http://", Qt::CaseInsensitive))
{ {
hasHttp = true; hasHttp = true;
result.protocol = l.mid(0, 7);
l = l.mid(7); l = l.mid(7);
} }
@ -92,8 +94,10 @@ LinkParser::LinkParser(const QString &unparsedString)
// Host `a.b.c.com` // Host `a.b.c.com`
QStringRef host = l; QStringRef host = l;
ParsedLink::StringView rest;
bool lastWasDot = true; bool lastWasDot = true;
bool inIpv6 = false; bool inIpv6 = false;
bool hasMatch = false;
for (int i = 0; i < l.size(); i++) for (int i = 0; i < l.size(); i++)
{ {
@ -111,24 +115,28 @@ LinkParser::LinkParser(const QString &unparsedString)
if (l[i] == ':' && !inIpv6) if (l[i] == ':' && !inIpv6)
{ {
host = l.mid(0, i); host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1); l = l.mid(i + 1);
goto parsePort; goto parsePort;
} }
else if (l[i] == '/') else if (l[i] == '/')
{ {
host = l.mid(0, i); host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1); l = l.mid(i + 1);
goto parsePath; goto parsePath;
} }
else if (l[i] == '?') else if (l[i] == '?')
{ {
host = l.mid(0, i); host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1); l = l.mid(i + 1);
goto parseQuery; goto parseQuery;
} }
else if (l[i] == '#') else if (l[i] == '#')
{ {
host = l.mid(0, i); host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1); l = l.mid(i + 1);
goto parseAnchor; goto parseAnchor;
} }
@ -176,32 +184,28 @@ parseAnchor:
done: done:
// check host // check host
this->hasMatch_ = isValidHostname(host) || isValidIpv4(host) hasMatch = isValidHostname(host) || isValidIpv4(host)
#ifdef C_MATCH_IPV6_LINK #ifdef C_MATCH_IPV6_LINK
|| (hasHttp && isValidIpv6(host)) || (hasHttp && isValidIpv6(host))
#endif #endif
; ;
if (this->hasMatch_) if (hasMatch)
{ {
this->match_ = unparsedString; result.host = host;
result.rest = rest;
result.source = unparsedString;
this->result_ = std::move(result);
} }
return;
error: error:
hasMatch_ = false; return;
} }
bool LinkParser::hasMatch() const const std::optional<ParsedLink> &LinkParser::result() const
{ {
return this->hasMatch_; return this->result_;
}
QString LinkParser::getCaptured() const
{
return this->match_;
} }
} // namespace chatterino } // namespace chatterino

View file

@ -2,19 +2,31 @@
#include <QString> #include <QString>
#include <optional>
namespace chatterino { namespace chatterino {
struct ParsedLink {
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
using StringView = QStringView;
#else
using StringView = QStringRef;
#endif
StringView protocol;
StringView host;
StringView rest;
QString source;
};
class LinkParser class LinkParser
{ {
public: public:
explicit LinkParser(const QString &unparsedString); explicit LinkParser(const QString &unparsedString);
bool hasMatch() const; const std::optional<ParsedLink> &result() const;
QString getCaptured() const;
private: private:
bool hasMatch_{false}; std::optional<ParsedLink> result_{};
QString match_;
}; };
} // namespace chatterino } // namespace chatterino

View file

@ -2,6 +2,7 @@
#include "Application.hpp" #include "Application.hpp"
#include "common/Env.hpp" #include "common/Env.hpp"
#include "common/LinkParser.hpp"
#include "common/NetworkResult.hpp" #include "common/NetworkResult.hpp"
#include "common/QLogging.hpp" #include "common/QLogging.hpp"
#include "common/SignalVector.hpp" #include "common/SignalVector.hpp"
@ -148,15 +149,15 @@ bool appendWhisperMessageWordsLocally(const QStringList &words)
void operator()(const QString &string, void operator()(const QString &string,
MessageBuilder &b) const MessageBuilder &b) const
{ {
auto linkString = b.matchLink(string); LinkParser parser(string);
if (linkString.isEmpty()) if (parser.result())
{ {
b.emplace<TextElement>(string, b.addLink(*parser.result());
MessageElementFlag::Text);
} }
else else
{ {
b.addLink(string, linkString); b.emplace<TextElement>(string,
MessageElementFlag::Text);
} }
} }
} visitor; } visitor;

View file

@ -240,10 +240,10 @@ MessageBuilder::MessageBuilder(SystemMessageTag, const QString &text,
text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts); text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts);
for (const auto &word : textFragments) for (const auto &word : textFragments)
{ {
const auto linkString = this->matchLink(word); LinkParser parser(word);
if (!linkString.isEmpty()) if (parser.result())
{ {
this->addLink(word, linkString); this->addLink(*parser.result());
continue; continue;
} }
@ -707,52 +707,25 @@ std::unique_ptr<MessageElement> MessageBuilder::releaseBack()
return ptr; return ptr;
} }
QString MessageBuilder::matchLink(const QString &string) void MessageBuilder::addLink(const ParsedLink &parsedLink)
{ {
LinkParser linkParser(string);
static QRegularExpression httpRegex(
"\\bhttps?://", QRegularExpression::CaseInsensitiveOption);
static QRegularExpression ftpRegex(
"\\bftps?://", QRegularExpression::CaseInsensitiveOption);
static QRegularExpression spotifyRegex(
"\\bspotify:", QRegularExpression::CaseInsensitiveOption);
if (!linkParser.hasMatch())
{
return QString();
}
QString captured = linkParser.getCaptured();
if (!captured.contains(httpRegex) && !captured.contains(ftpRegex) &&
!captured.contains(spotifyRegex))
{
captured.insert(0, "http://");
}
return captured;
}
void MessageBuilder::addLink(const QString &origLink,
const QString &matchedLink)
{
static QRegularExpression domainRegex(
R"(^(?:(?:ftp|http)s?:\/\/)?([^\/]+)(?:\/.*)?$)",
QRegularExpression::CaseInsensitiveOption);
QString lowercaseLinkString; QString lowercaseLinkString;
auto match = domainRegex.match(origLink); QString origLink = parsedLink.source;
if (match.isValid()) QString matchedLink;
if (parsedLink.protocol.isNull())
{ {
lowercaseLinkString = origLink.mid(0, match.capturedStart(1)) + matchedLink = QStringLiteral("http://") + parsedLink.source;
match.captured(1).toLower() +
origLink.mid(match.capturedEnd(1));
} }
else else
{ {
lowercaseLinkString = origLink; lowercaseLinkString += parsedLink.protocol;
matchedLink = parsedLink.source;
} }
lowercaseLinkString += parsedLink.host.toString().toLower();
lowercaseLinkString += parsedLink.rest;
auto linkElement = Link(Link::Url, matchedLink); auto linkElement = Link(Link::Url, matchedLink);
auto textColor = MessageColor(MessageColor::Link); auto textColor = MessageColor(MessageColor::Link);
@ -816,12 +789,10 @@ void MessageBuilder::addIrcMessageText(const QString &text)
auto string = QString(word); auto string = QString(word);
// Actually just text // Actually just text
auto linkString = this->matchLink(string); LinkParser parser(string);
auto link = Link(); if (parser.result())
if (!linkString.isEmpty())
{ {
this->addLink(string, linkString); this->addLink(*parser.result());
continue; continue;
} }
@ -909,28 +880,24 @@ void MessageBuilder::addTextOrEmoji(const QString &string_)
auto string = QString(string_); auto string = QString(string_);
// Actually just text // Actually just text
auto linkString = this->matchLink(string); LinkParser linkParser(string);
auto link = Link(); if (linkParser.result())
{
this->addLink(*linkParser.result());
return;
}
auto &&textColor = this->textColor_; auto &&textColor = this->textColor_;
if (linkString.isEmpty())
{
if (string.startsWith('@')) if (string.startsWith('@'))
{ {
this->emplace<TextElement>(string, MessageElementFlag::BoldUsername, this->emplace<TextElement>(string, MessageElementFlag::BoldUsername,
textColor, FontStyle::ChatMediumBold); textColor, FontStyle::ChatMediumBold);
this->emplace<TextElement>( this->emplace<TextElement>(string, MessageElementFlag::NonBoldUsername,
string, MessageElementFlag::NonBoldUsername, textColor);
}
else
{
this->emplace<TextElement>(string, MessageElementFlag::Text,
textColor); textColor);
} }
}
else else
{ {
this->addLink(string, linkString); this->emplace<TextElement>(string, MessageElementFlag::Text, textColor);
} }
} }

View file

@ -23,6 +23,8 @@ class TextElement;
struct Emote; struct Emote;
using EmotePtr = std::shared_ptr<const Emote>; using EmotePtr = std::shared_ptr<const Emote>;
struct ParsedLink;
struct SystemMessageTag { struct SystemMessageTag {
}; };
struct TimeoutMessageTag { struct TimeoutMessageTag {
@ -94,8 +96,7 @@ public:
std::weak_ptr<Message> weakOf(); std::weak_ptr<Message> weakOf();
void append(std::unique_ptr<MessageElement> element); void append(std::unique_ptr<MessageElement> element);
QString matchLink(const QString &string); void addLink(const ParsedLink &parsedLink);
void addLink(const QString &origLink, const QString &matchedLink);
/** /**
* Adds the text, applies irc colors, adds links, * Adds the text, applies irc colors, adds links,

View file

@ -15,9 +15,11 @@ bool LinkPredicate::appliesToImpl(const Message &message)
{ {
for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts)) for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts))
{ {
if (LinkParser(word).hasMatch()) if (LinkParser(word).result())
{
return true; return true;
} }
}
return false; return false;
} }

View file

@ -1,6 +1,7 @@
#include "providers/twitch/TwitchMessageBuilder.hpp" #include "providers/twitch/TwitchMessageBuilder.hpp"
#include "Application.hpp" #include "Application.hpp"
#include "common/LinkParser.hpp"
#include "common/QLogging.hpp" #include "common/QLogging.hpp"
#include "controllers/accounts/AccountController.hpp" #include "controllers/accounts/AccountController.hpp"
#include "controllers/ignores/IgnoreController.hpp" #include "controllers/ignores/IgnoreController.hpp"
@ -465,12 +466,12 @@ void TwitchMessageBuilder::addTextOrEmoji(const QString &string_)
} }
// Actually just text // Actually just text
auto linkString = this->matchLink(string); LinkParser parsed(string);
auto textColor = this->textColor_; auto textColor = this->textColor_;
if (!linkString.isEmpty()) if (parsed.result())
{ {
this->addLink(string, linkString); this->addLink(*parsed.result());
return; return;
} }

View file

@ -6,66 +6,96 @@
using namespace chatterino; using namespace chatterino;
struct Case {
QString protocol{};
QString host{};
QString rest{};
void check() const
{
auto input = this->protocol + this->host + this->rest;
LinkParser p(input);
ASSERT_TRUE(p.result().has_value()) << input.toStdString();
const auto &r = *p.result();
ASSERT_EQ(r.source, input);
ASSERT_EQ(r.protocol, this->protocol) << this->protocol.toStdString();
ASSERT_EQ(r.host, this->host) << this->host.toStdString();
ASSERT_EQ(r.rest, this->rest) << this->rest.toStdString();
}
};
TEST(LinkParser, parseDomainLinks) TEST(LinkParser, parseDomainLinks)
{ {
const QStringList inputs = { const QList<Case> cases = {
"https://chatterino.com", {"https://", "chatterino.com"},
"http://chatterino.com", {"http://", "chatterino.com"},
"chatterino.com", {"", "chatterino.com"},
"wiki.chatterino.com", {"", "wiki.chatterino.com"},
"https://wiki.chatterino.com", {"https://", "wiki.chatterino.com"},
"http://chatterino.co.uk", {"http://", "chatterino.co.uk"},
"http://a.io", {"http://", "a.io"},
"chatterino.com:80", {"", "chatterino.com", ":80"},
"wiki.chatterino.com:80", {"", "wiki.chatterino.com", ":80"},
"a.b.c.chatterino.com", {"", "wiki.chatterino.com", ":80/foo/bar"},
"https://a.b.c.chatterino.com/foo", {"", "wiki.chatterino.com", "/:80?foo/bar"},
"http://chatterino.com?foo", {"", "wiki.chatterino.com", "/127.0.0.1"},
"http://xd.chatterino.com/#?foo", {"", "a.b.c.chatterino.com"},
"chatterino.com#foo", {"https://", "a.b.c.chatterino.com", "/foo"},
"1.com", {"http://", "chatterino.com", "?foo"},
"127.0.0.1.com", {"http://", "xd.chatterino.com", "/#?foo"},
"https://127.0.0.1.com", {"", "chatterino.com", "#foo"},
{"", "1.com"},
{"", "127.0.0.1.com"},
{"https://", "127.0.0.1.com"},
// test case-insensitiveness
{"HtTpS://", "127.0.0.1.CoM"},
{"HTTP://", "XD.CHATTERINO.COM", "/#?FOO"},
{"HTTPS://", "wikI.chatterino.com"},
{"", "chatterino.Org", "#foo"},
{"", "CHATTERINO.com", ""},
}; };
for (const auto &input : inputs) for (const auto &c : cases)
{ {
LinkParser p(input); c.check();
ASSERT_TRUE(p.hasMatch()) << input.toStdString();
ASSERT_EQ(p.getCaptured(), input);
} }
} }
TEST(LinkParser, parseIpv4Links) TEST(LinkParser, parseIpv4Links)
{ {
const QStringList inputs = { const QList<Case> cases = {
"https://127.0.0.1", {"https://", "127.0.0.1"},
"http://127.0.0.1", {"http://", "127.0.0.1"},
"127.0.0.1", {"", "127.0.0.1"},
"127.0.0.1:8080", {"", "127.0.0.1", ":8080"},
"255.255.255.255", {"", "255.255.255.255"},
"0.0.0.0", {"", "0.0.0.0"},
"1.1.1.1", {"", "1.1.1.1"},
"001.001.01.1", {"", "001.001.01.1"},
"123.246.87.0", {"", "123.246.87.0"},
"196.168.0.1:", {"", "196.168.0.1", ":"},
"196.168.4.2/foo", {"", "196.168.4.2", "/foo"},
"196.168.4.2?foo", {"", "196.168.4.2", "?foo"},
"http://196.168.4.0#foo", {"http://", "196.168.4.0", "#foo"},
"196.168.4.0/?#foo", {"", "196.168.4.0", "/?#foo"},
"196.168.4.0#?/foo", {"", "196.168.4.0", "#?/foo"},
"256.255.255.255", {"", "256.255.255.255"},
"http://256.255.255.255", {"http://", "256.255.255.255"},
"255.256.255.255", {"", "255.256.255.255"},
"255.255.256.255", {"", "255.255.256.255"},
"255.255.255.256", {"", "255.255.255.256"},
// test case-insensitiveness
{"HTTP://", "196.168.4.0", "#Foo"},
{"HTTPS://", "196.168.4.0", "#Foo"},
{"htTp://", "127.0.0.1"},
{"httpS://", "127.0.0.1"},
}; };
for (const auto &input : inputs) for (const auto &c : cases)
{ {
LinkParser p(input); c.check();
ASSERT_TRUE(p.hasMatch()) << input.toStdString();
ASSERT_EQ(p.getCaptured(), input);
} }
} }
@ -80,12 +110,14 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
"1.2", "1.2",
"1", "1",
"1.2.3", "1.2.3",
"htt://256.255.255.255",
"aliens://256.255.255.255",
}; };
for (const auto &input : inputs) for (const auto &input : inputs)
{ {
LinkParser p(input); LinkParser p(input);
ASSERT_FALSE(p.hasMatch()) << input.toStdString(); ASSERT_FALSE(p.result().has_value()) << input.toStdString();
} }
} }
@ -99,6 +131,9 @@ TEST(LinkParser, doesntParseInvalidLinks)
"spotify://chatterino.com", "spotify://chatterino.com",
"httpsx://chatterino.com", "httpsx://chatterino.com",
"https:chatterino.com", "https:chatterino.com",
"https:/chatterino.com",
"http:/chatterino.com",
"htp://chatterino.com",
"/chatterino.com", "/chatterino.com",
"word", "word",
".", ".",
@ -114,6 +149,6 @@ TEST(LinkParser, doesntParseInvalidLinks)
for (const auto &input : inputs) for (const auto &input : inputs)
{ {
LinkParser p(input); LinkParser p(input);
ASSERT_FALSE(p.hasMatch()) << input.toStdString(); ASSERT_FALSE(p.result().has_value()) << input.toStdString();
} }
} }