Remove Redundant Parsing of Links (#4507)

Co-authored-by: pajlada <rasmus.karlsson@pajlada.com>
This commit is contained in:
nerix 2023-04-23 00:58:37 +02:00 committed by GitHub
parent f2938995c1
commit 95e7426283
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 168 additions and 147 deletions

View file

@ -35,6 +35,7 @@
- Dev: Add scripting capabilities with Lua (#4341, #4504)
- Dev: Conan 2.0 is now used instead of Conan 1.0. (#4417)
- Dev: Added tests and benchmarks for `LinkParser`. (#4436)
- Dev: Removed redundant parsing of links. (#4507)
- Dev: Experimental builds with Qt 6 are now provided. (#4522, #4551, #4553, #4554, #4555, #4556)
- Dev: Removed `CHATTERINO_TEST` definitions. (#4526)
- Dev: Builds for macOS now have `macos` in their name (previously: `osx`). (#4550)

View file

@ -24,7 +24,7 @@ static void BM_LinkParsing(benchmark::State &state)
// Make sure the TLDs are loaded
{
benchmark::DoNotOptimize(LinkParser("xd.com").getCaptured());
benchmark::DoNotOptimize(LinkParser("xd.com").result());
}
for (auto _ : state)
@ -32,10 +32,7 @@ static void BM_LinkParsing(benchmark::State &state)
for (auto word : words)
{
LinkParser parser(word);
if (parser.hasMatch())
{
benchmark::DoNotOptimize(parser.getCaptured());
}
benchmark::DoNotOptimize(parser.result());
}
}
}

View file

@ -67,7 +67,7 @@ namespace {
LinkParser::LinkParser(const QString &unparsedString)
{
this->match_ = unparsedString;
ParsedLink result;
// This is not implemented with a regex to increase performance.
// We keep removing parts of the url until there's either nothing left or we fail.
@ -79,11 +79,13 @@ LinkParser::LinkParser(const QString &unparsedString)
if (l.startsWith("https://", Qt::CaseInsensitive))
{
hasHttp = true;
result.protocol = l.mid(0, 8);
l = l.mid(8);
}
else if (l.startsWith("http://", Qt::CaseInsensitive))
{
hasHttp = true;
result.protocol = l.mid(0, 7);
l = l.mid(7);
}
@ -92,8 +94,10 @@ LinkParser::LinkParser(const QString &unparsedString)
// Host `a.b.c.com`
QStringRef host = l;
ParsedLink::StringView rest;
bool lastWasDot = true;
bool inIpv6 = false;
bool hasMatch = false;
for (int i = 0; i < l.size(); i++)
{
@ -111,24 +115,28 @@ LinkParser::LinkParser(const QString &unparsedString)
if (l[i] == ':' && !inIpv6)
{
host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1);
goto parsePort;
}
else if (l[i] == '/')
{
host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1);
goto parsePath;
}
else if (l[i] == '?')
{
host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1);
goto parseQuery;
}
else if (l[i] == '#')
{
host = l.mid(0, i);
rest = l.mid(i);
l = l.mid(i + 1);
goto parseAnchor;
}
@ -176,32 +184,28 @@ parseAnchor:
done:
// check host
this->hasMatch_ = isValidHostname(host) || isValidIpv4(host)
hasMatch = isValidHostname(host) || isValidIpv4(host)
#ifdef C_MATCH_IPV6_LINK
|| (hasHttp && isValidIpv6(host))
#endif
;
if (this->hasMatch_)
if (hasMatch)
{
this->match_ = unparsedString;
result.host = host;
result.rest = rest;
result.source = unparsedString;
this->result_ = std::move(result);
}
return;
error:
hasMatch_ = false;
return;
}
bool LinkParser::hasMatch() const
const std::optional<ParsedLink> &LinkParser::result() const
{
return this->hasMatch_;
}
QString LinkParser::getCaptured() const
{
return this->match_;
return this->result_;
}
} // namespace chatterino

View file

@ -2,19 +2,31 @@
#include <QString>
#include <optional>
namespace chatterino {
struct ParsedLink {
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
using StringView = QStringView;
#else
using StringView = QStringRef;
#endif
StringView protocol;
StringView host;
StringView rest;
QString source;
};
class LinkParser
{
public:
explicit LinkParser(const QString &unparsedString);
bool hasMatch() const;
QString getCaptured() const;
const std::optional<ParsedLink> &result() const;
private:
bool hasMatch_{false};
QString match_;
std::optional<ParsedLink> result_{};
};
} // namespace chatterino

View file

@ -2,6 +2,7 @@
#include "Application.hpp"
#include "common/Env.hpp"
#include "common/LinkParser.hpp"
#include "common/NetworkResult.hpp"
#include "common/QLogging.hpp"
#include "common/SignalVector.hpp"
@ -148,15 +149,15 @@ bool appendWhisperMessageWordsLocally(const QStringList &words)
void operator()(const QString &string,
MessageBuilder &b) const
{
auto linkString = b.matchLink(string);
if (linkString.isEmpty())
LinkParser parser(string);
if (parser.result())
{
b.emplace<TextElement>(string,
MessageElementFlag::Text);
b.addLink(*parser.result());
}
else
{
b.addLink(string, linkString);
b.emplace<TextElement>(string,
MessageElementFlag::Text);
}
}
} visitor;

View file

@ -240,10 +240,10 @@ MessageBuilder::MessageBuilder(SystemMessageTag, const QString &text,
text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts);
for (const auto &word : textFragments)
{
const auto linkString = this->matchLink(word);
if (!linkString.isEmpty())
LinkParser parser(word);
if (parser.result())
{
this->addLink(word, linkString);
this->addLink(*parser.result());
continue;
}
@ -707,52 +707,25 @@ std::unique_ptr<MessageElement> MessageBuilder::releaseBack()
return ptr;
}
QString MessageBuilder::matchLink(const QString &string)
void MessageBuilder::addLink(const ParsedLink &parsedLink)
{
LinkParser linkParser(string);
static QRegularExpression httpRegex(
"\\bhttps?://", QRegularExpression::CaseInsensitiveOption);
static QRegularExpression ftpRegex(
"\\bftps?://", QRegularExpression::CaseInsensitiveOption);
static QRegularExpression spotifyRegex(
"\\bspotify:", QRegularExpression::CaseInsensitiveOption);
if (!linkParser.hasMatch())
{
return QString();
}
QString captured = linkParser.getCaptured();
if (!captured.contains(httpRegex) && !captured.contains(ftpRegex) &&
!captured.contains(spotifyRegex))
{
captured.insert(0, "http://");
}
return captured;
}
void MessageBuilder::addLink(const QString &origLink,
const QString &matchedLink)
{
static QRegularExpression domainRegex(
R"(^(?:(?:ftp|http)s?:\/\/)?([^\/]+)(?:\/.*)?$)",
QRegularExpression::CaseInsensitiveOption);
QString lowercaseLinkString;
auto match = domainRegex.match(origLink);
if (match.isValid())
QString origLink = parsedLink.source;
QString matchedLink;
if (parsedLink.protocol.isNull())
{
lowercaseLinkString = origLink.mid(0, match.capturedStart(1)) +
match.captured(1).toLower() +
origLink.mid(match.capturedEnd(1));
matchedLink = QStringLiteral("http://") + parsedLink.source;
}
else
{
lowercaseLinkString = origLink;
lowercaseLinkString += parsedLink.protocol;
matchedLink = parsedLink.source;
}
lowercaseLinkString += parsedLink.host.toString().toLower();
lowercaseLinkString += parsedLink.rest;
auto linkElement = Link(Link::Url, matchedLink);
auto textColor = MessageColor(MessageColor::Link);
@ -816,12 +789,10 @@ void MessageBuilder::addIrcMessageText(const QString &text)
auto string = QString(word);
// Actually just text
auto linkString = this->matchLink(string);
auto link = Link();
if (!linkString.isEmpty())
LinkParser parser(string);
if (parser.result())
{
this->addLink(string, linkString);
this->addLink(*parser.result());
continue;
}
@ -909,28 +880,24 @@ void MessageBuilder::addTextOrEmoji(const QString &string_)
auto string = QString(string_);
// Actually just text
auto linkString = this->matchLink(string);
auto link = Link();
LinkParser linkParser(string);
if (linkParser.result())
{
this->addLink(*linkParser.result());
return;
}
auto &&textColor = this->textColor_;
if (linkString.isEmpty())
{
if (string.startsWith('@'))
{
this->emplace<TextElement>(string, MessageElementFlag::BoldUsername,
textColor, FontStyle::ChatMediumBold);
this->emplace<TextElement>(
string, MessageElementFlag::NonBoldUsername, textColor);
}
else
{
this->emplace<TextElement>(string, MessageElementFlag::Text,
this->emplace<TextElement>(string, MessageElementFlag::NonBoldUsername,
textColor);
}
}
else
{
this->addLink(string, linkString);
this->emplace<TextElement>(string, MessageElementFlag::Text, textColor);
}
}

View file

@ -23,6 +23,8 @@ class TextElement;
struct Emote;
using EmotePtr = std::shared_ptr<const Emote>;
struct ParsedLink;
struct SystemMessageTag {
};
struct TimeoutMessageTag {
@ -94,8 +96,7 @@ public:
std::weak_ptr<Message> weakOf();
void append(std::unique_ptr<MessageElement> element);
QString matchLink(const QString &string);
void addLink(const QString &origLink, const QString &matchedLink);
void addLink(const ParsedLink &parsedLink);
/**
* Adds the text, applies irc colors, adds links,

View file

@ -15,9 +15,11 @@ bool LinkPredicate::appliesToImpl(const Message &message)
{
for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts))
{
if (LinkParser(word).hasMatch())
if (LinkParser(word).result())
{
return true;
}
}
return false;
}

View file

@ -1,6 +1,7 @@
#include "providers/twitch/TwitchMessageBuilder.hpp"
#include "Application.hpp"
#include "common/LinkParser.hpp"
#include "common/QLogging.hpp"
#include "controllers/accounts/AccountController.hpp"
#include "controllers/ignores/IgnoreController.hpp"
@ -465,12 +466,12 @@ void TwitchMessageBuilder::addTextOrEmoji(const QString &string_)
}
// Actually just text
auto linkString = this->matchLink(string);
LinkParser parsed(string);
auto textColor = this->textColor_;
if (!linkString.isEmpty())
if (parsed.result())
{
this->addLink(string, linkString);
this->addLink(*parsed.result());
return;
}

View file

@ -6,66 +6,96 @@
using namespace chatterino;
struct Case {
QString protocol{};
QString host{};
QString rest{};
void check() const
{
auto input = this->protocol + this->host + this->rest;
LinkParser p(input);
ASSERT_TRUE(p.result().has_value()) << input.toStdString();
const auto &r = *p.result();
ASSERT_EQ(r.source, input);
ASSERT_EQ(r.protocol, this->protocol) << this->protocol.toStdString();
ASSERT_EQ(r.host, this->host) << this->host.toStdString();
ASSERT_EQ(r.rest, this->rest) << this->rest.toStdString();
}
};
TEST(LinkParser, parseDomainLinks)
{
const QStringList inputs = {
"https://chatterino.com",
"http://chatterino.com",
"chatterino.com",
"wiki.chatterino.com",
"https://wiki.chatterino.com",
"http://chatterino.co.uk",
"http://a.io",
"chatterino.com:80",
"wiki.chatterino.com:80",
"a.b.c.chatterino.com",
"https://a.b.c.chatterino.com/foo",
"http://chatterino.com?foo",
"http://xd.chatterino.com/#?foo",
"chatterino.com#foo",
"1.com",
"127.0.0.1.com",
"https://127.0.0.1.com",
const QList<Case> cases = {
{"https://", "chatterino.com"},
{"http://", "chatterino.com"},
{"", "chatterino.com"},
{"", "wiki.chatterino.com"},
{"https://", "wiki.chatterino.com"},
{"http://", "chatterino.co.uk"},
{"http://", "a.io"},
{"", "chatterino.com", ":80"},
{"", "wiki.chatterino.com", ":80"},
{"", "wiki.chatterino.com", ":80/foo/bar"},
{"", "wiki.chatterino.com", "/:80?foo/bar"},
{"", "wiki.chatterino.com", "/127.0.0.1"},
{"", "a.b.c.chatterino.com"},
{"https://", "a.b.c.chatterino.com", "/foo"},
{"http://", "chatterino.com", "?foo"},
{"http://", "xd.chatterino.com", "/#?foo"},
{"", "chatterino.com", "#foo"},
{"", "1.com"},
{"", "127.0.0.1.com"},
{"https://", "127.0.0.1.com"},
// test case-insensitiveness
{"HtTpS://", "127.0.0.1.CoM"},
{"HTTP://", "XD.CHATTERINO.COM", "/#?FOO"},
{"HTTPS://", "wikI.chatterino.com"},
{"", "chatterino.Org", "#foo"},
{"", "CHATTERINO.com", ""},
};
for (const auto &input : inputs)
for (const auto &c : cases)
{
LinkParser p(input);
ASSERT_TRUE(p.hasMatch()) << input.toStdString();
ASSERT_EQ(p.getCaptured(), input);
c.check();
}
}
TEST(LinkParser, parseIpv4Links)
{
const QStringList inputs = {
"https://127.0.0.1",
"http://127.0.0.1",
"127.0.0.1",
"127.0.0.1:8080",
"255.255.255.255",
"0.0.0.0",
"1.1.1.1",
"001.001.01.1",
"123.246.87.0",
"196.168.0.1:",
"196.168.4.2/foo",
"196.168.4.2?foo",
"http://196.168.4.0#foo",
"196.168.4.0/?#foo",
"196.168.4.0#?/foo",
"256.255.255.255",
"http://256.255.255.255",
"255.256.255.255",
"255.255.256.255",
"255.255.255.256",
const QList<Case> cases = {
{"https://", "127.0.0.1"},
{"http://", "127.0.0.1"},
{"", "127.0.0.1"},
{"", "127.0.0.1", ":8080"},
{"", "255.255.255.255"},
{"", "0.0.0.0"},
{"", "1.1.1.1"},
{"", "001.001.01.1"},
{"", "123.246.87.0"},
{"", "196.168.0.1", ":"},
{"", "196.168.4.2", "/foo"},
{"", "196.168.4.2", "?foo"},
{"http://", "196.168.4.0", "#foo"},
{"", "196.168.4.0", "/?#foo"},
{"", "196.168.4.0", "#?/foo"},
{"", "256.255.255.255"},
{"http://", "256.255.255.255"},
{"", "255.256.255.255"},
{"", "255.255.256.255"},
{"", "255.255.255.256"},
// test case-insensitiveness
{"HTTP://", "196.168.4.0", "#Foo"},
{"HTTPS://", "196.168.4.0", "#Foo"},
{"htTp://", "127.0.0.1"},
{"httpS://", "127.0.0.1"},
};
for (const auto &input : inputs)
for (const auto &c : cases)
{
LinkParser p(input);
ASSERT_TRUE(p.hasMatch()) << input.toStdString();
ASSERT_EQ(p.getCaptured(), input);
c.check();
}
}
@ -80,12 +110,14 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
"1.2",
"1",
"1.2.3",
"htt://256.255.255.255",
"aliens://256.255.255.255",
};
for (const auto &input : inputs)
{
LinkParser p(input);
ASSERT_FALSE(p.hasMatch()) << input.toStdString();
ASSERT_FALSE(p.result().has_value()) << input.toStdString();
}
}
@ -99,6 +131,9 @@ TEST(LinkParser, doesntParseInvalidLinks)
"spotify://chatterino.com",
"httpsx://chatterino.com",
"https:chatterino.com",
"https:/chatterino.com",
"http:/chatterino.com",
"htp://chatterino.com",
"/chatterino.com",
"word",
".",
@ -114,6 +149,6 @@ TEST(LinkParser, doesntParseInvalidLinks)
for (const auto &input : inputs)
{
LinkParser p(input);
ASSERT_FALSE(p.hasMatch()) << input.toStdString();
ASSERT_FALSE(p.result().has_value()) << input.toStdString();
}
}