Refactor LinkParser (#4576)

This commit is contained in:
nerix 2023-04-23 16:56:39 +02:00 committed by GitHub
parent 4d0e4c1fca
commit 9c9fa86c45
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 171 additions and 142 deletions

View file

@ -26,6 +26,7 @@
- Bugfix: Fixed a memory leak that occurred when loading message history. This was mostly noticeable with unstable internet connections where reconnections were frequent or long-running instances of Chatterino. (#4499) - Bugfix: Fixed a memory leak that occurred when loading message history. This was mostly noticeable with unstable internet connections where reconnections were frequent or long-running instances of Chatterino. (#4499)
- Bugfix: Fixed Twitch channel-specific filters not being applied correctly. (#4529) - Bugfix: Fixed Twitch channel-specific filters not being applied correctly. (#4529)
- Bugfix: Fixed emote & badge tooltips not showing up when thumbnails were hidden. (#4509) - Bugfix: Fixed emote & badge tooltips not showing up when thumbnails were hidden. (#4509)
- Bugfix: Fixed links with invalid IPv4 addresses being parsed. (#4576)
- Dev: Disabling precompiled headers on Windows is now tested in CI. (#4472) - Dev: Disabling precompiled headers on Windows is now tested in CI. (#4472)
- Dev: Themes are now stored as JSON files in `resources/themes`. (#4471, #4533) - Dev: Themes are now stored as JSON files in `resources/themes`. (#4471, #4533)
- Dev: Ignore unhandled BTTV user-events. (#4438) - Dev: Ignore unhandled BTTV user-events. (#4438)

View file

@ -1,206 +1,232 @@
#define QT_NO_CAST_FROM_ASCII // avoids unexpected implicit casts
#include "common/LinkParser.hpp" #include "common/LinkParser.hpp"
#include <QFile> #include <QFile>
#include <QMap>
#include <QRegularExpression>
#include <QSet> #include <QSet>
#include <QString> #include <QString>
#include <QStringRef> #include <QStringView>
#include <QTextStream> #include <QTextStream>
namespace chatterino {
namespace { namespace {
/// Returns the set of known top-level domains, loaded once from the
/// `:/tlds.txt` resource (one entry per line) on first use.
QSet<QString> &tlds()
{
    static QSet<QString> knownTlds = [] {
        QFile resource(QStringLiteral(":/tlds.txt"));
        resource.open(QFile::ReadOnly);
        QTextStream reader(&resource);

#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
        // Qt 6 already defaults QTextStream to UTF-8; older versions need
        // the codec to be set explicitly.
        reader.setCodec("UTF-8");
#endif

        QSet<QString> entries;
        // The budget caps the number of lines read so that a broken
        // resource can't make this loop run unbounded.
        for (int budget = 20000; !reader.atEnd(); --budget)
        {
            entries.insert(reader.readLine());
            if (budget == 0)
            {
                break;
            }
        }

        return entries;
    }();

    return knownTlds;
}
bool isValidHostname(QStringRef &host) bool isValidTld(QStringView tld)
{
return tlds().contains(tld.toString().toLower());
}
/// Checks that `host` is a dotted-quad IPv4 address: exactly four groups of
/// one to three ASCII digits, each with a value below 256, separated by
/// single dots. The numeric value is only tracked for validation.
bool isValidIpv4(QStringView host)
{
    int dotCount = 0;         // dots seen so far, at most 3
    int digitsInGroup = 0;    // digits in the current group, at most 3
    int groupValue = 0;       // value of the current group, at most 255
    bool needsDigit = true;   // true at the start and right after a dot

    for (const QChar qc : host)
    {
        const char16_t ch = qc.unicode();

        if (ch == u'.')
        {
            // A dot may neither start the host, follow another dot,
            // nor introduce a fifth group.
            if (needsDigit || dotCount == 3)
            {
                return false;
            }
            ++dotCount;
            groupValue = 0;
            digitsInGroup = 0;
            needsDigit = true;
        }
        else if (ch >= u'0' && ch <= u'9')
        {
            needsDigit = false;
            groupValue = groupValue * 10 + (ch - u'0');
            ++digitsInGroup;
            if (groupValue > 255 || digitsInGroup > 3)
            {
                return false;
            }
        }
        else
        {
            // Anything other than '.' and ASCII digits is invalid.
            return false;
        }
    }

    return dotCount == 3 && !needsDigit;
}
/**
 * @brief Checks if the string starts with a port number.
 *
 * Only the first five characters are inspected, so the port's value isn't
 * checked and anything following five digits is not looked at.
 * The digits must be ASCII digits (`0`-`9`); other Unicode decimal digits
 * are rejected, matching the IPv4 check.
 * After at least one digit, a `/`, `?` or `#` ends the port.
 * An empty string is accepted.
 *
 * @param string The text directly following the `:` of a host.
 * @return true if the inspected prefix looks like a port.
 */
bool startsWithPort(QStringView string)
{
    const qsizetype limit = std::min<qsizetype>(5, string.length());
    for (qsizetype i = 0; i < limit; i++)
    {
        const char16_t c = string[i].unicode();

        // A delimiter is only allowed once at least one digit was seen.
        if (i >= 1 && (c == u'/' || c == u'?' || c == u'#'))
        {
            return true;
        }

        // Compare against the ASCII range instead of QChar::isDigit(),
        // which is also true for non-ASCII digits such as U+0660.
        if (c < u'0' || c > u'9')
        {
            return false;
        }
    }

    return true;
}
#ifdef C_MATCH_IPV6_LINK
// Only compiled when C_MATCH_IPV6_LINK is defined.
// Returns true if the whole of `host` is a '[' followed by one or more
// hex digits, ':' or '%' characters and a closing ']'.
// This is a purely syntactic check; the structure of the address itself
// is not validated.
bool isValidIpv6(QStringRef &host)
{
// Function-local static: the expression is constructed once and reused.
static auto exp = QRegularExpression("^\\[[a-fA-F0-9:%]+\\]$");
return exp.match(host).hasMatch();
}
#endif
} // namespace } // namespace
namespace chatterino {
/**
 * @brief Tries to parse `unparsedString` as a link.
 *
 * A link consists of an optional `http://` or `https://` protocol
 * (case-insensitive), a host that is either a valid IPv4 address or ends in
 * a known TLD, and an optional rest (port, path, query or anchor).
 * On success the result is stored in `result_`; otherwise `result_` is left
 * untouched.
 *
 * @param unparsedString The text to parse. The stored views refer to this
 *                       string.
 */
LinkParser::LinkParser(const QString &unparsedString)
{
    ParsedLink result;

    // This is not implemented with a regex to increase performance.
    QStringView remaining(unparsedString);
    QStringView protocol(remaining);
#if QT_VERSION < QT_VERSION_CHECK(5, 15, 0)
    QStringView wholeString(unparsedString);
    // Converts a view into `unparsedString` to a QStringRef on the same data.
    const auto refFromView = [&](QStringView view) {
        return QStringRef(&unparsedString,
                          static_cast<int>(view.begin() - wholeString.begin()),
                          static_cast<int>(view.size()));
    };
#endif

    // Check protocol for https?://
    if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) &&
        remaining.length() >= 4 + 3 + 1)  // 'http' + '://' + [any]
    {
        // Probe on a separate view and only consume the protocol once
        // '://' is confirmed. Otherwise hosts that merely start with
        // 'http' (e.g. `httpbin.org`) would lose their first characters.
        QStringView afterScheme = remaining.mid(4);  // 'http'

        if (afterScheme[0] == QChar(u's') || afterScheme[0] == QChar(u'S'))
        {
            afterScheme = afterScheme.mid(1);
        }

        if (afterScheme.startsWith(QStringLiteral("://")))
        {
            remaining = afterScheme.mid(3);
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
            result.protocol = {protocol.begin(), remaining.begin()};
#else
            result.protocol =
                refFromView({protocol.begin(), remaining.begin()});
#endif
        }
    }

    // Http basic auth `user:password` isn't supported for security reasons (misleading links)

    // Host `a.b.c.com`
    QStringView host = remaining;
    QStringView rest;
    bool lastWasDot = true;
    int lastDotPos = -1;
    int nDots = 0;

    // Extract the host
    for (int i = 0; i < remaining.size(); i++)
    {
        char16_t currentChar = remaining[i].unicode();
        if (currentChar == u'.')
        {
            if (lastWasDot)  // no double dots ..
            {
                return;
            }
            lastDotPos = i;
            lastWasDot = true;
            nDots++;
        }
        else
        {
            lastWasDot = false;
        }

        // found a port
        if (currentChar == u':')
        {
            host = remaining.mid(0, i);
            rest = remaining.mid(i);
            remaining = remaining.mid(i + 1);

            if (!startsWithPort(remaining))
            {
                return;
            }

            break;
        }

        // we accept everything in the path/query/anchor
        if (currentChar == u'/' || currentChar == u'?' || currentChar == u'#')
        {
            host = remaining.mid(0, i);
            rest = remaining.mid(i);
            break;
        }
    }

    // The host must contain a dot that is neither its first character nor
    // the last character that was scanned.
    if (lastWasDot || lastDotPos <= 0)
    {
        return;
    }

    // check host/tld
    if ((nDots == 3 && isValidIpv4(host)) ||
        isValidTld(host.mid(lastDotPos + 1)))
    {
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
        result.host = host;
        result.rest = rest;
#else
        result.host = refFromView(host);
        result.rest = refFromView(rest);
#endif
        result.source = unparsedString;
        this->result_ = std::move(result);
    }
}
const std::optional<ParsedLink> &LinkParser::result() const const std::optional<ParsedLink> &LinkParser::result() const

View file

@ -80,11 +80,6 @@ TEST(LinkParser, parseIpv4Links)
{"http://", "196.168.4.0", "#foo"}, {"http://", "196.168.4.0", "#foo"},
{"", "196.168.4.0", "/?#foo"}, {"", "196.168.4.0", "/?#foo"},
{"", "196.168.4.0", "#?/foo"}, {"", "196.168.4.0", "#?/foo"},
{"", "256.255.255.255"},
{"http://", "256.255.255.255"},
{"", "255.256.255.255"},
{"", "255.255.256.255"},
{"", "255.255.255.256"},
// test case-insensitiveness // test case-insensitiveness
{"HTTP://", "196.168.4.0", "#Foo"}, {"HTTP://", "196.168.4.0", "#Foo"},
{"HTTPS://", "196.168.4.0", "#Foo"}, {"HTTPS://", "196.168.4.0", "#Foo"},
@ -102,6 +97,8 @@ TEST(LinkParser, parseIpv4Links)
TEST(LinkParser, doesntParseInvalidIpv4Links) TEST(LinkParser, doesntParseInvalidIpv4Links)
{ {
const QStringList inputs = { const QStringList inputs = {
// U+0660 - in category "number digits"
QStringLiteral("٠.٠.٠.٠"),
"https://127.0.0.", "https://127.0.0.",
"http://127.0.01", "http://127.0.01",
"127.0.0000.1", "127.0.0000.1",
@ -112,6 +109,11 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
"1.2.3", "1.2.3",
"htt://256.255.255.255", "htt://256.255.255.255",
"aliens://256.255.255.255", "aliens://256.255.255.255",
"256.255.255.255",
"http://256.255.255.255",
"255.256.255.255",
"255.255.256.255",
"255.255.255.256",
}; };
for (const auto &input : inputs) for (const auto &input : inputs)