From 9c9fa86c45d5b1bc3fac6f1e84d6cedfbe96abef Mon Sep 17 00:00:00 2001 From: nerix Date: Sun, 23 Apr 2023 16:56:39 +0200 Subject: [PATCH] Refactor `LinkParser` (#4576) --- CHANGELOG.md | 1 + src/common/LinkParser.cpp | 300 +++++++++++++++++++++----------------- tests/src/LinkParser.cpp | 12 +- 3 files changed, 171 insertions(+), 142 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6db61c51a..c52f56d61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ - Bugfix: Fixed a memory leak that occurred when loading message history. This was mostly noticeable with unstable internet connections where reconnections were frequent or long-running instances of Chatterino. (#4499) - Bugfix: Fixed Twitch channel-specific filters not being applied correctly. (#4529) - Bugfix: Fixed emote & badge tooltips not showing up when thumbnails were hidden. (#4509) +- Bugfix: Fixed links with invalid IPv4 addresses being parsed. (#4576) - Dev: Disabling precompiled headers on Windows is now tested in CI. (#4472) - Dev: Themes are now stored as JSON files in `resources/themes`. (#4471, #4533) - Dev: Ignore unhandled BTTV user-events. (#4438) diff --git a/src/common/LinkParser.cpp b/src/common/LinkParser.cpp index 5587189c8..81304b86d 100644 --- a/src/common/LinkParser.cpp +++ b/src/common/LinkParser.cpp @@ -1,206 +1,232 @@ +#define QT_NO_CAST_FROM_ASCII // avoids unexpected implicit casts #include "common/LinkParser.hpp" #include -#include -#include #include #include -#include +#include #include -namespace chatterino { namespace { - QSet &tlds() - { - static QSet tlds = [] { - QFile file(":/tlds.txt"); - file.open(QFile::ReadOnly); - QTextStream stream(&file); + +QSet &tlds() +{ + static QSet tlds = [] { + QFile file(QStringLiteral(":/tlds.txt")); + file.open(QFile::ReadOnly); + QTextStream stream(&file); #if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0) - // Default encoding of QTextStream is already UTF-8, at least in Qt6 + // Default encoding of QTextStream is already UTF-8, at least in Qt6 #else - stream.setCodec("UTF-8"); + stream.setCodec("UTF-8"); #endif - int safetyMax = 20000; + int safetyMax = 20000; - QSet set; + QSet set; - while (!stream.atEnd()) + while (!stream.atEnd()) + { + auto line = stream.readLine(); + set.insert(line); + + if (safetyMax-- == 0) { - auto line = stream.readLine(); - set.insert(line); - - if (safetyMax-- == 0) - break; + break; } + } - return set; - }(); - return tlds; - } + return set; + }(); + return tlds; +} - bool isValidHostname(QStringRef &host) +bool isValidTld(QStringView tld) +{ + return tlds().contains(tld.toString().toLower()); +} + +bool isValidIpv4(QStringView host) +{ + // We don't care about the actual value, + // we only want to verify the ip. + + char16_t sectionValue = 0; // 0..256 + uint8_t octetNumber = 0; // 0..4 + uint8_t sectionDigits = 0; // 0..3 + bool lastWasDot = true; + + for (auto c : host) { - int index = host.lastIndexOf('.'); + char16_t current = c.unicode(); + if (current == '.') + { + if (lastWasDot || octetNumber == 3) + { + return false; + } + lastWasDot = true; + octetNumber++; + sectionValue = 0; + sectionDigits = 0; + continue; + } + lastWasDot = false; - return index != -1 && - tlds().contains(host.mid(index + 1).toString().toLower()); + if (current > u'9' || current < u'0') + { + return false; + } + + sectionValue = sectionValue * 10 + (current - u'0'); + sectionDigits++; + if (sectionValue >= 256 || sectionDigits > 3) + { + return false; + } } - bool isValidIpv4(QStringRef &host) + return octetNumber == 3 && !lastWasDot; +} + +/** + * @brief Checks if the string starts with a port number. + * + * The value of the port number isn't checked. A port in this implementation + * can be in the range 0..100'000. + */ +bool startsWithPort(QStringView string) +{ + for (qsizetype i = 0; i < std::min(5, string.length()); i++) { - static auto exp = QRegularExpression("^\\d{1,3}(?:\\.\\d{1,3}){3}$"); + char16_t c = string[i].unicode(); + if (i >= 1 && (c == u'/' || c == u'?' || c == u'#')) + { + return true; + } - return exp.match(host).hasMatch(); + if (!string[i].isDigit()) + { + return false; + } } + return true; +} -#ifdef C_MATCH_IPV6_LINK - bool isValidIpv6(QStringRef &host) - { - static auto exp = QRegularExpression("^\\[[a-fA-F0-9:%]+\\]$"); - - return exp.match(host).hasMatch(); - } -#endif } // namespace +namespace chatterino { + LinkParser::LinkParser(const QString &unparsedString) { ParsedLink result; - // This is not implemented with a regex to increase performance. - // We keep removing parts of the url until there's either nothing left or we fail. - QStringRef l(&unparsedString); + QStringView remaining(unparsedString); + QStringView protocol(remaining); - bool hasHttp = false; +#if QT_VERSION < QT_VERSION_CHECK(5, 15, 0) + QStringView wholeString(unparsedString); + const auto refFromView = [&](QStringView view) { + return QStringRef(&unparsedString, + static_cast(view.begin() - wholeString.begin()), + static_cast(view.size())); + }; +#endif - // Protocol `https?://` - if (l.startsWith("https://", Qt::CaseInsensitive)) + // Check protocol for https?:// + if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) && + remaining.length() >= 4 + 3 + 1) // 'http' + '://' + [any] { - hasHttp = true; - result.protocol = l.mid(0, 8); - l = l.mid(8); - } - else if (l.startsWith("http://", Qt::CaseInsensitive)) - { - hasHttp = true; - result.protocol = l.mid(0, 7); - l = l.mid(7); + remaining = remaining.mid(4); // 'http' + + if (remaining[0] == QChar(u's') || remaining[0] == QChar(u'S')) + { + remaining = remaining.mid(1); + } + + if (remaining.startsWith(QStringLiteral("://"))) + { + remaining = remaining.mid(3); +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) + result.protocol = {protocol.begin(), remaining.begin()}; +#else + result.protocol = + refFromView({protocol.begin(), remaining.begin()}); +#endif + } } - // Http basic auth `user:password`. - // Not supported for security reasons (misleading links) + // Http basic auth `user:password` isn't supported for security reasons (misleading links) // Host `a.b.c.com` - QStringRef host = l; - ParsedLink::StringView rest; + QStringView host = remaining; + QStringView rest; bool lastWasDot = true; - bool inIpv6 = false; - bool hasMatch = false; + int lastDotPos = -1; + int nDots = 0; - for (int i = 0; i < l.size(); i++) + // Extract the host + for (int i = 0; i < remaining.size(); i++) { - if (l[i] == '.') + char16_t currentChar = remaining[i].unicode(); + if (currentChar == u'.') { - if (lastWasDot == true) // no double dots .. - goto error; + if (lastWasDot) // no double dots .. + { + return; + } + lastDotPos = i; lastWasDot = true; + nDots++; } else { lastWasDot = false; } - if (l[i] == ':' && !inIpv6) + // found a port + if (currentChar == u':') { - host = l.mid(0, i); - rest = l.mid(i); - l = l.mid(i + 1); - goto parsePort; - } - else if (l[i] == '/') - { - host = l.mid(0, i); - rest = l.mid(i); - l = l.mid(i + 1); - goto parsePath; - } - else if (l[i] == '?') - { - host = l.mid(0, i); - rest = l.mid(i); - l = l.mid(i + 1); - goto parseQuery; - } - else if (l[i] == '#') - { - host = l.mid(0, i); - rest = l.mid(i); - l = l.mid(i + 1); - goto parseAnchor; + host = remaining.mid(0, i); + rest = remaining.mid(i); + remaining = remaining.mid(i + 1); + + if (!startsWithPort(remaining)) + { + return; + } + + break; } - // ipv6 - if (l[i] == '[') + // we accept everything in the path/query/anchor + if (currentChar == u'/' || currentChar == u'?' || currentChar == u'#') { - if (i == 0) - inIpv6 = true; - else - goto error; - } - else if (l[i] == ']') - { - inIpv6 = false; + host = remaining.mid(0, i); + rest = remaining.mid(i); + break; } } - if (lastWasDot) - goto error; - else - goto done; - -parsePort: - // Port `:12345` - for (int i = 0; i < std::min(5, l.size()); i++) + if (lastWasDot || lastDotPos <= 0) { - if (l[i] == '/') - goto parsePath; - else if (l[i] == '?') - goto parseQuery; - else if (l[i] == '#') - goto parseAnchor; - - if (!l[i].isDigit()) - goto error; + return; } - goto done; - -parsePath: -parseQuery: -parseAnchor: - // we accept everything in the path/query/anchor - -done: - // check host - hasMatch = isValidHostname(host) || isValidIpv4(host) -#ifdef C_MATCH_IPV6_LINK - - || (hasHttp && isValidIpv6(host)) -#endif - ; - - if (hasMatch) + // check host/tld + if ((nDots == 3 && isValidIpv4(host)) || + isValidTld(host.mid(lastDotPos + 1))) { +#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0) result.host = host; result.rest = rest; +#else + result.host = refFromView(host); + result.rest = refFromView(rest); +#endif result.source = unparsedString; this->result_ = std::move(result); } - -error: - return; } const std::optional &LinkParser::result() const diff --git a/tests/src/LinkParser.cpp b/tests/src/LinkParser.cpp index aaf35e957..38dc32b48 100644 --- a/tests/src/LinkParser.cpp +++ b/tests/src/LinkParser.cpp @@ -80,11 +80,6 @@ TEST(LinkParser, parseIpv4Links) {"http://", "196.168.4.0", "#foo"}, {"", "196.168.4.0", "/?#foo"}, {"", "196.168.4.0", "#?/foo"}, - {"", "256.255.255.255"}, - {"http://", "256.255.255.255"}, - {"", "255.256.255.255"}, - {"", "255.255.256.255"}, - {"", "255.255.255.256"}, // test case-insensitiveness {"HTTP://", "196.168.4.0", "#Foo"}, {"HTTPS://", "196.168.4.0", "#Foo"}, @@ -102,6 +97,8 @@ TEST(LinkParser, parseIpv4Links) TEST(LinkParser, doesntParseInvalidIpv4Links) { const QStringList inputs = { + // U+0660 - in category "number digits" + QStringLiteral("٠.٠.٠.٠"), "https://127.0.0.", "http://127.0.01", "127.0.0000.1", @@ -112,6 +109,11 @@ TEST(LinkParser, doesntParseInvalidIpv4Links) "1.2.3", "htt://256.255.255.255", "aliens://256.255.255.255", + "256.255.255.255", + "http://256.255.255.255", + "255.256.255.255", + "255.255.256.255", + "255.255.255.256", }; for (const auto &input : inputs)