mirror of
https://github.com/Chatterino/chatterino2.git
synced 2024-11-21 22:24:07 +01:00
Refactor LinkParser
(#4576)
This commit is contained in:
parent
4d0e4c1fca
commit
9c9fa86c45
3 changed files with 171 additions and 142 deletions
|
@ -26,6 +26,7 @@
|
||||||
- Bugfix: Fixed a memory leak that occurred when loading message history. This was mostly noticeable with unstable internet connections where reconnections were frequent or long-running instances of Chatterino. (#4499)
|
- Bugfix: Fixed a memory leak that occurred when loading message history. This was mostly noticeable with unstable internet connections where reconnections were frequent or long-running instances of Chatterino. (#4499)
|
||||||
- Bugfix: Fixed Twitch channel-specific filters not being applied correctly. (#4529)
|
- Bugfix: Fixed Twitch channel-specific filters not being applied correctly. (#4529)
|
||||||
- Bugfix: Fixed emote & badge tooltips not showing up when thumbnails were hidden. (#4509)
|
- Bugfix: Fixed emote & badge tooltips not showing up when thumbnails were hidden. (#4509)
|
||||||
|
- Bugfix: Fixed links with invalid IPv4 addresses being parsed. (#4576)
|
||||||
- Dev: Disabling precompiled headers on Windows is now tested in CI. (#4472)
|
- Dev: Disabling precompiled headers on Windows is now tested in CI. (#4472)
|
||||||
- Dev: Themes are now stored as JSON files in `resources/themes`. (#4471, #4533)
|
- Dev: Themes are now stored as JSON files in `resources/themes`. (#4471, #4533)
|
||||||
- Dev: Ignore unhandled BTTV user-events. (#4438)
|
- Dev: Ignore unhandled BTTV user-events. (#4438)
|
||||||
|
|
|
@ -1,206 +1,232 @@
|
||||||
|
#define QT_NO_CAST_FROM_ASCII // avoids unexpected implicit casts
|
||||||
#include "common/LinkParser.hpp"
|
#include "common/LinkParser.hpp"
|
||||||
|
|
||||||
#include <QFile>
|
#include <QFile>
|
||||||
#include <QMap>
|
|
||||||
#include <QRegularExpression>
|
|
||||||
#include <QSet>
|
#include <QSet>
|
||||||
#include <QString>
|
#include <QString>
|
||||||
#include <QStringRef>
|
#include <QStringView>
|
||||||
#include <QTextStream>
|
#include <QTextStream>
|
||||||
|
|
||||||
namespace chatterino {
|
|
||||||
namespace {
|
namespace {
|
||||||
QSet<QString> &tlds()
|
|
||||||
{
|
QSet<QString> &tlds()
|
||||||
static QSet<QString> tlds = [] {
|
{
|
||||||
QFile file(":/tlds.txt");
|
static QSet<QString> tlds = [] {
|
||||||
file.open(QFile::ReadOnly);
|
QFile file(QStringLiteral(":/tlds.txt"));
|
||||||
QTextStream stream(&file);
|
file.open(QFile::ReadOnly);
|
||||||
|
QTextStream stream(&file);
|
||||||
|
|
||||||
#if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0)
|
#if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0)
|
||||||
// Default encoding of QTextStream is already UTF-8, at least in Qt6
|
// Default encoding of QTextStream is already UTF-8, at least in Qt6
|
||||||
#else
|
#else
|
||||||
stream.setCodec("UTF-8");
|
stream.setCodec("UTF-8");
|
||||||
#endif
|
#endif
|
||||||
int safetyMax = 20000;
|
int safetyMax = 20000;
|
||||||
|
|
||||||
QSet<QString> set;
|
QSet<QString> set;
|
||||||
|
|
||||||
while (!stream.atEnd())
|
while (!stream.atEnd())
|
||||||
|
{
|
||||||
|
auto line = stream.readLine();
|
||||||
|
set.insert(line);
|
||||||
|
|
||||||
|
if (safetyMax-- == 0)
|
||||||
{
|
{
|
||||||
auto line = stream.readLine();
|
break;
|
||||||
set.insert(line);
|
|
||||||
|
|
||||||
if (safetyMax-- == 0)
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return set;
|
return set;
|
||||||
}();
|
}();
|
||||||
return tlds;
|
return tlds;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isValidHostname(QStringRef &host)
|
bool isValidTld(QStringView tld)
|
||||||
|
{
|
||||||
|
return tlds().contains(tld.toString().toLower());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isValidIpv4(QStringView host)
|
||||||
|
{
|
||||||
|
// We don't care about the actual value,
|
||||||
|
// we only want to verify the ip.
|
||||||
|
|
||||||
|
char16_t sectionValue = 0; // 0..256
|
||||||
|
uint8_t octetNumber = 0; // 0..4
|
||||||
|
uint8_t sectionDigits = 0; // 0..3
|
||||||
|
bool lastWasDot = true;
|
||||||
|
|
||||||
|
for (auto c : host)
|
||||||
{
|
{
|
||||||
int index = host.lastIndexOf('.');
|
char16_t current = c.unicode();
|
||||||
|
if (current == '.')
|
||||||
|
{
|
||||||
|
if (lastWasDot || octetNumber == 3)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
lastWasDot = true;
|
||||||
|
octetNumber++;
|
||||||
|
sectionValue = 0;
|
||||||
|
sectionDigits = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
lastWasDot = false;
|
||||||
|
|
||||||
return index != -1 &&
|
if (current > u'9' || current < u'0')
|
||||||
tlds().contains(host.mid(index + 1).toString().toLower());
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
sectionValue = sectionValue * 10 + (current - u'0');
|
||||||
|
sectionDigits++;
|
||||||
|
if (sectionValue >= 256 || sectionDigits > 3)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isValidIpv4(QStringRef &host)
|
return octetNumber == 3 && !lastWasDot;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Checks if the string starts with a port number.
|
||||||
|
*
|
||||||
|
* The value of the port number isn't checked. A port in this implementation
|
||||||
|
* can be in the range 0..100'000.
|
||||||
|
*/
|
||||||
|
bool startsWithPort(QStringView string)
|
||||||
|
{
|
||||||
|
for (qsizetype i = 0; i < std::min<qsizetype>(5, string.length()); i++)
|
||||||
{
|
{
|
||||||
static auto exp = QRegularExpression("^\\d{1,3}(?:\\.\\d{1,3}){3}$");
|
char16_t c = string[i].unicode();
|
||||||
|
if (i >= 1 && (c == u'/' || c == u'?' || c == u'#'))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return exp.match(host).hasMatch();
|
if (!string[i].isDigit())
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef C_MATCH_IPV6_LINK
|
|
||||||
bool isValidIpv6(QStringRef &host)
|
|
||||||
{
|
|
||||||
static auto exp = QRegularExpression("^\\[[a-fA-F0-9:%]+\\]$");
|
|
||||||
|
|
||||||
return exp.match(host).hasMatch();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
namespace chatterino {
|
||||||
|
|
||||||
LinkParser::LinkParser(const QString &unparsedString)
|
LinkParser::LinkParser(const QString &unparsedString)
|
||||||
{
|
{
|
||||||
ParsedLink result;
|
ParsedLink result;
|
||||||
|
|
||||||
// This is not implemented with a regex to increase performance.
|
// This is not implemented with a regex to increase performance.
|
||||||
// We keep removing parts of the url until there's either nothing left or we fail.
|
QStringView remaining(unparsedString);
|
||||||
QStringRef l(&unparsedString);
|
QStringView protocol(remaining);
|
||||||
|
|
||||||
bool hasHttp = false;
|
#if QT_VERSION < QT_VERSION_CHECK(5, 15, 0)
|
||||||
|
QStringView wholeString(unparsedString);
|
||||||
|
const auto refFromView = [&](QStringView view) {
|
||||||
|
return QStringRef(&unparsedString,
|
||||||
|
static_cast<int>(view.begin() - wholeString.begin()),
|
||||||
|
static_cast<int>(view.size()));
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
// Protocol `https?://`
|
// Check protocol for https?://
|
||||||
if (l.startsWith("https://", Qt::CaseInsensitive))
|
if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) &&
|
||||||
|
remaining.length() >= 4 + 3 + 1) // 'http' + '://' + [any]
|
||||||
{
|
{
|
||||||
hasHttp = true;
|
remaining = remaining.mid(4); // 'http'
|
||||||
result.protocol = l.mid(0, 8);
|
|
||||||
l = l.mid(8);
|
if (remaining[0] == QChar(u's') || remaining[0] == QChar(u'S'))
|
||||||
}
|
{
|
||||||
else if (l.startsWith("http://", Qt::CaseInsensitive))
|
remaining = remaining.mid(1);
|
||||||
{
|
}
|
||||||
hasHttp = true;
|
|
||||||
result.protocol = l.mid(0, 7);
|
if (remaining.startsWith(QStringLiteral("://")))
|
||||||
l = l.mid(7);
|
{
|
||||||
|
remaining = remaining.mid(3);
|
||||||
|
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
|
||||||
|
result.protocol = {protocol.begin(), remaining.begin()};
|
||||||
|
#else
|
||||||
|
result.protocol =
|
||||||
|
refFromView({protocol.begin(), remaining.begin()});
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Http basic auth `user:password`.
|
// Http basic auth `user:password` isn't supported for security reasons (misleading links)
|
||||||
// Not supported for security reasons (misleading links)
|
|
||||||
|
|
||||||
// Host `a.b.c.com`
|
// Host `a.b.c.com`
|
||||||
QStringRef host = l;
|
QStringView host = remaining;
|
||||||
ParsedLink::StringView rest;
|
QStringView rest;
|
||||||
bool lastWasDot = true;
|
bool lastWasDot = true;
|
||||||
bool inIpv6 = false;
|
int lastDotPos = -1;
|
||||||
bool hasMatch = false;
|
int nDots = 0;
|
||||||
|
|
||||||
for (int i = 0; i < l.size(); i++)
|
// Extract the host
|
||||||
|
for (int i = 0; i < remaining.size(); i++)
|
||||||
{
|
{
|
||||||
if (l[i] == '.')
|
char16_t currentChar = remaining[i].unicode();
|
||||||
|
if (currentChar == u'.')
|
||||||
{
|
{
|
||||||
if (lastWasDot == true) // no double dots ..
|
if (lastWasDot) // no double dots ..
|
||||||
goto error;
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
lastDotPos = i;
|
||||||
lastWasDot = true;
|
lastWasDot = true;
|
||||||
|
nDots++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
lastWasDot = false;
|
lastWasDot = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (l[i] == ':' && !inIpv6)
|
// found a port
|
||||||
|
if (currentChar == u':')
|
||||||
{
|
{
|
||||||
host = l.mid(0, i);
|
host = remaining.mid(0, i);
|
||||||
rest = l.mid(i);
|
rest = remaining.mid(i);
|
||||||
l = l.mid(i + 1);
|
remaining = remaining.mid(i + 1);
|
||||||
goto parsePort;
|
|
||||||
}
|
if (!startsWithPort(remaining))
|
||||||
else if (l[i] == '/')
|
{
|
||||||
{
|
return;
|
||||||
host = l.mid(0, i);
|
}
|
||||||
rest = l.mid(i);
|
|
||||||
l = l.mid(i + 1);
|
break;
|
||||||
goto parsePath;
|
|
||||||
}
|
|
||||||
else if (l[i] == '?')
|
|
||||||
{
|
|
||||||
host = l.mid(0, i);
|
|
||||||
rest = l.mid(i);
|
|
||||||
l = l.mid(i + 1);
|
|
||||||
goto parseQuery;
|
|
||||||
}
|
|
||||||
else if (l[i] == '#')
|
|
||||||
{
|
|
||||||
host = l.mid(0, i);
|
|
||||||
rest = l.mid(i);
|
|
||||||
l = l.mid(i + 1);
|
|
||||||
goto parseAnchor;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ipv6
|
// we accept everything in the path/query/anchor
|
||||||
if (l[i] == '[')
|
if (currentChar == u'/' || currentChar == u'?' || currentChar == u'#')
|
||||||
{
|
{
|
||||||
if (i == 0)
|
host = remaining.mid(0, i);
|
||||||
inIpv6 = true;
|
rest = remaining.mid(i);
|
||||||
else
|
break;
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
else if (l[i] == ']')
|
|
||||||
{
|
|
||||||
inIpv6 = false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lastWasDot)
|
if (lastWasDot || lastDotPos <= 0)
|
||||||
goto error;
|
|
||||||
else
|
|
||||||
goto done;
|
|
||||||
|
|
||||||
parsePort:
|
|
||||||
// Port `:12345`
|
|
||||||
for (int i = 0; i < std::min<int>(5, l.size()); i++)
|
|
||||||
{
|
{
|
||||||
if (l[i] == '/')
|
return;
|
||||||
goto parsePath;
|
|
||||||
else if (l[i] == '?')
|
|
||||||
goto parseQuery;
|
|
||||||
else if (l[i] == '#')
|
|
||||||
goto parseAnchor;
|
|
||||||
|
|
||||||
if (!l[i].isDigit())
|
|
||||||
goto error;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
goto done;
|
// check host/tld
|
||||||
|
if ((nDots == 3 && isValidIpv4(host)) ||
|
||||||
parsePath:
|
isValidTld(host.mid(lastDotPos + 1)))
|
||||||
parseQuery:
|
|
||||||
parseAnchor:
|
|
||||||
// we accept everything in the path/query/anchor
|
|
||||||
|
|
||||||
done:
|
|
||||||
// check host
|
|
||||||
hasMatch = isValidHostname(host) || isValidIpv4(host)
|
|
||||||
#ifdef C_MATCH_IPV6_LINK
|
|
||||||
|
|
||||||
|| (hasHttp && isValidIpv6(host))
|
|
||||||
#endif
|
|
||||||
;
|
|
||||||
|
|
||||||
if (hasMatch)
|
|
||||||
{
|
{
|
||||||
|
#if QT_VERSION >= QT_VERSION_CHECK(5, 15, 0)
|
||||||
result.host = host;
|
result.host = host;
|
||||||
result.rest = rest;
|
result.rest = rest;
|
||||||
|
#else
|
||||||
|
result.host = refFromView(host);
|
||||||
|
result.rest = refFromView(rest);
|
||||||
|
#endif
|
||||||
result.source = unparsedString;
|
result.source = unparsedString;
|
||||||
this->result_ = std::move(result);
|
this->result_ = std::move(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
error:
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::optional<ParsedLink> &LinkParser::result() const
|
const std::optional<ParsedLink> &LinkParser::result() const
|
||||||
|
|
|
@ -80,11 +80,6 @@ TEST(LinkParser, parseIpv4Links)
|
||||||
{"http://", "196.168.4.0", "#foo"},
|
{"http://", "196.168.4.0", "#foo"},
|
||||||
{"", "196.168.4.0", "/?#foo"},
|
{"", "196.168.4.0", "/?#foo"},
|
||||||
{"", "196.168.4.0", "#?/foo"},
|
{"", "196.168.4.0", "#?/foo"},
|
||||||
{"", "256.255.255.255"},
|
|
||||||
{"http://", "256.255.255.255"},
|
|
||||||
{"", "255.256.255.255"},
|
|
||||||
{"", "255.255.256.255"},
|
|
||||||
{"", "255.255.255.256"},
|
|
||||||
// test case-insensitiveness
|
// test case-insensitiveness
|
||||||
{"HTTP://", "196.168.4.0", "#Foo"},
|
{"HTTP://", "196.168.4.0", "#Foo"},
|
||||||
{"HTTPS://", "196.168.4.0", "#Foo"},
|
{"HTTPS://", "196.168.4.0", "#Foo"},
|
||||||
|
@ -102,6 +97,8 @@ TEST(LinkParser, parseIpv4Links)
|
||||||
TEST(LinkParser, doesntParseInvalidIpv4Links)
|
TEST(LinkParser, doesntParseInvalidIpv4Links)
|
||||||
{
|
{
|
||||||
const QStringList inputs = {
|
const QStringList inputs = {
|
||||||
|
// U+0660 - in category "number digits"
|
||||||
|
QStringLiteral("٠.٠.٠.٠"),
|
||||||
"https://127.0.0.",
|
"https://127.0.0.",
|
||||||
"http://127.0.01",
|
"http://127.0.01",
|
||||||
"127.0.0000.1",
|
"127.0.0000.1",
|
||||||
|
@ -112,6 +109,11 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
|
||||||
"1.2.3",
|
"1.2.3",
|
||||||
"htt://256.255.255.255",
|
"htt://256.255.255.255",
|
||||||
"aliens://256.255.255.255",
|
"aliens://256.255.255.255",
|
||||||
|
"256.255.255.255",
|
||||||
|
"http://256.255.255.255",
|
||||||
|
"255.256.255.255",
|
||||||
|
"255.255.256.255",
|
||||||
|
"255.255.255.256",
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const auto &input : inputs)
|
for (const auto &input : inputs)
|
||||||
|
|
Loading…
Reference in a new issue