fix: count parentheses in links (#5515)

This commit is contained in:
nerix 2024-07-21 16:00:12 +02:00 committed by GitHub
parent 5deec1f02f
commit 847e4f0932
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 60 additions and 11 deletions

View file

@ -22,7 +22,7 @@
- Minor: Introduce HTTP API for plugins. (#5383, #5492, #5494)
- Minor: Support more Firefox variants for incognito link opening. (#5503)
- Minor: Replying to a message will now display the message being replied to. (#4350, #5519)
- Minor: Links can now have prefixes and suffixes such as parentheses. (#5486)
- Minor: Links can now have prefixes and suffixes such as parentheses. (#5486, #5515)
- Bugfix: Fixed tab move animation occasionally failing to start after closing a tab. (#5426)
- Bugfix: If a network request errors with 200 OK, Qt's error code is now reported instead of the HTTP status. (#5378)
- Bugfix: Fixed restricted users usernames not being clickable. (#5405)

View file

@ -118,10 +118,12 @@ bool startsWithPort(QStringView string)
/// As per https://github.github.com/gfm/#autolinks-extension-:
///
/// '<', '*', '_', '~', and '(' are ignored at the beginning
/// '>', '?', '!', '.', ',', ':', '*', '~', and ')' are ignored at the end
/// '>', '?', '!', '.', ',', ':', '*', '~', and ')' are ignored at the end.
///
/// A difference to GFM is that the source isn't scanned for parentheses and '_'
/// isn't a valid suffix.
/// A difference to GFM is that '_' isn't a valid suffix.
///
/// This might remove more than desired (e.g. "(a.com/(foo))" -> "a.com/(foo").
/// Parentheses are counted after recognizing a valid IP/host.
void strip(QStringView &source)
{
while (!source.isEmpty())
@ -259,6 +261,43 @@ std::optional<Parsed> parse(const QString &source) noexcept
if ((nDots == 3 && isValidIpv4(host)) ||
isValidTld(host.mid(lastDotPos + 1)))
{
// scan for parentheses (only if there were characters excluded)
if (link.end() != source.end() && !rest.empty())
{
size_t nestingLevel = 0;
// position after the last closing brace (i.e. the minimum characters to include)
const auto *lastClose = link.end();
// scan source from rest until the end:
// lastClose
// v
// (example.com/foo/bar/#baz_(qox)),
// ▏╌╌rest (before)╌ ▏
// ▏╌╌╌╌╌╌╌link (before)╌╌╌╌╌╌╌ ▏
// ▏╌╌rest (after)╌╌╌ ▏
// ▏╌╌╌╌╌╌╌link (after)╌╌╌╌╌╌╌╌╌ ▏
// ▏╌╌╌╌╌╌╌╌╌╌╌╌╌source╌╌╌╌╌╌╌╌╌╌╌╌ ▏
// ▏╌╌╌╌╌╌╌search╌╌╌╌╌╌ ▏
for (const auto *it = rest.begin(); it < source.end(); it++)
{
if (it->unicode() == u'(')
{
nestingLevel++;
continue;
}
if (nestingLevel != 0 && it->unicode() == u')')
{
nestingLevel--;
if (nestingLevel == 0)
{
lastClose = it + 1;
}
}
}
link = QStringView{link.begin(), std::max(link.end(), lastClose)};
rest = QStringView{rest.begin(), std::max(rest.end(), lastClose)};
}
result = Parsed{
.protocol = protocol,
.host = host,

View file

@ -16,8 +16,9 @@ namespace chatterino::linkparser {
///
/// Prefixes and suffixes are almost identical to the ones in GitHub Flavored
/// Markdown (GFM - https://github.github.com/gfm/#autolinks-extension-).
/// The main differences are that '_' isn't a valid suffix and parentheses
/// aren't counted (e.g. "(a.com/(foo)! would result in "a.com/(foo").
/// The main difference is that '_' isn't a valid suffix.
/// Parentheses are counted inside the @a rest: parsing "(a.com/(foo))" would
/// result in the link "a.com/(foo)".
/// Matching is done case insensitive (e.g. "HTTp://a.com" would be valid).
///
/// A @a protocol can either be empty, "http://", or "https://".

View file

@ -23,8 +23,8 @@ struct Case {
"", "_", "__", "<", "<<", "<_<", "(((", "<*_~(", "**", "~~",
};
QStringList suffixes{
"", ">", "?", "!", ".", ",", ":",
"*", "~", ">>", "?!.", "~~,*!?", "**",
"", ">", "?", "!", ".", ",", ":", "*", "~",
">>", "?!.", "~~,*!?", "**", ").", "),", ",)", ")),.", ")?",
};
for (const auto &prefix : prefixes)
@ -89,14 +89,21 @@ TEST(LinkParser, parseDomainLinks)
{"", "https.cat"},
{"", "httpsd.cat"},
{"", "http.cat", "/200"},
{"", "http.cat", "/200("},
{"", "a.com", "?("},
{"", "a.com", "#("},
{"", "http.cat", "/200()"},
{"", "a.com", "?()"},
{"", "a.com", "#()"},
{"", "a.com", "/__my_user__"},
{"", "a.b.c.-._.1.com", ""},
{"", "0123456789.com", ""},
{"", "ABCDEFGHIJKLMNOPQRSTUVWXYZ.com", ""},
{"", "abcdefghijklmnopqrstuvwxyz.com", ""},
{"", "example.com", "/foo(bar)"},
{"", "example.com", "/foo((bar))"},
{"", "example.com", "/(f)(o)(o)(b)(a)r"},
{"", "example.com", "/foobar()()"},
{"", "example.com", "/foobar()(())baz"},
{"", "example.com", "/(foo)"},
{"", "example.com", "/()"},
// non-ASCII characters are allowed
{"", u"köln.de"_s, ""},
{"", u"ü.com"_s, ""},
@ -246,6 +253,8 @@ TEST(LinkParser, doesntParseInvalidLinks)
"@@@.com",
"%%%.com",
"*.com",
"example.com(foo)",
"example.com()",
};
for (const auto &input : inputs)