feat: strip prefixes and suffixes in links (#5486)

This commit is contained in:
nerix 2024-07-14 11:17:42 +02:00 committed by GitHub
parent 973b7a3bdd
commit b9f669d3a5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 282 additions and 90 deletions

View file

@ -19,6 +19,7 @@
- Minor: Improve appearance of reply button. (#5491)
- Minor: Introduce HTTP API for plugins. (#5383, #5492, #5494)
- Minor: Support more Firefox variants for incognito link opening. (#5503)
- Minor: Links can now have prefixes and suffixes such as parentheses. (#5486)
- Bugfix: Fixed tab move animation occasionally failing to start after closing a tab. (#5426)
- Bugfix: If a network request errors with 200 OK, Qt's error code is now reported instead of the HTTP status. (#5378)
- Bugfix: Fixed restricted users usernames not being clickable. (#5405)

View file

@ -5,8 +5,6 @@
#include <QString>
#include <QStringList>
#include <optional>
using namespace chatterino;
const QString INPUT = QStringLiteral(
@ -14,8 +12,9 @@ const QString INPUT = QStringLiteral(
"(or 2.4.2 if its out) "
"https://github.com/Chatterino/chatterino2/releases/tag/nightly-build "
"AlienPls https://www.youtube.com/watch?v=ELBBiBDcWc0 "
"127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. https://.be "
"https://a http://a.b https://a.be ftp://xdd.com "
"127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. "
"*https://.be "
"https://a: http://a.b (https://a.be) ftp://xdd.com "
"this is a text lol . ://foo.com //aa.de :/foo.de xd.XDDDDDD ");
static void BM_LinkParsing(benchmark::State &state)
@ -24,15 +23,15 @@ static void BM_LinkParsing(benchmark::State &state)
// Make sure the TLDs are loaded
{
benchmark::DoNotOptimize(LinkParser("xd.com").result());
benchmark::DoNotOptimize(linkparser::parse("xd.com"));
}
for (auto _ : state)
{
for (auto word : words)
for (const auto &word : words)
{
LinkParser parser(word);
benchmark::DoNotOptimize(parser.result());
auto parsed = linkparser::parse(word);
benchmark::DoNotOptimize(parsed);
}
}
}

View file

@ -113,19 +113,58 @@ bool startsWithPort(QStringView string)
return true;
}
/// @brief Strips ignored characters off @a source
///
/// As per https://github.github.com/gfm/#autolinks-extension-:
///
/// '<', '*', '_', '~', and '(' are ignored at the beginning
/// '>', '?', '!', '.', ',', ':', '*', '~', and ')' are ignored at the end
///
/// A difference to GFM is that the source isn't scanned for parentheses and '_'
/// isn't a valid suffix.
void strip(QStringView &source)
{
while (!source.isEmpty())
{
auto c = source.first();
if (c == u'<' || c == u'*' || c == u'_' || c == u'~' || c == u'(')
{
source = source.mid(1);
continue;
}
break;
}
while (!source.isEmpty())
{
auto c = source.last();
if (c == u'>' || c == u'?' || c == u'!' || c == u'.' || c == u',' ||
c == u':' || c == u'*' || c == u'~' || c == u')')
{
source.chop(1);
continue;
}
break;
}
}
} // namespace
namespace chatterino {
namespace chatterino::linkparser {
LinkParser::LinkParser(const QString &unparsedString)
std::optional<Parsed> parse(const QString &source) noexcept
{
ParsedLink result;
std::optional<Parsed> result;
// This is not implemented with a regex to increase performance.
QStringView remaining(unparsedString);
QStringView protocol(remaining);
QStringView link{source};
strip(link);
QStringView remaining = link;
QStringView protocol;
// Check protocol for https?://
if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) &&
if (remaining.startsWith(u"http", Qt::CaseInsensitive) &&
remaining.length() >= 4 + 3 + 1) // 'http' + '://' + [any]
{
// optimistic view assuming there's a protocol (http or https)
@ -136,11 +175,11 @@ LinkParser::LinkParser(const QString &unparsedString)
withProto = withProto.mid(1);
}
if (withProto.startsWith(QStringLiteral("://")))
if (withProto.startsWith(u"://"))
{
// there's really a protocol => consume it
remaining = withProto.mid(3);
result.protocol = {protocol.begin(), remaining.begin()};
protocol = {link.begin(), remaining.begin()};
}
}
@ -161,7 +200,7 @@ LinkParser::LinkParser(const QString &unparsedString)
{
if (lastWasDot) // no double dots ..
{
return;
return result;
}
lastDotPos = i;
lastWasDot = true;
@ -181,7 +220,7 @@ LinkParser::LinkParser(const QString &unparsedString)
if (!startsWithPort(remaining))
{
return;
return result;
}
break;
@ -198,23 +237,22 @@ LinkParser::LinkParser(const QString &unparsedString)
if (lastWasDot || lastDotPos <= 0)
{
return;
return result;
}
// check host/tld
if ((nDots == 3 && isValidIpv4(host)) ||
isValidTld(host.mid(lastDotPos + 1)))
{
result.host = host;
result.rest = rest;
result.source = unparsedString;
this->result_ = std::move(result);
result = Parsed{
.protocol = protocol,
.host = host,
.rest = rest,
.link = link,
};
}
return result;
}
const std::optional<ParsedLink> &LinkParser::result() const
{
return this->result_;
}
} // namespace chatterino
} // namespace chatterino::linkparser

View file

@ -4,43 +4,126 @@
#include <optional>
namespace chatterino {
namespace chatterino::linkparser {
struct ParsedLink {
/// @brief Represents a parsed link
///
/// A parsed link is represented as views over the source string for its
/// different segments. In this simplified model, a link consists of an optional
/// @a protocol, a mandatory @a host and an optional @a rest. These segments are
/// always next to eachother in the input string, however together, they don't
/// span the whole input as it could contain prefixes or suffixes.
///
/// Prefixes and suffixes are almost identical to the ones in GitHub Flavored
/// Markdown (GFM - https://github.github.com/gfm/#autolinks-extension-).
/// The main differences are that '_' isn't a valid suffix and parentheses
/// aren't counted (e.g. "(a.com/(foo)! would result in "a.com/(foo").
/// Matching is done case insensitive (e.g. "HTTp://a.com" would be valid).
///
/// A @a protocol can either be empty, "http://", or "https://".
/// A @a host can either be an IPv4 address or a hostname. The hostname must end
/// in a valid top level domain. Otherwise, there are no restrictions on it.
/// The @a rest can start with an optional port followed by either a '/', '?',
/// or '#'.
///
/// @b Example
///
/// ```text
/// (https://wiki.chatterino.com/Help/#overview)
/// ▏▏proto ▕ host ▏ rest ▏▏
/// ▏▏ link ▏▏
/// ▏ source ▏
/// ```
struct Parsed {
/// The parsed protocol of the link. Can be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^------^
/// ▏╌╌╌╌╌╌▕
///
/// www.forsen.tv/commands
/// (empty)
/// ```
QStringView protocol;
/// The parsed host of the link. Can not be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^-----------^
/// ▏╌╌╌╌╌╌╌╌╌╌╌▕
/// ```
QStringView host;
/// The remainder of the link. Can be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^-------^
/// ▏╌╌╌╌╌╌╌▕
///
/// https://www.forsen.tv
/// (empty)
/// ```
QStringView rest;
/// The original unparsed link.
/// The matched link. Can not be empty.
///
/// ```text
/// (https://www.forsen.tv/commands)
/// ▏╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌▕
/// ```
QStringView link;
/// Checks if the parsed link contains a prefix
bool hasPrefix(const QString &source) const noexcept
{
return this->link.begin() != source.begin();
}
/// The prefix before the parsed link inside @a source. May be empty.
///
/// ```text
/// (https://www.forsen.tv/commands)
/// ^
///
/// https://www.forsen.tv/commands
/// ^----------------------------^
QString source;
/// (empty)
/// ```
QStringView prefix(const QString &source) const noexcept
{
return {source.data(), this->link.begin()};
}
/// Checks if the parsed link contains a suffix
bool hasSuffix(const QString &source) const noexcept
{
return this->link.end() != source.end();
}
/// The suffix after the parsed link inside @a source. May be empty.
///
/// ```text
/// (https://www.forsen.tv/commands)
/// ^
///
/// https://www.forsen.tv/commands
/// (empty)
/// ```
QStringView suffix(const QString &source) const noexcept
{
return {
this->link.begin() + this->link.size(),
source.data() + source.length(),
};
}
};
class LinkParser
{
public:
explicit LinkParser(const QString &unparsedString);
/// @brief Parses a link from @a source into its segments
///
/// If no link is contained in @a source, `std::nullopt` will be returned.
/// The returned value is valid as long as @a source exists, as it contains
/// views into @a source.
///
/// For the accepted links, see Parsed.
std::optional<Parsed> parse(const QString &source) noexcept;
const std::optional<ParsedLink> &result() const;
private:
std::optional<ParsedLink> result_{};
};
} // namespace chatterino
} // namespace chatterino::linkparser

View file

@ -152,10 +152,10 @@ bool appendWhisperMessageWordsLocally(const QStringList &words)
void operator()(const QString &string,
MessageBuilder &b) const
{
LinkParser parser(string);
if (parser.result())
auto link = linkparser::parse(string);
if (link)
{
b.addLink(*parser.result());
b.addLink(*link, string);
}
else
{

View file

@ -94,10 +94,10 @@ MessageBuilder::MessageBuilder(SystemMessageTag, const QString &text,
text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts);
for (const auto &word : textFragments)
{
LinkParser parser(word);
if (parser.result())
auto link = linkparser::parse(word);
if (link)
{
this->addLink(*parser.result());
this->addLink(*link, word);
continue;
}
@ -637,30 +637,45 @@ std::unique_ptr<MessageElement> MessageBuilder::releaseBack()
return ptr;
}
void MessageBuilder::addLink(const ParsedLink &parsedLink)
void MessageBuilder::addLink(const linkparser::Parsed &parsedLink,
const QString &source)
{
QString lowercaseLinkString;
QString origLink = parsedLink.source;
QString origLink = parsedLink.link.toString();
QString fullUrl;
if (parsedLink.protocol.isNull())
{
fullUrl = QStringLiteral("http://") + parsedLink.source;
fullUrl = QStringLiteral("http://") + origLink;
}
else
{
lowercaseLinkString += parsedLink.protocol;
fullUrl = parsedLink.source;
fullUrl = origLink;
}
lowercaseLinkString += parsedLink.host.toString().toLower();
lowercaseLinkString += parsedLink.rest;
auto textColor = MessageColor(MessageColor::Link);
if (parsedLink.hasPrefix(source))
{
this->emplace<TextElement>(parsedLink.prefix(source).toString(),
MessageElementFlag::Text, this->textColor_)
->setTrailingSpace(false);
}
auto *el = this->emplace<LinkElement>(
LinkElement::Parsed{.lowercase = lowercaseLinkString,
.original = origLink},
fullUrl, MessageElementFlag::Text, textColor);
if (parsedLink.hasSuffix(source))
{
el->setTrailingSpace(false);
this->emplace<TextElement>(parsedLink.suffix(source).toString(),
MessageElementFlag::Text, this->textColor_);
}
getIApp()->getLinkResolver()->resolve(el->linkInfo());
}
@ -676,20 +691,18 @@ void MessageBuilder::addIrcMessageText(const QString &text)
int fg = -1;
int bg = -1;
for (const auto &word : words)
for (const auto &string : words)
{
if (word.isEmpty())
if (string.isEmpty())
{
continue;
}
auto string = QString(word);
// Actually just text
LinkParser parser(string);
if (parser.result())
auto link = linkparser::parse(string);
if (link)
{
this->addLink(*parser.result());
this->addLink(*link, string);
continue;
}
@ -772,15 +785,13 @@ void MessageBuilder::addTextOrEmoji(EmotePtr emote)
this->emplace<EmoteElement>(emote, MessageElementFlag::EmojiAll);
}
void MessageBuilder::addTextOrEmoji(const QString &string_)
void MessageBuilder::addTextOrEmoji(const QString &string)
{
auto string = QString(string_);
// Actually just text
LinkParser linkParser(string);
if (linkParser.result())
auto link = linkparser::parse(string);
if (link)
{
this->addLink(*linkParser.result());
this->addLink(*link, string);
return;
}

View file

@ -24,7 +24,9 @@ class TextElement;
struct Emote;
using EmotePtr = std::shared_ptr<const Emote>;
struct ParsedLink;
namespace linkparser {
struct Parsed;
} // namespace linkparser
struct SystemMessageTag {
};
@ -112,7 +114,7 @@ public:
std::weak_ptr<Message> weakOf();
void append(std::unique_ptr<MessageElement> element);
void addLink(const ParsedLink &parsedLink);
void addLink(const linkparser::Parsed &parsedLink, const QString &source);
/**
* Adds the text, applies irc colors, adds links,

View file

@ -14,7 +14,7 @@ bool LinkPredicate::appliesToImpl(const Message &message)
{
for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts))
{
if (LinkParser(word).result())
if (linkparser::parse(word).has_value())
{
return true;
}

View file

@ -733,12 +733,12 @@ void TwitchMessageBuilder::addTextOrEmoji(const QString &string_)
}
// Actually just text
LinkParser parsed(string);
auto link = linkparser::parse(string);
auto textColor = this->textColor_;
if (parsed.result())
if (link)
{
this->addLink(*parsed.result());
this->addLink(*link, string);
return;
}

View file

@ -8,21 +8,52 @@
using namespace chatterino;
struct Case {
// -Wmissing-field-initializers complains otherwise
// NOLINTBEGIN(readability-redundant-member-init)
QString protocol{};
QString host{};
QString rest{};
// NOLINTEND(readability-redundant-member-init)
void check() const
{
auto input = this->protocol + this->host + this->rest;
LinkParser p(input);
ASSERT_TRUE(p.result().has_value()) << input;
QStringList prefixes{
"", "_", "__", "<", "<<", "<_<", "(((", "<*_~(", "**", "~~",
};
QStringList suffixes{
"", ">", "?", "!", ".", ",", ":",
"*", "~", ">>", "?!.", "~~,*!?", "**",
};
const auto &r = *p.result();
ASSERT_EQ(r.source, input);
ASSERT_EQ(r.protocol, this->protocol) << this->protocol;
ASSERT_EQ(r.host, this->host) << this->host;
ASSERT_EQ(r.rest, this->rest) << this->rest;
for (const auto &prefix : prefixes)
{
for (const auto &suffix : suffixes)
{
checkSingle(prefix, suffix);
}
}
}
void checkSingle(const QString &prefix, const QString &suffix) const
{
auto link = this->protocol + this->host + this->rest;
auto input = prefix + link + suffix;
auto p = linkparser::parse(input);
ASSERT_TRUE(p.has_value()) << input;
if (!p)
{
return;
}
ASSERT_EQ(p->link, link);
ASSERT_EQ(p->protocol, this->protocol);
ASSERT_EQ(p->host, this->host);
ASSERT_EQ(p->rest, this->rest);
ASSERT_EQ(p->prefix(input), prefix);
ASSERT_EQ(p->suffix(input), suffix);
ASSERT_EQ(p->hasPrefix(input), !prefix.isEmpty());
ASSERT_EQ(p->hasSuffix(input), !suffix.isEmpty());
}
};
@ -56,6 +87,10 @@ TEST(LinkParser, parseDomainLinks)
{"", "https.cat"},
{"", "httpsd.cat"},
{"", "http.cat", "/200"},
{"", "http.cat", "/200("},
{"", "a.com", "?("},
{"", "a.com", "#("},
{"", "a.com", "/__my_user__"},
// test case-insensitiveness
{"HtTpS://", "127.0.0.1.CoM"},
{"HTTP://", "XD.CHATTERINO.COM", "/#?FOO"},
@ -82,7 +117,6 @@ TEST(LinkParser, parseIpv4Links)
{"", "1.1.1.1"},
{"", "001.001.01.1"},
{"", "123.246.87.0"},
{"", "196.168.0.1", ":"},
{"", "196.168.4.2", "/foo"},
{"", "196.168.4.2", "?foo"},
{"http://", "196.168.4.0", "#foo"},
@ -122,12 +156,24 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
"255.256.255.255",
"255.255.256.255",
"255.255.255.256",
":127.0.0.1",
">1.2.3.4",
"?196.162.8.1",
"!196.162.8.1",
".196.162.8.1",
",196.162.8.1",
":196.162.8.1",
"+196.162.8.1",
"196.162.8.1<",
"196.162.8.1(())",
"196.162.8.1(",
"196.162.8.1(!",
};
for (const auto &input : inputs)
{
LinkParser p(input);
ASSERT_FALSE(p.result().has_value()) << input;
auto p = linkparser::parse(input);
ASSERT_FALSE(p.has_value()) << input;
}
}
@ -166,11 +212,23 @@ TEST(LinkParser, doesntParseInvalidLinks)
"http:/cat.com",
"http:/cat.com",
"https:/cat.com",
"chatterino.com-",
"<<>>",
">><<",
"a.com>><<",
"~~a.com()",
"https://chatterino.com><https://chatterino.com",
"<https://chatterino.com><https://chatterino.com>",
// invalid characters are still accepted (see #4769)
// "chatterino.com><chatterino.com",
// "https://chatterino.com><chatterino.com",
// "<chatterino.com><chatterino.com>",
// "<https://chatterino.com><chatterino.com>",
};
for (const auto &input : inputs)
{
LinkParser p(input);
ASSERT_FALSE(p.result().has_value()) << input;
auto p = linkparser::parse(input);
ASSERT_FALSE(p.has_value()) << input;
}
}