feat: strip prefixes and suffixes in links (#5486)

2024-11-21 22:24:07 +01:00 · 2024-07-14 11:17:42 +02:00 · 2024-07-14 11:17:42 +02:00 · b9f669d3a5
commit b9f669d3a5
parent 973b7a3bdd
10 changed files with 282 additions and 90 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -19,6 +19,7 @@
 - Minor: Improve appearance of reply button. (#5491)
 - Minor: Introduce HTTP API for plugins. (#5383, #5492, #5494)
 - Minor: Support more Firefox variants for incognito link opening. (#5503)
+- Minor: Links can now have prefixes and suffixes such as parentheses. (#5486)
 - Bugfix: Fixed tab move animation occasionally failing to start after closing a tab. (#5426)
 - Bugfix: If a network request errors with 200 OK, Qt's error code is now reported instead of the HTTP status. (#5378)
 - Bugfix: Fixed restricted users usernames not being clickable. (#5405)
--- a/benchmarks/src/LinkParser.cpp
+++ b/benchmarks/src/LinkParser.cpp
@ -5,8 +5,6 @@
 #include <QString>
 #include <QStringList>

-#include <optional>
-
 using namespace chatterino;

 const QString INPUT = QStringLiteral(
@ -14,8 +12,9 @@ const QString INPUT = QStringLiteral(
    "(or 2.4.2 if its out) "
    "https://github.com/Chatterino/chatterino2/releases/tag/nightly-build "
    "AlienPls https://www.youtube.com/watch?v=ELBBiBDcWc0 "
-    "127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. https://.be "
-    "https://a http://a.b https://a.be ftp://xdd.com "
+    "127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. "
+    "*https://.be "
+    "https://a: http://a.b (https://a.be) ftp://xdd.com "
    "this is a text lol . ://foo.com //aa.de :/foo.de xd.XDDDDDD ");

 static void BM_LinkParsing(benchmark::State &state)
@ -24,15 +23,15 @@ static void BM_LinkParsing(benchmark::State &state)

    // Make sure the TLDs are loaded
    {
-        benchmark::DoNotOptimize(LinkParser("xd.com").result());
+        benchmark::DoNotOptimize(linkparser::parse("xd.com"));
    }

    for (auto _ : state)
    {
-        for (auto word : words)
+        for (const auto &word : words)
        {
-            LinkParser parser(word);
-            benchmark::DoNotOptimize(parser.result());
+            auto parsed = linkparser::parse(word);
+            benchmark::DoNotOptimize(parsed);
        }
    }
 }
--- a/src/common/LinkParser.cpp
+++ b/src/common/LinkParser.cpp
@ -113,19 +113,58 @@ bool startsWithPort(QStringView string)
    return true;
 }

+/// @brief Strips ignored characters off @a source
+///
+/// As per https://github.github.com/gfm/#autolinks-extension-:
+///
+/// '<', '*', '_', '~', and '(' are ignored at the beginning
+/// '>', '?', '!', '.', ',', ':', '*', '~', and ')' are ignored at the end
+///
+/// A difference to GFM is that the source isn't scanned for parentheses and '_'
+/// isn't a valid suffix.
+void strip(QStringView &source)
+{
+    while (!source.isEmpty())
+    {
+        auto c = source.first();
+        if (c == u'<' || c == u'*' || c == u'_' || c == u'~' || c == u'(')
+        {
+            source = source.mid(1);
+            continue;
+        }
+        break;
+    }
+
+    while (!source.isEmpty())
+    {
+        auto c = source.last();
+        if (c == u'>' || c == u'?' || c == u'!' || c == u'.' || c == u',' ||
+            c == u':' || c == u'*' || c == u'~' || c == u')')
+        {
+            source.chop(1);
+            continue;
+        }
+        break;
+    }
+}
+
 }  // namespace

-namespace chatterino {
+namespace chatterino::linkparser {

-LinkParser::LinkParser(const QString &unparsedString)
+std::optional<Parsed> parse(const QString &source) noexcept
 {
-    ParsedLink result;
+    std::optional<Parsed> result;
    // This is not implemented with a regex to increase performance.
-    QStringView remaining(unparsedString);
-    QStringView protocol(remaining);
+
+    QStringView link{source};
+    strip(link);
+
+    QStringView remaining = link;
+    QStringView protocol;

    // Check protocol for https?://
-    if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) &&
+    if (remaining.startsWith(u"http", Qt::CaseInsensitive) &&
        remaining.length() >= 4 + 3 + 1)  // 'http' + '://' + [any]
    {
        // optimistic view assuming there's a protocol (http or https)
@ -136,11 +175,11 @@ LinkParser::LinkParser(const QString &unparsedString)
            withProto = withProto.mid(1);
        }

-        if (withProto.startsWith(QStringLiteral("://")))
+        if (withProto.startsWith(u"://"))
        {
            // there's really a protocol => consume it
            remaining = withProto.mid(3);
-            result.protocol = {protocol.begin(), remaining.begin()};
+            protocol = {link.begin(), remaining.begin()};
        }
    }

@ -161,7 +200,7 @@ LinkParser::LinkParser(const QString &unparsedString)
        {
            if (lastWasDot)  // no double dots ..
            {
-                return;
+                return result;
            }
            lastDotPos = i;
            lastWasDot = true;
@ -181,7 +220,7 @@ LinkParser::LinkParser(const QString &unparsedString)

            if (!startsWithPort(remaining))
            {
-                return;
+                return result;
            }

            break;
@ -198,23 +237,22 @@ LinkParser::LinkParser(const QString &unparsedString)

    if (lastWasDot || lastDotPos <= 0)
    {
-        return;
+        return result;
    }

    // check host/tld
    if ((nDots == 3 && isValidIpv4(host)) ||
        isValidTld(host.mid(lastDotPos + 1)))
    {
-        result.host = host;
-        result.rest = rest;
-        result.source = unparsedString;
-        this->result_ = std::move(result);
+        result = Parsed{
+            .protocol = protocol,
+            .host = host,
+            .rest = rest,
+            .link = link,
+        };
    }
+
+    return result;
 }

-const std::optional<ParsedLink> &LinkParser::result() const
-{
-    return this->result_;
-}
-
-}  // namespace chatterino
+}  // namespace chatterino::linkparser
--- a/src/common/LinkParser.hpp
+++ b/src/common/LinkParser.hpp
@ -4,43 +4,126 @@

 #include <optional>

-namespace chatterino {
+namespace chatterino::linkparser {

-struct ParsedLink {
+/// @brief Represents a parsed link
+///
+/// A parsed link is represented as views over the source string for its
+/// different segments. In this simplified model, a link consists of an optional
+/// @a protocol, a mandatory @a host and an optional @a rest. These segments are
+/// always next to eachother in the input string, however together, they don't
+/// span the whole input as it could contain prefixes or suffixes.
+///
+/// Prefixes and suffixes are almost identical to the ones in GitHub Flavored
+/// Markdown (GFM - https://github.github.com/gfm/#autolinks-extension-).
+/// The main differences are that '_' isn't a valid suffix and parentheses
+/// aren't counted (e.g. "(a.com/(foo)! would result in "a.com/(foo").
+/// Matching is done case insensitive (e.g. "HTTp://a.com" would be valid).
+///
+/// A @a protocol can either be empty, "http://", or "https://".
+/// A @a host can either be an IPv4 address or a hostname. The hostname must end
+/// in a valid top level domain. Otherwise, there are no restrictions on it.
+/// The @a rest can start with an optional port followed by either a '/', '?',
+/// or '#'.
+///
+/// @b Example
+///
+/// ```text
+/// (https://wiki.chatterino.com/Help/#overview)
+/// ▏▏proto ▕       host        ▏     rest     ▏▏
+/// ▏▏                link                     ▏▏
+/// ▏                source                     ▏
+/// ```
+struct Parsed {
    /// The parsed protocol of the link. Can be empty.
    ///
+    /// ```text
    /// https://www.forsen.tv/commands
-    /// ^------^
+    /// ▏╌╌╌╌╌╌▕
+    ///
+    /// www.forsen.tv/commands
+    /// (empty)
+    /// ```
    QStringView protocol;

    /// The parsed host of the link. Can not be empty.
    ///
+    /// ```text
    /// https://www.forsen.tv/commands
-    ///         ^-----------^
+    ///         ▏╌╌╌╌╌╌╌╌╌╌╌▕
+    /// ```
    QStringView host;

    /// The remainder of the link. Can be empty.
    ///
+    /// ```text
    /// https://www.forsen.tv/commands
-    ///                      ^-------^
+    ///                      ▏╌╌╌╌╌╌╌▕
+    ///
+    /// https://www.forsen.tv
+    /// (empty)
+    /// ```
    QStringView rest;

-    /// The original unparsed link.
+    /// The matched link. Can not be empty.
+    ///
+    /// ```text
+    /// (https://www.forsen.tv/commands)
+    ///  ▏╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌▕
+    /// ```
+    QStringView link;
+
+    /// Checks if the parsed link contains a prefix
+    bool hasPrefix(const QString &source) const noexcept
+    {
+        return this->link.begin() != source.begin();
+    }
+
+    /// The prefix before the parsed link inside @a source. May be empty.
+    ///
+    /// ```text
+    /// (https://www.forsen.tv/commands)
+    /// ^
    ///
    /// https://www.forsen.tv/commands
-    /// ^----------------------------^
-    QString source;
+    /// (empty)
+    /// ```
+    QStringView prefix(const QString &source) const noexcept
+    {
+        return {source.data(), this->link.begin()};
+    }
+
+    /// Checks if the parsed link contains a suffix
+    bool hasSuffix(const QString &source) const noexcept
+    {
+        return this->link.end() != source.end();
+    }
+
+    /// The suffix after the parsed link inside @a source. May be empty.
+    ///
+    /// ```text
+    /// (https://www.forsen.tv/commands)
+    ///                                ^
+    ///
+    /// https://www.forsen.tv/commands
+    /// (empty)
+    /// ```
+    QStringView suffix(const QString &source) const noexcept
+    {
+        return {
+            this->link.begin() + this->link.size(),
+            source.data() + source.length(),
+        };
+    }
 };

-class LinkParser
-{
-public:
-    explicit LinkParser(const QString &unparsedString);
+/// @brief Parses a link from @a source into its segments
+///
+/// If no link is contained in @a source, `std::nullopt` will be returned.
+/// The returned value is valid as long as @a source exists, as it contains
+/// views into @a source.
+///
+/// For the accepted links, see Parsed.
+std::optional<Parsed> parse(const QString &source) noexcept;

-    const std::optional<ParsedLink> &result() const;
-
-private:
-    std::optional<ParsedLink> result_{};
-};
-
-}  // namespace chatterino
+}  // namespace chatterino::linkparser
--- a/src/controllers/commands/builtin/twitch/SendWhisper.cpp
+++ b/src/controllers/commands/builtin/twitch/SendWhisper.cpp
@ -152,10 +152,10 @@ bool appendWhisperMessageWordsLocally(const QStringList &words)
                    void operator()(const QString &string,
                                    MessageBuilder &b) const
                    {
-                        LinkParser parser(string);
-                        if (parser.result())
+                        auto link = linkparser::parse(string);
+                        if (link)
                        {
-                            b.addLink(*parser.result());
+                            b.addLink(*link, string);
                        }
                        else
                        {
--- a/src/messages/MessageBuilder.cpp
+++ b/src/messages/MessageBuilder.cpp
@ -94,10 +94,10 @@ MessageBuilder::MessageBuilder(SystemMessageTag, const QString &text,
        text.split(QRegularExpression("\\s"), Qt::SkipEmptyParts);
    for (const auto &word : textFragments)
    {
-        LinkParser parser(word);
-        if (parser.result())
+        auto link = linkparser::parse(word);
+        if (link)
        {
-            this->addLink(*parser.result());
+            this->addLink(*link, word);
            continue;
        }

@ -637,30 +637,45 @@ std::unique_ptr<MessageElement> MessageBuilder::releaseBack()
    return ptr;
 }

-void MessageBuilder::addLink(const ParsedLink &parsedLink)
+void MessageBuilder::addLink(const linkparser::Parsed &parsedLink,
+                             const QString &source)
 {
    QString lowercaseLinkString;
-    QString origLink = parsedLink.source;
+    QString origLink = parsedLink.link.toString();
    QString fullUrl;

    if (parsedLink.protocol.isNull())
    {
-        fullUrl = QStringLiteral("http://") + parsedLink.source;
+        fullUrl = QStringLiteral("http://") + origLink;
    }
    else
    {
        lowercaseLinkString += parsedLink.protocol;
-        fullUrl = parsedLink.source;
+        fullUrl = origLink;
    }

    lowercaseLinkString += parsedLink.host.toString().toLower();
    lowercaseLinkString += parsedLink.rest;

    auto textColor = MessageColor(MessageColor::Link);
+
+    if (parsedLink.hasPrefix(source))
+    {
+        this->emplace<TextElement>(parsedLink.prefix(source).toString(),
+                                   MessageElementFlag::Text, this->textColor_)
+            ->setTrailingSpace(false);
+    }
    auto *el = this->emplace<LinkElement>(
        LinkElement::Parsed{.lowercase = lowercaseLinkString,
                            .original = origLink},
        fullUrl, MessageElementFlag::Text, textColor);
+    if (parsedLink.hasSuffix(source))
+    {
+        el->setTrailingSpace(false);
+        this->emplace<TextElement>(parsedLink.suffix(source).toString(),
+                                   MessageElementFlag::Text, this->textColor_);
+    }
+
    getIApp()->getLinkResolver()->resolve(el->linkInfo());
 }

@ -676,20 +691,18 @@ void MessageBuilder::addIrcMessageText(const QString &text)
    int fg = -1;
    int bg = -1;

-    for (const auto &word : words)
+    for (const auto &string : words)
    {
-        if (word.isEmpty())
+        if (string.isEmpty())
        {
            continue;
        }

-        auto string = QString(word);
-
        // Actually just text
-        LinkParser parser(string);
-        if (parser.result())
+        auto link = linkparser::parse(string);
+        if (link)
        {
-            this->addLink(*parser.result());
+            this->addLink(*link, string);
            continue;
        }

@ -772,15 +785,13 @@ void MessageBuilder::addTextOrEmoji(EmotePtr emote)
    this->emplace<EmoteElement>(emote, MessageElementFlag::EmojiAll);
 }

-void MessageBuilder::addTextOrEmoji(const QString &string_)
+void MessageBuilder::addTextOrEmoji(const QString &string)
 {
-    auto string = QString(string_);
-
    // Actually just text
-    LinkParser linkParser(string);
-    if (linkParser.result())
+    auto link = linkparser::parse(string);
+    if (link)
    {
-        this->addLink(*linkParser.result());
+        this->addLink(*link, string);
        return;
    }

--- a/src/messages/MessageBuilder.hpp
+++ b/src/messages/MessageBuilder.hpp
@ -24,7 +24,9 @@ class TextElement;
 struct Emote;
 using EmotePtr = std::shared_ptr<const Emote>;

-struct ParsedLink;
+namespace linkparser {
+    struct Parsed;
+}  // namespace linkparser

 struct SystemMessageTag {
 };
@ -112,7 +114,7 @@ public:
    std::weak_ptr<Message> weakOf();

    void append(std::unique_ptr<MessageElement> element);
-    void addLink(const ParsedLink &parsedLink);
+    void addLink(const linkparser::Parsed &parsedLink, const QString &source);

    /**
     * Adds the text, applies irc colors, adds links,
--- a/src/messages/search/LinkPredicate.cpp
+++ b/src/messages/search/LinkPredicate.cpp
@ -14,7 +14,7 @@ bool LinkPredicate::appliesToImpl(const Message &message)
 {
    for (const auto &word : message.messageText.split(' ', Qt::SkipEmptyParts))
    {
-        if (LinkParser(word).result())
+        if (linkparser::parse(word).has_value())
        {
            return true;
        }
--- a/src/providers/twitch/TwitchMessageBuilder.cpp
+++ b/src/providers/twitch/TwitchMessageBuilder.cpp
@ -733,12 +733,12 @@ void TwitchMessageBuilder::addTextOrEmoji(const QString &string_)
    }

    // Actually just text
-    LinkParser parsed(string);
+    auto link = linkparser::parse(string);
    auto textColor = this->textColor_;

-    if (parsed.result())
+    if (link)
    {
-        this->addLink(*parsed.result());
+        this->addLink(*link, string);
        return;
    }

--- a/tests/src/LinkParser.cpp
+++ b/tests/src/LinkParser.cpp
@ -8,21 +8,52 @@
 using namespace chatterino;

 struct Case {
+    // -Wmissing-field-initializers complains otherwise
+    // NOLINTBEGIN(readability-redundant-member-init)
    QString protocol{};
    QString host{};
    QString rest{};
+    // NOLINTEND(readability-redundant-member-init)

    void check() const
    {
-        auto input = this->protocol + this->host + this->rest;
-        LinkParser p(input);
-        ASSERT_TRUE(p.result().has_value()) << input;
+        QStringList prefixes{
+            "", "_", "__", "<", "<<", "<_<", "(((", "<*_~(", "**", "~~",
+        };
+        QStringList suffixes{
+            "",  ">", "?",  "!",   ".",      ",",  ":",
+            "*", "~", ">>", "?!.", "~~,*!?", "**",
+        };

-        const auto &r = *p.result();
-        ASSERT_EQ(r.source, input);
-        ASSERT_EQ(r.protocol, this->protocol) << this->protocol;
-        ASSERT_EQ(r.host, this->host) << this->host;
-        ASSERT_EQ(r.rest, this->rest) << this->rest;
+        for (const auto &prefix : prefixes)
+        {
+            for (const auto &suffix : suffixes)
+            {
+                checkSingle(prefix, suffix);
+            }
+        }
+    }
+
+    void checkSingle(const QString &prefix, const QString &suffix) const
+    {
+        auto link = this->protocol + this->host + this->rest;
+        auto input = prefix + link + suffix;
+        auto p = linkparser::parse(input);
+        ASSERT_TRUE(p.has_value()) << input;
+
+        if (!p)
+        {
+            return;
+        }
+
+        ASSERT_EQ(p->link, link);
+        ASSERT_EQ(p->protocol, this->protocol);
+        ASSERT_EQ(p->host, this->host);
+        ASSERT_EQ(p->rest, this->rest);
+        ASSERT_EQ(p->prefix(input), prefix);
+        ASSERT_EQ(p->suffix(input), suffix);
+        ASSERT_EQ(p->hasPrefix(input), !prefix.isEmpty());
+        ASSERT_EQ(p->hasSuffix(input), !suffix.isEmpty());
    }
 };

@ -56,6 +87,10 @@ TEST(LinkParser, parseDomainLinks)
        {"", "https.cat"},
        {"", "httpsd.cat"},
        {"", "http.cat", "/200"},
+        {"", "http.cat", "/200("},
+        {"", "a.com", "?("},
+        {"", "a.com", "#("},
+        {"", "a.com", "/__my_user__"},
        // test case-insensitiveness
        {"HtTpS://", "127.0.0.1.CoM"},
        {"HTTP://", "XD.CHATTERINO.COM", "/#?FOO"},
@ -82,7 +117,6 @@ TEST(LinkParser, parseIpv4Links)
        {"", "1.1.1.1"},
        {"", "001.001.01.1"},
        {"", "123.246.87.0"},
-        {"", "196.168.0.1", ":"},
        {"", "196.168.4.2", "/foo"},
        {"", "196.168.4.2", "?foo"},
        {"http://", "196.168.4.0", "#foo"},
@ -122,12 +156,24 @@ TEST(LinkParser, doesntParseInvalidIpv4Links)
        "255.256.255.255",
        "255.255.256.255",
        "255.255.255.256",
+        ":127.0.0.1",
+        ">1.2.3.4",
+        "?196.162.8.1",
+        "!196.162.8.1",
+        ".196.162.8.1",
+        ",196.162.8.1",
+        ":196.162.8.1",
+        "+196.162.8.1",
+        "196.162.8.1<",
+        "196.162.8.1(())",
+        "196.162.8.1(",
+        "196.162.8.1(!",
    };

    for (const auto &input : inputs)
    {
-        LinkParser p(input);
-        ASSERT_FALSE(p.result().has_value()) << input;
+        auto p = linkparser::parse(input);
+        ASSERT_FALSE(p.has_value()) << input;
    }
 }

@ -166,11 +212,23 @@ TEST(LinkParser, doesntParseInvalidLinks)
        "http:/cat.com",
        "http:/cat.com",
        "https:/cat.com",
+        "chatterino.com-",
+        "<<>>",
+        ">><<",
+        "a.com>><<",
+        "~~a.com()",
+        "https://chatterino.com><https://chatterino.com",
+        "<https://chatterino.com><https://chatterino.com>",
+        // invalid characters are still accepted (see #4769)
+        // "chatterino.com><chatterino.com",
+        // "https://chatterino.com><chatterino.com",
+        // "<chatterino.com><chatterino.com>",
+        // "<https://chatterino.com><chatterino.com>",
    };

    for (const auto &input : inputs)
    {
-        LinkParser p(input);
-        ASSERT_FALSE(p.result().has_value()) << input;
+        auto p = linkparser::parse(input);
+        ASSERT_FALSE(p.has_value()) << input;
    }
 }