Test for unicode ranges in alphanumeric characters

2024-11-21 22:24:07 +01:00 · 2023-09-03 10:32:39 -05:00 · 2023-09-03 10:32:39 -05:00 · d6873ad05d
commit d6873ad05d
parent 0b4d1f6f3e
2 changed files with 16 additions and 8 deletions
--- a/src/common/LinkParser.cpp
+++ b/src/common/LinkParser.cpp
@ -9,8 +9,6 @@

 namespace {

-const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
-
 QSet<QString> &tlds()
 {
    static QSet<QString> tlds = [] {
@ -115,7 +113,7 @@ bool startsWithPort(QStringView string)
    return true;
 }

-// For emoji ranges see: https://unicode.org/charts/
+// For unicode ranges see: https://unicode.org/charts/
 using UnicodeRange = std::pair<ushort, ushort>;
 std::vector<UnicodeRange> emojiRanges = {
    {U'\U00002700', U'\U000027BF' }, // Dingbats
@ -125,9 +123,15 @@ std::vector<UnicodeRange> emojiRanges = {
    {U'\U00001F90', U'\U0001F9FF' }, // Supplemental Symbols and Pictographs
 };

-bool isEmoji(const QChar& ch) {
+std::vector<UnicodeRange> alphaNumeric = {
+    { U'\u0041', U'\u005A' }, // Upper alphabet
+    { U'\u0061', U'\u007A' }, // Lower alphabet
+    { U'\u0030', U'\u0039' }, // Numbers
+};
+
+bool isInUnicodeRange(const QChar& ch, std::vector<UnicodeRange> ranges) {
    ushort unicodeValue = ch.unicode();
-    for (const auto& range : emojiRanges) {
+    for (const auto& range : ranges) {
        if (unicodeValue >= range.first && unicodeValue <= range.second) {
            return true;
        }
@ -135,14 +139,15 @@ bool isEmoji(const QChar& ch) {
    return false;
 }

-
 // Simple sanitization method to strip characters that are not recognized by RFC 3986
 QString sanitizeUrl(const QString &unparsedString)
 {
+    const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
+
    QString sanitizedUrl;
    for (const QChar &c : unparsedString)
    {
-        if (c.isLetterOrNumber() || isEmoji(c))
+        if (isInUnicodeRange(c, alphaNumeric) || isInUnicodeRange(c, emojiRanges))
        {
            sanitizedUrl.append(c);
            continue;
--- a/tests/src/LinkParser.cpp
+++ b/tests/src/LinkParser.cpp
@ -34,6 +34,7 @@ struct SanitizeCheck {
 TEST(LinkParser, parseDomainLinks)
 {
    const QList<SanitizeCheck> sanitizeCases = {
+        { "TW❘TCH.tv", "TW❘TCH.tv" "" }, // contains dingbat
        {"(twitch.tv/foo)", "twitch.tv", "/foo" },
        {"t🤪w🤪i🤪t🤪c🤪h🤪.tv/foo", "t🤪w🤪i🤪t🤪c🤪h🤪.tv", "/foo" },
        { "https://🏹.to/bar", "🏹.to", "/bar" },
@ -193,7 +194,9 @@ TEST(LinkParser, doesntParseInvalidLinks)
                                "https:/cat.com",
                                "%%%%.com",
                                "*.com",
-                                "t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo"};
+                                "t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo", // Invalid tld
+                                "https։⧸⧸TW❘TCH.tv/a⧸b" // misleading characters: "⧸" and "։"
+                                };

    for (const auto &input : inputs)
    {