mirror of
https://github.com/Chatterino/chatterino2.git
synced 2024-11-21 22:24:07 +01:00
Test for unicode ranges in alpha numeric characters
This commit is contained in:
parent
0b4d1f6f3e
commit
d6873ad05d
2 changed files with 16 additions and 8 deletions
|
@ -9,8 +9,6 @@
|
|||
|
||||
namespace {
|
||||
|
||||
const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
|
||||
|
||||
QSet<QString> &tlds()
|
||||
{
|
||||
static QSet<QString> tlds = [] {
|
||||
|
@ -115,7 +113,7 @@ bool startsWithPort(QStringView string)
|
|||
return true;
|
||||
}
|
||||
|
||||
// For emoji ranges see: https://unicode.org/charts/
|
||||
// For unicode ranges see: https://unicode.org/charts/
|
||||
// Inclusive [first, last] pair of code-point bounds. The range tables built
// from it are scanned by comparing a QChar's unicode() value against both ends.
// NOTE(review): ushort is presumably 16 bits wide, so bounds written above
// U+FFFF (e.g. U'\U0001F9FF' in emojiRanges) would be truncated when stored
// in this pair -- TODO confirm and widen the element type if so.
using UnicodeRange = std::pair<ushort, ushort>;
|
||||
std::vector<UnicodeRange> emojiRanges = {
|
||||
{U'\U00002700', U'\U000027BF' }, // Dingbats
|
||||
|
@ -125,9 +123,15 @@ std::vector<UnicodeRange> emojiRanges = {
|
|||
{U'\U00001F90', U'\U0001F9FF' }, // Supplemental Symbols and Pictographs
|
||||
};
|
||||
|
||||
bool isEmoji(const QChar& ch) {
|
||||
std::vector<UnicodeRange> alphaNumeric = {
|
||||
{ U'\u0041', U'\u005A' }, // Upper alphabet
|
||||
{ U'\u0061', U'\u007A' }, //Lower alphabet
|
||||
{ U'\u0030', U'\u0039' }, // Numbers
|
||||
};
|
||||
|
||||
bool isInUnicodeRange(const QChar& ch, std::vector<UnicodeRange> ranges) {
|
||||
ushort unicodeValue = ch.unicode();
|
||||
for (const auto& range : emojiRanges) {
|
||||
for (const auto& range : ranges) {
|
||||
if (unicodeValue >= range.first && unicodeValue <= range.second) {
|
||||
return true;
|
||||
}
|
||||
|
@ -135,14 +139,15 @@ bool isEmoji(const QChar& ch) {
|
|||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Simple sanitization method to strip characters that are not recognized by RFC 3986
|
||||
QString sanitizeUrl(const QString &unparsedString)
|
||||
{
|
||||
const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
|
||||
|
||||
QString sanitizedUrl;
|
||||
for (const QChar &c : unparsedString)
|
||||
{
|
||||
if (c.isLetterOrNumber() || isEmoji(c))
|
||||
if (isInUnicodeRange(c, alphaNumeric) || isInUnicodeRange(c, emojiRanges))
|
||||
{
|
||||
sanitizedUrl.append(c);
|
||||
continue;
|
||||
|
|
|
@ -34,6 +34,7 @@ struct SanitizeCheck {
|
|||
TEST(LinkParser, parseDomainLinks)
|
||||
{
|
||||
const QList<SanitizeCheck> sanitizeCases = {
|
||||
{ "TW❘TCH.tv", "TW❘TCH.tv" "" }, // contains dingbat
|
||||
{"(twitch.tv/foo)", "twitch.tv", "/foo" },
|
||||
{"t🤪w🤪i🤪t🤪c🤪h🤪.tv/foo", "t🤪w🤪i🤪t🤪c🤪h🤪.tv", "/foo" },
|
||||
{ "https://🏹.to/bar", "🏹.to", "/bar" },
|
||||
|
@ -193,7 +194,9 @@ TEST(LinkParser, doesntParseInvalidLinks)
|
|||
"https:/cat.com",
|
||||
"%%%%.com",
|
||||
"*.com",
|
||||
"t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo"};
|
||||
"t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo", // Invalid tld
|
||||
"https։⧸⧸TW❘TCH.tv/a⧸b" // misleading characters: "⧸" and "։"
|
||||
};
|
||||
|
||||
for (const auto &input : inputs)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue