Test for unicode ranges in alpha numeric characters

This commit is contained in:
Colton Clemmer 2023-09-03 10:32:39 -05:00
parent 0b4d1f6f3e
commit d6873ad05d
2 changed files with 16 additions and 8 deletions

View file

@ -9,8 +9,6 @@
namespace {
const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
QSet<QString> &tlds()
{
static QSet<QString> tlds = [] {
@ -115,7 +113,7 @@ bool startsWithPort(QStringView string)
return true;
}
// For emoji ranges see: https://unicode.org/charts/
// For unicode ranges see: https://unicode.org/charts/
// Inclusive [first, last] pair of code values; range checks in this file compare
// QChar::unicode() against both bounds with >= and <=.
// NOTE(review): ushort is presumably 16 bits wide, so bounds written above U+FFFF
// (e.g. U'\U0001F9FF') would not survive the conversion intact — confirm that this
// is intended.
using UnicodeRange = std::pair<ushort, ushort>;
std::vector<UnicodeRange> emojiRanges = {
{U'\U00002700', U'\U000027BF' }, // Dingbats
@ -125,9 +123,15 @@ std::vector<UnicodeRange> emojiRanges = {
{U'\U00001F90', U'\U0001F9FF' }, // Supplemental Symbols and Pictographs
};
bool isEmoji(const QChar& ch) {
// Inclusive code-value ranges for the ASCII letters and digits.
std::vector<UnicodeRange> alphaNumeric{
{U'A', U'Z'},  // U+0041..U+005A: uppercase Latin letters
{U'a', U'z'},  // U+0061..U+007A: lowercase Latin letters
{U'0', U'9'},  // U+0030..U+0039: decimal digits
};
bool isInUnicodeRange(const QChar& ch, std::vector<UnicodeRange> ranges) {
ushort unicodeValue = ch.unicode();
for (const auto& range : emojiRanges) {
for (const auto& range : ranges) {
if (unicodeValue >= range.first && unicodeValue <= range.second) {
return true;
}
@ -135,14 +139,15 @@ bool isEmoji(const QChar& ch) {
return false;
}
// Simple sanitization method to strip characters that are not recognized by RFC 3986
QString sanitizeUrl(const QString &unparsedString)
{
const QString urlAllowedSpecialCharacters = QStringLiteral("!#&+/:=?@-_.");
QString sanitizedUrl;
for (const QChar &c : unparsedString)
{
if (c.isLetterOrNumber() || isEmoji(c))
if (isInUnicodeRange(c, alphaNumeric) || isInUnicodeRange(c, emojiRanges))
{
sanitizedUrl.append(c);
continue;

View file

@ -34,6 +34,7 @@ struct SanitizeCheck {
TEST(LinkParser, parseDomainLinks)
{
const QList<SanitizeCheck> sanitizeCases = {
{ "TW❘TCH.tv", "TW❘TCH.tv" "" }, // contains dingbat
{"(twitch.tv/foo)", "twitch.tv", "/foo" },
{"t🤪w🤪i🤪t🤪c🤪h🤪.tv/foo", "t🤪w🤪i🤪t🤪c🤪h🤪.tv", "/foo" },
{ "https://🏹.to/bar", "🏹.to", "/bar" },
@ -193,7 +194,9 @@ TEST(LinkParser, doesntParseInvalidLinks)
"https:/cat.com",
"%%%%.com",
"*.com",
"t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo"};
"t🤪w🤪i🤪t🤪c🤪h🤪.🤪t🤪v/foo", // Invalid tld
"https։TW❘TCH.tv/ab" // misleading characters: "❘" and "։"
};
for (const auto &input : inputs)
{