Initial rework of matchLink

This commit is contained in:
LajamerrMittesdine 2018-06-27 13:45:54 -04:00 committed by fourtf
parent 6a93aa412e
commit 7fe8ff97e0
4 changed files with 1747 additions and 3 deletions

View file

@ -66,6 +66,7 @@
<file>images/buttons/unban.png</file> <file>images/buttons/unban.png</file>
<file>images/buttons/unmod.png</file> <file>images/buttons/unmod.png</file>
<file>images/emote_dark.svg</file> <file>images/emote_dark.svg</file>
<file>tlds.txt</file>
</qresource> </qresource>
<qresource prefix="/qt/etc"> <qresource prefix="/qt/etc">
<file>qt.conf</file> <file>qt.conf</file>

1693
resources/tlds.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -43,9 +43,44 @@ void MessageBuilder::appendTimestamp(const QTime &time)
QString MessageBuilder::matchLink(const QString &string) QString MessageBuilder::matchLink(const QString &string)
{ {
static QRegularExpression linkRegex("[[:ascii:]]*\\.[a-zA-Z]+\\/?[[:ascii:]]*"); QFile tldFile(":/tlds.txt");
tldFile.open(QFile::ReadOnly);
QTextStream t1(&tldFile);
t1.setCodec("UTF-8");
QString tldData = t1.readAll();
tldData.replace("\n", "|");
const QString urlRegExp = "^"
// protocol identifier
"(?:(?:https?|ftps?)://)?"
// user:pass authentication
"(?:\\S+(?::\\S*)?@)?"
"(?:"
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broadcast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])"
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))"
"|"
// host name
"(?:(?:[_a-z\\x{00a1}-\\x{ffff}0-9]-*)*[a-z\\x{00a1}-\\x{ffff}0-9]+)"
// domain name
"(?:\\.(?:[a-z\\x{00a1}-\\x{ffff}0-9]-*)*[a-z\\x{00a1}-\\x{ffff}0-9]+)*"
// TLD identifier
//"(?:\\.(?:[a-z\\x{00a1}-\\x{ffff}]{2,}))"
"(?:[\\.](?:" + tldData + "))"
"\\.?"
")"
// port number
"(?::\\d{2,5})?"
// resource path
"(?:[/?#]\\S*)?"
"$";
static QRegularExpression linkRegex(urlRegExp, QRegularExpression::CaseInsensitiveOption);
static QRegularExpression httpRegex("\\bhttps?://"); static QRegularExpression httpRegex("\\bhttps?://");
static QRegularExpression ftpRegex("\\bftps?://");
auto match = linkRegex.match(string); auto match = linkRegex.match(string);
if (!match.hasMatch()) { if (!match.hasMatch()) {
@ -55,8 +90,10 @@ QString MessageBuilder::matchLink(const QString &string)
QString captured = match.captured(); QString captured = match.captured();
if (!captured.contains(httpRegex)) { if (!captured.contains(httpRegex)) {
if (!captured.contains(ftpRegex)) {
captured.insert(0, "http://"); captured.insert(0, "http://");
} }
}
return captured; return captured;
} }

13
tools/get-tlds-update.sh Normal file
View file

@ -0,0 +1,13 @@
#!/bin/sh
# Regenerates resources/tlds.txt from IANA's official list of active TLDs.
#
# Output format: one lowercase TLD per line -- first every entry published by
# IANA (punycode entries keep their xn-- form), followed by the Unicode form
# of each punycode entry.
#
# Requires: curl, sed, tr, idn2, mktemp.
set -eu

# Resolve the destination relative to this script so it can be run from any
# working directory, not only from tools/.
script_dir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)
out_file="$script_dir/../resources/tlds.txt"

# Build everything in a scratch directory so that a failed download or a
# missing tool never leaves a truncated list in resources/.
work_dir=$(mktemp -d)
trap 'rm -rf "$work_dir"' EXIT

# Download the official list of active TLDs from IANA.
# -f makes curl fail on HTTP errors instead of saving the error page.
curl -fsS -o "$work_dir/raw.txt" 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Remove the first line that contains data not needed.
# Put everything that can be into lowercase (tr is used instead of sed's \L,
# which is a GNU-only extension).
sed -e '1d' "$work_dir/raw.txt" | tr '[:upper:]' '[:lower:]' > "$work_dir/tlds.txt"

if [ ! -s "$work_dir/tlds.txt" ]; then
    echo "error: downloaded TLD list is empty" >&2
    exit 1
fi

# Get the TLDs in punycode format and convert the punycode to Unicode.
# The result goes to a separate file rather than being appended to the file
# that sed is still reading.
sed -n -e '/^xn--/p' "$work_dir/tlds.txt" | idn2 -d > "$work_dir/unicode.txt"

# Append the Unicode names after the IANA entries and install the result.
cat "$work_dir/tlds.txt" "$work_dir/unicode.txt" > "$work_dir/combined.txt"
mv "$work_dir/combined.txt" "$out_file"