RFC Compliant URL Matching

Implemented https://gist.github.com/dperini/729294 in C++

This makes URL Validation / Matching RFC compliant.
This commit is contained in:
Lajamerr Mittesdine 2018-06-04 20:51:10 -04:00 committed by GitHub
parent 819812c458
commit f5438ed7a9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -44,8 +44,41 @@ void MessageBuilder::appendTimestamp(const QTime &time)
QString MessageBuilder::matchLink(const QString &string)
{
static QRegularExpression linkRegex("[[:ascii:]]*\\.[a-zA-Z]+\\/?[[:ascii:]]*");
static QRegularExpression httpRegex("\\bhttps?://");
// Anchored ("^...$") pattern for a complete URL, ported from Diego Perini's
// URL validation regex (https://gist.github.com/dperini/729294).
// Declared static so the string is built once; it feeds the static linkRegex,
// which is itself only constructed once.
static const QString urlRegExp = "^"
    // protocol identifier (mandatory): http, https or ftp
    "(?:(?:https?|ftp)://)"
    // optional user:pass authentication
    "(?:\\S+(?::\\S*)?@)?"
    "(?:"
    // IP address exclusion: negative lookaheads reject
    // private & local networks (10/8, 127/8, 169.254/16, 192.168/16, 172.16/12)
    "(?!(?:10|127)(?:\\.\\d{1,3}){3})"
    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})"
    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})"
    // IP address dotted notation octets
    // first octet is 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
    // last octet is 1-254: excludes network & broadcast addresses
    // (first & last IP address of each class)
    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])"
    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))"
    "|"
    // host name: letters, U+00A1..U+FFFF and digits, with inner hyphens.
    // The closing brace of \x{ffff} must come before "0-9"; writing
    // \x{ffff0}-9 drops the digits 0-8 from the class and admits a literal '-'.
    "(?:(?:[a-z\\x{00a1}-\\x{ffff}0-9]-*)*[a-z\\x{00a1}-\\x{ffff}0-9]+)"
    // domain name: zero or more further dot-separated labels, same alphabet
    "(?:\\.(?:[a-z\\x{00a1}-\\x{ffff}0-9]-*)*[a-z\\x{00a1}-\\x{ffff}0-9]+)*"
    // TLD identifier: at least two non-digit characters
    "(?:\\.(?:[a-z\\x{00a1}-\\x{ffff}]{2,}))"
    // optional trailing dot
    "\\.?"
    ")"
    // optional port number (2 to 5 digits)
    "(?::\\d{2,5})?"
    // optional resource path, query or fragment
    "(?:[/?#]\\S*)?"
    "$";
static QRegularExpression linkRegex(urlRegExp, QRegularExpression::CaseInsensitiveOption);
auto match = linkRegex.match(string);
@ -55,10 +88,6 @@ QString MessageBuilder::matchLink(const QString &string)
QString captured = match.captured();
if (!captured.contains(httpRegex)) {
captured.insert(0, "http://");
}
return captured;
}