1006 lines
29 KiB
C++
1006 lines
29 KiB
C++
// Copyright (C) 2016 The Qt Company Ltd.
|
|
// Copyright (C) 2016 Intel Corporation.
|
|
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
|
|
|
#include "qurl_p.h"
|
|
|
|
#include <QtCore/qstringlist.h>
|
|
#include <QtCore/private/qnumeric_p.h>
|
|
#include <QtCore/private/qoffsetstringarray_p.h>
|
|
#include <QtCore/private/qstringiterator_p.h>
|
|
#include <QtCore/private/qunicodetables_p.h>
|
|
|
|
#include <algorithm>
|
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
using namespace Qt::StringLiterals;
|
|
|
|
// needed by the punycode encoder/decoder
|
|
static const uint base = 36;
|
|
static const uint tmin = 1;
|
|
static const uint tmax = 26;
|
|
static const uint skew = 38;
|
|
static const uint damp = 700;
|
|
static const uint initial_bias = 72;
|
|
static const uint initial_n = 128;
|
|
|
|
static constexpr qsizetype MaxDomainLabelLength = 63;
|
|
|
|
static inline uint encodeDigit(uint digit)
|
|
{
|
|
return digit + 22 + 75 * (digit < 26);
|
|
}
|
|
|
|
static inline uint adapt(uint delta, uint numpoints, bool firsttime)
|
|
{
|
|
delta /= (firsttime ? damp : 2);
|
|
delta += (delta / numpoints);
|
|
|
|
uint k = 0;
|
|
for (; delta > ((base - tmin) * tmax) / 2; k += base)
|
|
delta /= (base - tmin);
|
|
|
|
return k + (((base - tmin + 1) * delta) / (delta + skew));
|
|
}
|
|
|
|
static inline void appendEncode(QString *output, uint delta, uint bias)
|
|
{
|
|
uint qq;
|
|
uint k;
|
|
uint t;
|
|
|
|
// insert the variable length delta integer.
|
|
for (qq = delta, k = base;; k += base) {
|
|
// stop generating digits when the threshold is
|
|
// detected.
|
|
t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
|
|
if (qq < t) break;
|
|
|
|
*output += QChar(encodeDigit(t + (qq - t) % (base - t)));
|
|
qq = (qq - t) / (base - t);
|
|
}
|
|
|
|
*output += QChar(encodeDigit(qq));
|
|
}
|
|
|
|
Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
|
|
{
|
|
uint n = initial_n;
|
|
uint delta = 0;
|
|
uint bias = initial_bias;
|
|
|
|
// Do not try to encode strings that certainly will result in output
|
|
// that is longer than allowable domain name label length. Note that
|
|
// non-BMP codepoints are encoded as two QChars.
|
|
if (in.size() > MaxDomainLabelLength * 2)
|
|
return;
|
|
|
|
int outLen = output->size();
|
|
output->resize(outLen + in.size());
|
|
|
|
QChar *d = output->data() + outLen;
|
|
bool skipped = false;
|
|
// copy all basic code points verbatim to output.
|
|
for (QChar c : in) {
|
|
if (c.unicode() < 0x80)
|
|
*d++ = c;
|
|
else
|
|
skipped = true;
|
|
}
|
|
|
|
// if there were only basic code points, just return them
|
|
// directly; don't do any encoding.
|
|
if (!skipped)
|
|
return;
|
|
|
|
output->truncate(d - output->constData());
|
|
int copied = output->size() - outLen;
|
|
|
|
// h and b now contain the number of basic code points in input.
|
|
uint b = copied;
|
|
uint h = copied;
|
|
|
|
// if basic code points were copied, add the delimiter character.
|
|
if (h > 0)
|
|
*output += u'-';
|
|
|
|
// compute the input length in Unicode code points.
|
|
uint inputLength = 0;
|
|
for (QStringIterator iter(in); iter.hasNext();) {
|
|
inputLength++;
|
|
|
|
if (iter.next(char32_t(-1)) == char32_t(-1)) {
|
|
output->truncate(outLen);
|
|
return; // invalid surrogate pair
|
|
}
|
|
}
|
|
|
|
// while there are still unprocessed non-basic code points left in
|
|
// the input string...
|
|
while (h < inputLength) {
|
|
// find the character in the input string with the lowest unprocessed value.
|
|
uint m = std::numeric_limits<uint>::max();
|
|
for (QStringIterator iter(in); iter.hasNext();) {
|
|
auto c = iter.nextUnchecked();
|
|
static_assert(std::numeric_limits<decltype(m)>::max()
|
|
>= std::numeric_limits<decltype(c)>::max(),
|
|
"Punycode uint should be able to cover all codepoints");
|
|
if (c >= n && c < m)
|
|
m = c;
|
|
}
|
|
|
|
// delta = delta + (m - n) * (h + 1), fail on overflow
|
|
uint tmp;
|
|
if (qMulOverflow<uint>(m - n, h + 1, &tmp) || qAddOverflow<uint>(delta, tmp, &delta)) {
|
|
output->truncate(outLen);
|
|
return; // punycode_overflow
|
|
}
|
|
n = m;
|
|
|
|
for (QStringIterator iter(in); iter.hasNext();) {
|
|
auto c = iter.nextUnchecked();
|
|
|
|
// increase delta until we reach the character processed in this iteration;
|
|
// fail if delta overflows.
|
|
if (c < n) {
|
|
if (qAddOverflow<uint>(delta, 1, &delta)) {
|
|
output->truncate(outLen);
|
|
return; // punycode_overflow
|
|
}
|
|
}
|
|
|
|
if (c == n) {
|
|
appendEncode(output, delta, bias);
|
|
|
|
bias = adapt(delta, h + 1, h == b);
|
|
delta = 0;
|
|
++h;
|
|
}
|
|
}
|
|
|
|
++delta;
|
|
++n;
|
|
}
|
|
|
|
// prepend ACE prefix
|
|
output->insert(outLen, "xn--"_L1);
|
|
return;
|
|
}
|
|
|
|
Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
|
|
{
|
|
uint n = initial_n;
|
|
uint i = 0;
|
|
uint bias = initial_bias;
|
|
|
|
// Do not try to decode strings longer than allowable for a domain label.
|
|
// Non-ASCII strings are not allowed here anyway, so there is no need
|
|
// to account for surrogates.
|
|
if (pc.size() > MaxDomainLabelLength)
|
|
return QString();
|
|
|
|
// strip any ACE prefix
|
|
int start = pc.startsWith("xn--"_L1) ? 4 : 0;
|
|
if (!start)
|
|
return pc;
|
|
|
|
// find the last delimiter character '-' in the input array. copy
|
|
// all data before this delimiter directly to the output array.
|
|
int delimiterPos = pc.lastIndexOf(u'-');
|
|
auto output = delimiterPos < 4 ? std::u32string()
|
|
: pc.mid(start, delimiterPos - start).toStdU32String();
|
|
|
|
// if a delimiter was found, skip to the position after it;
|
|
// otherwise start at the front of the input string. everything
|
|
// before the delimiter is assumed to be basic code points.
|
|
uint cnt = delimiterPos + 1;
|
|
|
|
// loop through the rest of the input string, inserting non-basic
|
|
// characters into output as we go.
|
|
while (cnt < (uint) pc.size()) {
|
|
uint oldi = i;
|
|
uint w = 1;
|
|
|
|
// find the next index for inserting a non-basic character.
|
|
for (uint k = base; cnt < (uint) pc.size(); k += base) {
|
|
// grab a character from the punycode input and find its
|
|
// delta digit (each digit code is part of the
|
|
// variable-length integer delta)
|
|
uint digit = pc.at(cnt++).unicode();
|
|
if (digit - 48 < 10) digit -= 22;
|
|
else if (digit - 65 < 26) digit -= 65;
|
|
else if (digit - 97 < 26) digit -= 97;
|
|
else digit = base;
|
|
|
|
// Fail if the code point has no digit value
|
|
if (digit >= base)
|
|
return QString();
|
|
|
|
// i = i + digit * w, fail on overflow
|
|
uint tmp;
|
|
if (qMulOverflow<uint>(digit, w, &tmp) || qAddOverflow<uint>(i, tmp, &i))
|
|
return QString();
|
|
|
|
// detect threshold to stop reading delta digits
|
|
uint t;
|
|
if (k <= bias) t = tmin;
|
|
else if (k >= bias + tmax) t = tmax;
|
|
else t = k - bias;
|
|
|
|
if (digit < t) break;
|
|
|
|
// w = w * (base - t), fail on overflow
|
|
if (qMulOverflow<uint>(w, base - t, &w))
|
|
return QString();
|
|
}
|
|
|
|
// find new bias and calculate the next non-basic code
|
|
// character.
|
|
uint outputLength = static_cast<uint>(output.length());
|
|
bias = adapt(i - oldi, outputLength + 1, oldi == 0);
|
|
|
|
// n = n + i div (length(output) + 1), fail on overflow
|
|
if (qAddOverflow<uint>(n, i / (outputLength + 1), &n))
|
|
return QString();
|
|
|
|
// allow the deltas to wrap around
|
|
i %= (outputLength + 1);
|
|
|
|
// if n is a basic code point then fail; this should not happen with
|
|
// correct implementation of Punycode, but check just n case.
|
|
if (n < initial_n) {
|
|
// Don't use Q_ASSERT() to avoid possibility of DoS
|
|
qWarning("Attempt to insert a basic codepoint. Unhandled overflow?");
|
|
return QString();
|
|
}
|
|
|
|
// Surrogates should normally be rejected later by other IDNA code.
|
|
// But because of Qt's use of UTF-16 to represent strings the
|
|
// IDNA code is not able to distinguish characters represented as pairs
|
|
// of surrogates from normal code points. This is why surrogates are
|
|
// not allowed here.
|
|
//
|
|
// Allowing surrogates would lead to non-unique (after normalization)
|
|
// encoding of strings with non-BMP characters.
|
|
//
|
|
// Punycode that encodes characters outside the Unicode range is also
|
|
// invalid and is rejected here.
|
|
if (QChar::isSurrogate(n) || n > QChar::LastValidCodePoint)
|
|
return QString();
|
|
|
|
// insert the character n at position i
|
|
output.insert(i, 1, static_cast<char32_t>(n));
|
|
++i;
|
|
}
|
|
|
|
return QString::fromStdU32String(output);
|
|
}
|
|
|
|
static constexpr auto idn_whitelist = qOffsetStringArray(
|
|
"ac", "ar", "asia", "at",
|
|
"biz", "br",
|
|
"cat", "ch", "cl", "cn", "com",
|
|
"de", "dk",
|
|
"es",
|
|
"fi",
|
|
"gr",
|
|
"hu",
|
|
"il", "info", "io", "ir", "is",
|
|
"jp",
|
|
"kr",
|
|
"li", "lt", "lu", "lv",
|
|
"museum",
|
|
"name", "net", "no", "nu", "nz",
|
|
"org",
|
|
"pl", "pr",
|
|
"se", "sh",
|
|
"tel", "th", "tm", "tw",
|
|
"ua",
|
|
"vn",
|
|
"xn--fiqs8s", // China
|
|
"xn--fiqz9s", // China
|
|
"xn--fzc2c9e2c", // Sri Lanka
|
|
"xn--j6w193g", // Hong Kong
|
|
"xn--kprw13d", // Taiwan
|
|
"xn--kpry57d", // Taiwan
|
|
"xn--mgba3a4f16a", // Iran
|
|
"xn--mgba3a4fra", // Iran
|
|
"xn--mgbaam7a8h", // UAE
|
|
"xn--mgbayh7gpa", // Jordan
|
|
"xn--mgberp4a5d4ar", // Saudi Arabia
|
|
"xn--ogbpf8fl", // Syria
|
|
"xn--p1ai", // Russian Federation
|
|
"xn--wgbh1c", // Egypt
|
|
"xn--wgbl6a", // Qatar
|
|
"xn--xkc2al3hye2a" // Sri Lanka
|
|
);
|
|
|
|
Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr;
|
|
|
|
static bool lessThan(const QChar *a, int l, const char *c)
|
|
{
|
|
const auto *uc = reinterpret_cast<const char16_t *>(a);
|
|
const char16_t *e = uc + l;
|
|
|
|
if (!c || *c == 0)
|
|
return false;
|
|
|
|
while (*c) {
|
|
if (uc == e || *uc != static_cast<unsigned char>(*c))
|
|
break;
|
|
++uc;
|
|
++c;
|
|
}
|
|
return uc == e ? *c : (*uc < static_cast<unsigned char>(*c));
|
|
}
|
|
|
|
static bool equal(const QChar *a, int l, const char *b)
|
|
{
|
|
while (l && a->unicode() && *b) {
|
|
if (*a != QLatin1Char(*b))
|
|
return false;
|
|
++a;
|
|
++b;
|
|
--l;
|
|
}
|
|
return l == 0;
|
|
}
|
|
|
|
static bool qt_is_idn_enabled(QStringView aceDomain)
|
|
{
|
|
auto idx = aceDomain.lastIndexOf(u'.');
|
|
if (idx == -1)
|
|
return false;
|
|
|
|
auto tldString = aceDomain.mid(idx + 1);
|
|
const auto len = tldString.size();
|
|
|
|
const QChar *tld = tldString.constData();
|
|
|
|
if (user_idn_whitelist)
|
|
return user_idn_whitelist->contains(tldString);
|
|
|
|
int l = 0;
|
|
int r = idn_whitelist.count() - 1;
|
|
int i = (l + r + 1) / 2;
|
|
|
|
while (r != l) {
|
|
if (lessThan(tld, len, idn_whitelist.at(i)))
|
|
r = i - 1;
|
|
else
|
|
l = i;
|
|
i = (l + r + 1) / 2;
|
|
}
|
|
return equal(tld, len, idn_whitelist.at(i));
|
|
}
|
|
|
|
template<typename C>
|
|
static inline bool isValidInNormalizedAsciiLabel(C c)
|
|
{
|
|
return c == u'-' || c == u'_' || (c >= u'0' && c <= u'9') || (c >= u'a' && c <= u'z');
|
|
}
|
|
|
|
template<typename C>
|
|
static inline bool isValidInNormalizedAsciiName(C c)
|
|
{
|
|
return isValidInNormalizedAsciiLabel(c) || c == u'.';
|
|
}
|
|
|
|
/*
|
|
Map domain name according to algorithm in UTS #46, 4.1
|
|
|
|
Returns empty string if there are disallowed characters in the input.
|
|
|
|
Sets resultIsAscii if the result is known for sure to be all ASCII.
|
|
*/
|
|
static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options,
|
|
bool *resultIsAscii)
|
|
{
|
|
*resultIsAscii = true;
|
|
|
|
// Check if the input is already normalized ASCII first and can be returned as is.
|
|
int i = 0;
|
|
for (auto c : in) {
|
|
if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c))
|
|
break;
|
|
i++;
|
|
}
|
|
|
|
if (i == in.size())
|
|
return in;
|
|
|
|
QString result;
|
|
result.reserve(in.size());
|
|
result.append(in.constData(), i);
|
|
bool allAscii = true;
|
|
|
|
for (QStringIterator iter(QStringView(in).sliced(i)); iter.hasNext();) {
|
|
char32_t uc = iter.next();
|
|
|
|
// Fast path for ASCII-only inputs
|
|
if (Q_LIKELY(uc < 0x80)) {
|
|
if (uc >= U'A' && uc <= U'Z')
|
|
uc |= 0x20; // lower-case it
|
|
|
|
if (isValidInNormalizedAsciiName(uc)) {
|
|
result.append(static_cast<char16_t>(uc));
|
|
continue;
|
|
}
|
|
}
|
|
|
|
allAscii = false;
|
|
|
|
// Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1)
|
|
if (uc == 0x1E9E && options.testFlag(QUrl::AceTransitionalProcessing)) {
|
|
result.append(u"ss"_s);
|
|
continue;
|
|
}
|
|
|
|
QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(uc);
|
|
|
|
if (status == QUnicodeTables::IdnaStatus::Deviation)
|
|
status = options.testFlag(QUrl::AceTransitionalProcessing)
|
|
? QUnicodeTables::IdnaStatus::Mapped
|
|
: QUnicodeTables::IdnaStatus::Valid;
|
|
|
|
switch (status) {
|
|
case QUnicodeTables::IdnaStatus::Ignored:
|
|
continue;
|
|
case QUnicodeTables::IdnaStatus::Valid:
|
|
case QUnicodeTables::IdnaStatus::Disallowed:
|
|
for (auto c : QChar::fromUcs4(uc))
|
|
result.append(c);
|
|
break;
|
|
case QUnicodeTables::IdnaStatus::Mapped:
|
|
result.append(QUnicodeTables::idnaMapping(uc));
|
|
break;
|
|
default:
|
|
Q_UNREACHABLE();
|
|
}
|
|
}
|
|
|
|
*resultIsAscii = allAscii;
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
Check the rules for an ASCII label.
|
|
|
|
Check the size restriction and that the label does not start or end with dashes.
|
|
|
|
The label should be nonempty.
|
|
*/
|
|
static bool validateAsciiLabel(QStringView label)
|
|
{
|
|
if (label.size() > MaxDomainLabelLength)
|
|
return false;
|
|
|
|
if (label.first() == u'-' || label.last() == u'-')
|
|
return false;
|
|
|
|
return std::all_of(label.begin(), label.end(), isValidInNormalizedAsciiLabel<QChar>);
|
|
}
|
|
|
|
namespace {
|
|
|
|
class DomainValidityChecker
|
|
{
|
|
bool domainNameIsBidi = false;
|
|
bool hadBidiErrors = false;
|
|
bool ignoreBidiErrors;
|
|
|
|
static constexpr char32_t ZWNJ = U'\u200C';
|
|
static constexpr char32_t ZWJ = U'\u200D';
|
|
|
|
public:
|
|
DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { }
|
|
bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
|
|
|
|
private:
|
|
static bool checkContextJRules(QStringView label);
|
|
static bool checkBidiRules(QStringView label);
|
|
};
|
|
|
|
} // anonymous namespace
|
|
|
|
/*
|
|
Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2.
|
|
|
|
Rule Set for U+200C (ZWNJ):
|
|
|
|
False;
|
|
|
|
If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
|
|
|
|
If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
|
|
|
|
(Joining_Type:T)*(Joining_Type:{R,D})) Then True;
|
|
|
|
Rule Set for U+200D (ZWJ):
|
|
|
|
False;
|
|
|
|
If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
|
|
|
|
*/
|
|
bool DomainValidityChecker::checkContextJRules(QStringView label)
|
|
{
|
|
constexpr unsigned char CombiningClassVirama = 9;
|
|
|
|
enum class State {
|
|
Initial,
|
|
LD_T, // L,D with possible following T*
|
|
ZWNJ_T, // ZWNJ with possible following T*
|
|
};
|
|
State regexpState = State::Initial;
|
|
bool previousIsVirama = false;
|
|
|
|
for (QStringIterator iter(label); iter.hasNext();) {
|
|
auto ch = iter.next();
|
|
|
|
if (ch == ZWJ) {
|
|
if (!previousIsVirama)
|
|
return false;
|
|
regexpState = State::Initial;
|
|
} else if (ch == ZWNJ) {
|
|
if (!previousIsVirama && regexpState != State::LD_T)
|
|
return false;
|
|
regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T;
|
|
} else {
|
|
switch (QChar::joiningType(ch)) {
|
|
case QChar::Joining_Left:
|
|
if (regexpState == State::ZWNJ_T)
|
|
return false;
|
|
regexpState = State::LD_T;
|
|
break;
|
|
case QChar::Joining_Right:
|
|
regexpState = State::Initial;
|
|
break;
|
|
case QChar::Joining_Dual:
|
|
regexpState = State::LD_T;
|
|
break;
|
|
case QChar::Joining_Transparent:
|
|
break;
|
|
default:
|
|
regexpState = State::Initial;
|
|
break;
|
|
}
|
|
}
|
|
|
|
previousIsVirama = QChar::combiningClass(ch) == CombiningClassVirama;
|
|
}
|
|
|
|
return regexpState != State::ZWNJ_T;
|
|
}
|
|
|
|
/*
|
|
Check if the label conforms to BiDi rule of RFC 5893.
|
|
|
|
1. The first character must be a character with Bidi property L, R,
|
|
or AL. If it has the R or AL property, it is an RTL label; if it
|
|
has the L property, it is an LTR label.
|
|
|
|
2. In an RTL label, only characters with the Bidi properties R, AL,
|
|
AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
|
|
|
3. In an RTL label, the end of the label must be a character with
|
|
Bidi property R, AL, EN, or AN, followed by zero or more
|
|
characters with Bidi property NSM.
|
|
|
|
4. In an RTL label, if an EN is present, no AN may be present, and
|
|
vice versa.
|
|
|
|
5. In an LTR label, only characters with the Bidi properties L, EN,
|
|
ES, CS, ET, ON, BN, or NSM are allowed.
|
|
|
|
6. In an LTR label, the end of the label must be a character with
|
|
Bidi property L or EN, followed by zero or more characters with
|
|
Bidi property NSM.
|
|
*/
|
|
bool DomainValidityChecker::checkBidiRules(QStringView label)
|
|
{
|
|
if (label.isEmpty())
|
|
return true;
|
|
|
|
QStringIterator iter(label);
|
|
Q_ASSERT(iter.hasNext());
|
|
|
|
char32_t ch = iter.next();
|
|
bool labelIsRTL = false;
|
|
|
|
switch (QChar::direction(ch)) {
|
|
case QChar::DirL:
|
|
break;
|
|
case QChar::DirR:
|
|
case QChar::DirAL:
|
|
labelIsRTL = true;
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
bool tailOk = true;
|
|
bool labelHasEN = false;
|
|
bool labelHasAN = false;
|
|
|
|
while (iter.hasNext()) {
|
|
ch = iter.next();
|
|
|
|
switch (QChar::direction(ch)) {
|
|
case QChar::DirR:
|
|
case QChar::DirAL:
|
|
if (!labelIsRTL)
|
|
return false;
|
|
tailOk = true;
|
|
break;
|
|
|
|
case QChar::DirL:
|
|
if (labelIsRTL)
|
|
return false;
|
|
tailOk = true;
|
|
break;
|
|
|
|
case QChar::DirES:
|
|
case QChar::DirCS:
|
|
case QChar::DirET:
|
|
case QChar::DirON:
|
|
case QChar::DirBN:
|
|
tailOk = false;
|
|
break;
|
|
|
|
case QChar::DirNSM:
|
|
break;
|
|
|
|
case QChar::DirAN:
|
|
if (labelIsRTL) {
|
|
if (labelHasEN)
|
|
return false;
|
|
labelHasAN = true;
|
|
tailOk = true;
|
|
} else {
|
|
return false;
|
|
}
|
|
break;
|
|
|
|
case QChar::DirEN:
|
|
if (labelIsRTL) {
|
|
if (labelHasAN)
|
|
return false;
|
|
labelHasEN = true;
|
|
}
|
|
tailOk = true;
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return tailOk;
|
|
}
|
|
|
|
/*
|
|
Check if the given label is valid according to UTS #46 validity criteria.
|
|
|
|
NFC check can be skipped if the label was transformed to NFC before calling
|
|
this function (as optimization).
|
|
|
|
The domain name is considered invalid if this function returns false at least
|
|
once.
|
|
|
|
1. The label must be in Unicode Normalization Form NFC.
|
|
2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
|
|
in both the third and fourth positions.
|
|
3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
|
|
4. The label must not contain a U+002E ( . ) FULL STOP.
|
|
5. The label must not begin with a combining mark, that is: General_Category=Mark.
|
|
6. Each code point in the label must only have certain status values according to Section 5,
|
|
IDNA Mapping Table:
|
|
1. For Transitional Processing, each value must be valid.
|
|
2. For Nontransitional Processing, each value must be either valid or deviation.
|
|
7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode
|
|
Code Points and Internationalized Domain Names for Applications (IDNA).
|
|
8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy
|
|
all six of the numbered conditions in RFC 5893, Section 2.
|
|
|
|
NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid
|
|
memory allocation when there is nothing to normalize.
|
|
*/
|
|
bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options)
|
|
{
|
|
if (label.isEmpty())
|
|
return true;
|
|
|
|
if (label != label.normalized(QString::NormalizationForm_C))
|
|
return false;
|
|
|
|
if (label.size() >= 4) {
|
|
// This assumes that the first two characters are in BMP, but that's ok
|
|
// because non-BMP characters are unlikely to be used for specifying
|
|
// future extensions.
|
|
if (label[2] == u'-' && label[3] == u'-')
|
|
return ignoreBidiErrors && label.startsWith(u"xn") && validateAsciiLabel(label);
|
|
}
|
|
|
|
if (label.startsWith(u'-') || label.endsWith(u'-'))
|
|
return false;
|
|
|
|
if (label.contains(u'.'))
|
|
return false;
|
|
|
|
QStringIterator iter(label);
|
|
auto c = iter.next();
|
|
|
|
if (QChar::isMark(c))
|
|
return false;
|
|
|
|
// As optimization, CONTEXTJ rules check can be skipped if no
|
|
// ZWJ/ZWNJ characters were found during the first pass.
|
|
bool hasJoiners = false;
|
|
|
|
for (;;) {
|
|
hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
|
|
|
|
if (!ignoreBidiErrors && !domainNameIsBidi) {
|
|
switch (QChar::direction(c)) {
|
|
case QChar::DirR:
|
|
case QChar::DirAL:
|
|
case QChar::DirAN:
|
|
domainNameIsBidi = true;
|
|
if (hadBidiErrors)
|
|
return false;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
switch (QUnicodeTables::idnaStatus(c)) {
|
|
case QUnicodeTables::IdnaStatus::Valid:
|
|
break;
|
|
case QUnicodeTables::IdnaStatus::Deviation:
|
|
if (options.testFlag(QUrl::AceTransitionalProcessing))
|
|
return false;
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
if (!iter.hasNext())
|
|
break;
|
|
c = iter.next();
|
|
}
|
|
|
|
if (hasJoiners && !checkContextJRules(label))
|
|
return false;
|
|
|
|
hadBidiErrors = hadBidiErrors || !checkBidiRules(label);
|
|
|
|
if (domainNameIsBidi && hadBidiErrors)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
|
|
{
|
|
qsizetype lastIdx = 0;
|
|
QString aceForm; // this variable is here for caching
|
|
QString aceResult;
|
|
|
|
while (true) {
|
|
qsizetype idx = normalizedDomain.indexOf(u'.', lastIdx);
|
|
if (idx == -1)
|
|
idx = normalizedDomain.size();
|
|
|
|
const qsizetype labelLength = idx - lastIdx;
|
|
if (labelLength) {
|
|
const auto label = normalizedDomain.sliced(lastIdx, labelLength);
|
|
aceForm.clear();
|
|
qt_punycodeEncoder(label, &aceForm);
|
|
if (aceForm.isEmpty())
|
|
return {};
|
|
|
|
aceResult.append(aceForm);
|
|
}
|
|
|
|
if (idx == normalizedDomain.size())
|
|
break;
|
|
|
|
if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0))
|
|
return {}; // two delimiters in a row -- empty label not allowed
|
|
|
|
lastIdx = idx + 1;
|
|
aceResult += u'.';
|
|
}
|
|
|
|
return aceResult;
|
|
}
|
|
|
|
static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot,
|
|
bool *usesPunycode)
|
|
{
|
|
qsizetype lastIdx = 0;
|
|
bool hasPunycode = false;
|
|
*usesPunycode = false;
|
|
|
|
while (lastIdx < normalizedDomain.size()) {
|
|
auto idx = normalizedDomain.indexOf(u'.', lastIdx);
|
|
if (idx == -1)
|
|
idx = normalizedDomain.size();
|
|
|
|
const auto labelLength = idx - lastIdx;
|
|
if (labelLength == 0) {
|
|
if (idx == normalizedDomain.size())
|
|
break;
|
|
if (dot == ForbidLeadingDot || idx > 0)
|
|
return false; // two delimiters in a row -- empty label not allowed
|
|
} else {
|
|
const auto label = normalizedDomain.sliced(lastIdx, labelLength);
|
|
if (!validateAsciiLabel(label))
|
|
return false;
|
|
|
|
hasPunycode = hasPunycode || label.startsWith("xn--"_L1);
|
|
}
|
|
|
|
lastIdx = idx + 1;
|
|
}
|
|
|
|
*usesPunycode = hasPunycode;
|
|
return true;
|
|
}
|
|
|
|
static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
|
|
{
|
|
QString result;
|
|
result.reserve(asciiDomain.size());
|
|
qsizetype lastIdx = 0;
|
|
|
|
DomainValidityChecker checker;
|
|
|
|
while (true) {
|
|
auto idx = asciiDomain.indexOf(u'.', lastIdx);
|
|
if (idx == -1)
|
|
idx = asciiDomain.size();
|
|
|
|
const auto labelLength = idx - lastIdx;
|
|
if (labelLength == 0) {
|
|
if (idx == asciiDomain.size())
|
|
break;
|
|
} else {
|
|
const auto label = asciiDomain.sliced(lastIdx, labelLength);
|
|
const auto unicodeLabel = qt_punycodeDecoder(label);
|
|
|
|
if (unicodeLabel.isEmpty())
|
|
return asciiDomain;
|
|
|
|
if (!checker.checkLabel(unicodeLabel, options))
|
|
return asciiDomain;
|
|
|
|
result.append(unicodeLabel);
|
|
}
|
|
|
|
if (idx == asciiDomain.size())
|
|
break;
|
|
|
|
lastIdx = idx + 1;
|
|
result += u'.';
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
|
|
{
|
|
qsizetype lastIdx = 0;
|
|
|
|
DomainValidityChecker checker(true);
|
|
|
|
while (true) {
|
|
qsizetype idx = domainName.indexOf(u'.', lastIdx);
|
|
if (idx == -1)
|
|
idx = domainName.size();
|
|
|
|
const qsizetype labelLength = idx - lastIdx;
|
|
if (labelLength) {
|
|
const auto label = domainName.sliced(lastIdx, labelLength);
|
|
|
|
if (!checker.checkLabel(label, options))
|
|
return false;
|
|
}
|
|
|
|
if (idx == domainName.size())
|
|
break;
|
|
|
|
lastIdx = idx + 1;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
|
|
QUrl::AceProcessingOptions options)
|
|
{
|
|
if (domain.isEmpty())
|
|
return {};
|
|
|
|
bool mappedToAscii;
|
|
const QString mapped = mapDomainName(domain, options, &mappedToAscii);
|
|
const QString normalized =
|
|
mappedToAscii ? mapped : mapped.normalized(QString::NormalizationForm_C);
|
|
|
|
if (normalized.isEmpty())
|
|
return {};
|
|
|
|
if (!mappedToAscii && !checkUnicodeName(normalized, options))
|
|
return {};
|
|
|
|
bool needsConversionToUnicode;
|
|
const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalized, dot);
|
|
if (aceResult.isEmpty() || !checkAsciiDomainName(aceResult, dot, &needsConversionToUnicode))
|
|
return {};
|
|
|
|
if (op == ToAceOnly || !needsConversionToUnicode
|
|
|| (!options.testFlag(QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceResult))) {
|
|
return aceResult;
|
|
}
|
|
|
|
return convertToUnicode(aceResult, options);
|
|
}
|
|
|
|
/*!
|
|
\since 4.2
|
|
|
|
Returns the current whitelist of top-level domains that are allowed
|
|
to have non-ASCII characters in their compositions.
|
|
|
|
See setIdnWhitelist() for the rationale of this list.
|
|
|
|
\sa AceProcessingOption
|
|
*/
|
|
QStringList QUrl::idnWhitelist()
|
|
{
|
|
if (user_idn_whitelist)
|
|
return *user_idn_whitelist;
|
|
static const QStringList list = [] {
|
|
QStringList list;
|
|
list.reserve(idn_whitelist.count());
|
|
int i = 0;
|
|
while (i < idn_whitelist.count()) {
|
|
list << QLatin1StringView(idn_whitelist.at(i));
|
|
++i;
|
|
}
|
|
return list;
|
|
}();
|
|
return list;
|
|
}
|
|
|
|
/*!
|
|
\since 4.2
|
|
|
|
Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have
|
|
non-ASCII characters in domains to the value of \a list.
|
|
|
|
Note that if you call this function, you need to do so \e before
|
|
you start any threads that might access idnWhitelist().
|
|
|
|
Qt comes with a default list that contains the Internet top-level domains
|
|
that have published support for Internationalized Domain Names (IDNs)
|
|
and rules to guarantee that no deception can happen between similarly-looking
|
|
characters (such as the Latin lowercase letter \c 'a' and the Cyrillic
|
|
equivalent, which in most fonts are visually identical).
|
|
|
|
This list is periodically maintained, as registrars publish new rules.
|
|
|
|
This function is provided for those who need to manipulate the list, in
|
|
order to add or remove a TLD. It is not recommended to change its value
|
|
for purposes other than testing, as it may expose users to security risks.
|
|
*/
|
|
void QUrl::setIdnWhitelist(const QStringList &list)
|
|
{
|
|
if (!user_idn_whitelist)
|
|
user_idn_whitelist = new QStringList;
|
|
*user_idn_whitelist = list;
|
|
}
|
|
|
|
QT_END_NAMESPACE
|