refactor: move zero width replacement to a function (#5594)

This commit is contained in:
nerix 2024-09-28 14:49:26 +02:00 committed by GitHub
parent e149be3820
commit d0bcf35fdc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 89 additions and 46 deletions

View file

@ -90,6 +90,7 @@
- Dev: The timer for `StreamerMode` is now destroyed on the correct thread. (#5571)
- Dev: Cleanup some parts of the `magic_enum` adaptation for Qt. (#5587)
- Dev: Refactored `static`s in headers to only be present once in the final app. (#5588)
- Dev: Refactored legacy Unicode zero-width-joiner replacement. (#5594)
- Dev: The JSON output when copying a message (<kbd>SHIFT</kbd> + right-click) is now more extensive. (#5600)
## 2.5.1

View file

@ -1,22 +1,13 @@
#include "providers/recentmessages/Impl.hpp"
#include "common/Env.hpp"
#include "common/QLogging.hpp"
#include "messages/MessageBuilder.hpp"
#include "providers/twitch/IrcMessageHandler.hpp"
#include "providers/twitch/TwitchChannel.hpp"
#include "util/FormatTime.hpp"
#include "util/Helpers.hpp"
#include <QJsonArray>
#include <QUrlQuery>
namespace {
// File-local shorthand for `chatterinoRecentMessages`.
// NOTE(review): presumably the logging category declared in
// common/QLogging.hpp - confirm.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
const auto &LOG = chatterinoRecentMessages;
} // namespace
namespace chatterino::recentmessages::detail {
// Parse the IRC messages returned in JSON form into Communi messages
@ -33,11 +24,7 @@ std::vector<Communi::IrcMessage *> parseRecentMessages(
for (const auto &jsonMessage : jsonMessages)
{
auto content = jsonMessage.toString();
// For explanation of why this exists, see src/providers/twitch/TwitchChannel.hpp,
// where these constants are defined
content.replace(COMBINED_FIXER, ZERO_WIDTH_JOINER);
auto content = unescapeZeroWidthJoiner(jsonMessage.toString());
auto *message =
Communi::IrcMessage::fromData(content.toUtf8(), nullptr);

View file

@ -702,15 +702,8 @@ void IrcMessageHandler::handlePrivMessage(Communi::IrcPrivateMessage *message,
}
}
// This is for compatibility with older Chatterino versions. Twitch didn't use
// to allow ZERO WIDTH JOINER unicode character, so Chatterino used ESCAPE_TAG
// instead.
// See https://github.com/Chatterino/chatterino2/issues/3384 and
// https://mm2pl.github.io/emoji_rfc.pdf for more details
this->addMessage(
message, chan,
message->content().replace(COMBINED_FIXER, ZERO_WIDTH_JOINER),
twitchServer, false, message->isAction());
this->addMessage(message, chan, unescapeZeroWidthJoiner(message->content()),
twitchServer, false, message->isAction());
if (message->tags().contains(u"pinned-chat-paid-amount"_s))
{
@ -915,10 +908,9 @@ void IrcMessageHandler::handleWhisperMessage(Communi::IrcMessage *ircMessage)
auto *c = getApp()->getTwitch()->getWhispersChannel().get();
MessageBuilder builder(
c, ircMessage, args,
ircMessage->parameter(1).replace(COMBINED_FIXER, ZERO_WIDTH_JOINER),
false);
MessageBuilder builder(c, ircMessage, args,
unescapeZeroWidthJoiner(ircMessage->parameter(1)),
false);
if (builder.isIgnored())
{

View file

@ -27,24 +27,6 @@
namespace chatterino {
// This is for compatibility with older Chatterino versions. Twitch didn't use
// to allow ZERO WIDTH JOINER unicode character, so Chatterino used ESCAPE_TAG
// instead.
// See https://github.com/Chatterino/chatterino2/issues/3384 and
// https://mm2pl.github.io/emoji_rfc.pdf for more details
const QString ZERO_WIDTH_JOINER = QString(QChar(0x200D));
// Here be MSVC: Do NOT replace with "\U" literal, it will fail silently.
namespace {
const QChar ESCAPE_TAG_CHARS[2] = {QChar::highSurrogate(0xE0002),
QChar::lowSurrogate(0xE0002)};
}
const QString ESCAPE_TAG = QString(ESCAPE_TAG_CHARS, 2);
const static QRegularExpression COMBINED_FIXER(
QString("(?<!%1)%1").arg(ESCAPE_TAG),
QRegularExpression::UseUnicodePropertiesOption);
enum class HighlightState;
struct Emote;

View file

@ -7,6 +7,18 @@
#include <QRegularExpression>
#include <QUuid>
namespace {
// U+200D ZERO WIDTH JOINER; the replacement text used together with
// ESCAPE_TAG_REGEX in unescapeZeroWidthJoiner().
const QString ZERO_WIDTH_JOINER = QStringLiteral("\u200D");
// Note: \U requires /utf-8 for MSVC
// See https://mm2pl.github.io/emoji_rfc.pdf
//
// Matches a U+E0002 code point that is not immediately preceded by another
// U+E0002 (negative lookbehind). In a run of consecutive tags only the first
// one therefore matches.
const QRegularExpression ESCAPE_TAG_REGEX(
QStringLiteral("(?<!\U000E0002)\U000E0002"),
QRegularExpression::UseUnicodePropertiesOption);
} // namespace
namespace chatterino {
namespace helpers::detail {
@ -283,4 +295,10 @@ bool compareEmoteStrings(const QString &a, const QString &b)
return k < 0;
}
// Replaces every match of ESCAPE_TAG_REGEX in `escaped` with
// ZERO_WIDTH_JOINER and returns the result.
QString unescapeZeroWidthJoiner(QString escaped)
{
    // `escaped` is taken by value, so the in-place replacement only affects
    // this function's own copy of the text.
    return escaped.replace(ESCAPE_TAG_REGEX, ZERO_WIDTH_JOINER);
}
} // namespace chatterino

View file

@ -182,4 +182,11 @@ constexpr std::optional<std::decay_t<T>> makeConditionedOptional(bool condition,
return std::nullopt;
}
/// @brief Unescapes zero width joiners (ZWJ; U+200D) from Twitch messages
///
/// Older Chatterino versions escape ZWJ with an ESCAPE TAG (U+E0002), following
/// https://mm2pl.github.io/emoji_rfc.pdf. This function replaces each escape
/// tag that is not directly preceded by another escape tag with a ZWJ, so in a
/// run of consecutive escape tags only the first one is replaced. ZWJs that
/// are already present are left untouched.
/// See also: https://github.com/Chatterino/chatterino2/issues/3384.
///
/// @param escaped Message text that may contain escape tags
/// @return The text with the matching escape tags replaced by ZWJs
QString unescapeZeroWidthJoiner(QString escaped);
} // namespace chatterino

View file

@ -2,6 +2,8 @@
#include "Test.hpp"
#include <span>
using namespace chatterino;
using namespace helpers::detail;
@ -500,3 +502,57 @@ TEST(Helpers, parseDurationToSeconds)
<< c.output;
}
}
// Verifies that unescapeZeroWidthJoiner() turns escape tags (U+E0002) into
// zero width joiners (U+200D) while leaving all other text untouched.
TEST(Helpers, unescapeZeroWidthJoiner)
{
    // Compile-time sanity check: the expectations below are only meaningful if
    // the compiler encodes \u and \U escapes in UTF-16 literals as expected
    // (one code unit for U+200D, a surrogate pair for U+E0002).
    constexpr auto unicodeLiteralsWork = [] {
        constexpr std::span joiner = u"\u200D";
        static_assert(joiner.size() == 2);
        static_assert(joiner[0] == u'\x200D');
        static_assert(joiner[1] == u'\0');

        constexpr std::span tag = u"\U000E0002";
        static_assert(tag.size() == 3);
        static_assert(tag[0] == u'\xDB40');
        static_assert(tag[1] == u'\xDC02');
        static_assert(tag[2] == u'\0');

        return true;
    };
    static_assert(unicodeLiteralsWork(),
                  "The compiler must support Unicode string literals");

    struct Case {
        QStringView escaped;
        QStringView expected;
    };

    const std::vector<Case> cases{
        // text without escape tags is passed through unchanged
        {u"foo bar", u"foo bar"},
        {u"", u""},
        {u"a", u"a"},
        // a single escape tag becomes a ZWJ
        {u"\U000E0002", u"\u200D"},
        {u"foo\U000E0002bar", u"foo\u200Dbar"},
        {u"foo \U000E0002 bar", u"foo \u200D bar"},
        {u"\U0001F468\U000E0002\U0001F33E", u"\U0001F468\u200D\U0001F33E"},
        // don't replace ZWJ
        {u"\U0001F468\u200D\U0001F33E", u"\U0001F468\u200D\U0001F33E"},
        // only replace the first escape tag in sequences
        {
            u"\U0001F468\U000E0002\U000E0002\U0001F33E",
            u"\U0001F468\u200D\U000E0002\U0001F33E",
        },
        {
            u"\U0001F468\U000E0002\U000E0002\U000E0002\U0001F33E",
            u"\U0001F468\u200D\U000E0002\U000E0002\U0001F33E",
        },
    };

    for (const auto &testCase : cases)
    {
        const auto actual = unescapeZeroWidthJoiner(testCase.escaped.toString());
        EXPECT_EQ(actual, testCase.expected);
    }
}