Don't consider textual characters to be emoji (#12582)
* Don't consider textual characters to be emoji

  We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like '↔' that could turn into an emoji if paired with a variation selector.

  Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done.

* Add a fallback for BIGEMOJI_REGEX as well
This commit is contained in:
parent
489bc32674
commit
c61eca8c24
6 changed files with 98 additions and 12 deletions
|
@ -20,7 +20,6 @@ limitations under the License.
|
|||
import React, { LegacyRef, ReactNode } from "react";
|
||||
import sanitizeHtml from "sanitize-html";
|
||||
import classNames from "classnames";
|
||||
import EMOJIBASE_REGEX from "emojibase-regex";
|
||||
import katex from "katex";
|
||||
import { decode } from "html-entities";
|
||||
import { IContent } from "matrix-js-sdk/src/matrix";
|
||||
|
@ -46,10 +45,35 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/;
|
|||
const SYMBOL_PATTERN = /([\u2100-\u2bff])/;
|
||||
|
||||
// Regex pattern for non-emoji characters that can appear in an "all-emoji" message
|
||||
// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace)
|
||||
const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g;
|
||||
// (Zero-Width Space, other whitespace)
|
||||
const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g;
|
||||
|
||||
const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i");
|
||||
// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional
|
||||
// emoji presentation VS (U+FE0F), but not those sequences that are followed by
|
||||
// a text presentation VS (U+FE0E). We also count lone regional indicators
|
||||
// (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji
|
||||
// followed by U+FE0E when the emoji doesn't have a text variant, but in
|
||||
// practice this doesn't matter.
|
||||
// Built once at module load by an immediately-invoked function so that a
// failure to construct the v-mode regex can be caught and replaced with a
// fallback instead of propagating.
export const EMOJI_REGEX = (() => {
|
||||
try {
|
||||
// Per our support policy, v mode is available to us, but we still don't
|
||||
// want the app to completely crash on older platforms. We use the
|
||||
// constructor here to avoid a syntax error on such platforms.
|
||||
// Pattern breakdown (two alternatives):
|
||||
//   1. \p{RGI_Emoji} not immediately followed by U+FE0E, then an optional
|
||||
//      U+FE0F that is only consumed when the character before it is not
|
||||
//      already U+FE0F (the lookbehind).
|
||||
//   2. A single code point in the range U+1F1E6-U+1F1FF.
|
||||
// No "g" flag is set on this regex.
|
||||
return new RegExp("\\p{RGI_Emoji}(?!\\uFE0E)(?:(?<!\\uFE0F)\\uFE0F)?|[\\u{1f1e6}-\\u{1f1ff}]", "v");
|
||||
} catch (_e) {
|
||||
// v mode not supported; fall back to matching nothing
|
||||
// (an empty negative lookahead can never succeed, so this regex never matches)
|
||||
return /(?!)/;
|
||||
}
|
||||
})();
|
||||
|
||||
// Regex that matches only when the entire string, from start (^) to end ($),
// consists of one or more repetitions of the EMOJI_REGEX pattern.
const BIGEMOJI_REGEX = (() => {
|
||||
try {
|
||||
// Reuses EMOJI_REGEX.source, so the "v" flag is passed here as well, combined
|
||||
// with "i". NOTE(review): presumably the constructor throws on platforms
|
||||
// that reject the "v" flag, which is what the catch below handles.
|
||||
return new RegExp(`^(${EMOJI_REGEX.source})+$`, "iv");
|
||||
} catch (_e) {
|
||||
// Fall back, just like for EMOJI_REGEX
|
||||
// (a regex that never matches)
|
||||
return /(?!)/;
|
||||
}
|
||||
})();
|
||||
|
||||
/*
|
||||
* Return true if the given string contains emoji
|
||||
|
@ -266,7 +290,7 @@ export function formatEmojis(message: string | undefined, isHtmlMessage?: boolea
|
|||
let key = 0;
|
||||
|
||||
for (const data of graphemeSegmenter.segment(message)) {
|
||||
if (EMOJIBASE_REGEX.test(data.segment)) {
|
||||
if (EMOJI_REGEX.test(data.segment)) {
|
||||
if (text) {
|
||||
result.push(text);
|
||||
text = "";
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue