Don't consider textual characters to be emoji (#12582)

* Don't consider textual characters to be emoji

  We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like '↔' that could turn into an emoji if paired with a variation selector.

  Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done.

* Add a fallback for BIGEMOJI_REGEX as well
2024-07-04 13:48:07 -04:00 · 2024-07-04 13:48:07 -04:00 · c61eca8c24
commit c61eca8c24
parent 489bc32674
6 changed files with 98 additions and 12 deletions
--- a/src/HtmlUtils.tsx
+++ b/src/HtmlUtils.tsx
@ -20,7 +20,6 @@ limitations under the License.
 import React, { LegacyRef, ReactNode } from "react";
 import sanitizeHtml from "sanitize-html";
 import classNames from "classnames";
-import EMOJIBASE_REGEX from "emojibase-regex";
 import katex from "katex";
 import { decode } from "html-entities";
 import { IContent } from "matrix-js-sdk/src/matrix";
@ -46,10 +45,35 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/;
 const SYMBOL_PATTERN = /([\u2100-\u2bff])/;

 // Regex pattern for non-emoji characters that can appear in an "all-emoji" message
-// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace)
-const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g;
+// (Zero-Width Space, other whitespace)
+const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g;

-const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i");
+// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional
+// emoji presentation VS (U+FE0F), but not those sequences that are followed by
+// a text presentation VS (U+FE0E). We also count lone regional indicators
+// (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji
+// followed by U+FE0E when the emoji doesn't have a text variant, but in
+// practice this doesn't matter.
+export const EMOJI_REGEX = (() => { // built once, when the module is evaluated
+    try { // no g/y flag is used below, so repeated .test() calls carry no lastIndex state
+        // Per our support policy, v mode is available to us, but we still don't
+        // want the app to completely crash on older platforms. We use the
+        // constructor here to avoid a syntax error on such platforms.
+        return new RegExp("\\p{RGI_Emoji}(?!\\uFE0E)(?:(?<!\\uFE0F)\\uFE0F)?|[\\u{1f1e6}-\\u{1f1ff}]", "v");
+    } catch (_e) {
+        // v mode not supported; fall back to matching nothing
+        return /(?!)/; // an empty negative lookahead always fails, so this never matches
+    }
+})();
+
+const BIGEMOJI_REGEX = (() => { // anchored variant: the whole string must be one or more EMOJI_REGEX matches
+    try {
+        return new RegExp(`^(${EMOJI_REGEX.source})+$`, "iv"); // reuses EMOJI_REGEX's pattern text; needs v mode too
+    } catch (_e) {
+        // Fall back, just like for EMOJI_REGEX
+        return /(?!)/; // never matches
+    }
+})();

 /*
 * Return true if the given string contains emoji
@ -266,7 +290,7 @@ export function formatEmojis(message: string | undefined, isHtmlMessage?: boolea
    let key = 0;

    for (const data of graphemeSegmenter.segment(message)) {
-        if (EMOJIBASE_REGEX.test(data.segment)) {
+        if (EMOJI_REGEX.test(data.segment)) {
            if (text) {
                result.push(text);
                text = "";
--- a/src/components/views/rooms/SendMessageComposer.tsx
+++ b/src/components/views/rooms/SendMessageComposer.tsx
@ -15,7 +15,6 @@ limitations under the License.
 */

 import React, { createRef, KeyboardEvent, SyntheticEvent } from "react";
-import EMOJI_REGEX from "emojibase-regex";
 import {
    IContent,
    MatrixEvent,
@ -70,6 +69,7 @@ import { doMaybeLocalRoomAction } from "../../../utils/local-room";
 import { Caret } from "../../../editor/caret";
 import { IDiff } from "../../../editor/diff";
 import { getBlobSafeMimeType } from "../../../utils/blobs";
+import { EMOJI_REGEX } from "../../../HtmlUtils";

 /**
 * Build the mentions information based on the editor model (and any related events):
--- a/src/editor/parts.ts
+++ b/src/editor/parts.ts
@ -15,11 +15,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-import EMOJIBASE_REGEX from "emojibase-regex";
 import { MatrixClient, RoomMember, Room } from "matrix-js-sdk/src/matrix";

 import AutocompleteWrapperModel, { GetAutocompleterComponent, UpdateCallback, UpdateQuery } from "./autocomplete";
-import { unicodeToShortcode } from "../HtmlUtils";
+import { EMOJI_REGEX, unicodeToShortcode } from "../HtmlUtils";
 import * as Avatar from "../Avatar";
 import defaultDispatcher from "../dispatcher/dispatcher";
 import { Action } from "../dispatcher/actions";
@ -197,7 +196,7 @@ abstract class BasePart {

 abstract class PlainBasePart extends BasePart {
    protected acceptsInsertion(chr: string, offset: number, inputType: string): boolean {
-        if (chr === "\n" || EMOJIBASE_REGEX.test(chr)) {
+        if (chr === "\n" || EMOJI_REGEX.test(chr)) {
            return false;
        }
        // when not pasting or dropping text, reject characters that should start a pill candidate
@ -375,7 +374,7 @@ class NewlinePart extends BasePart implements IBasePart {

 export class EmojiPart extends BasePart implements IBasePart {
    protected acceptsInsertion(chr: string, offset: number): boolean {
-        return EMOJIBASE_REGEX.test(chr);
+        return EMOJI_REGEX.test(chr);
    }

    protected acceptsRemoval(position: number, chr: string): boolean {
@ -573,7 +572,7 @@ export class PartCreator {
            case "\n":
                return new NewlinePart();
            default:
-                if (EMOJIBASE_REGEX.test(getFirstGrapheme(input))) {
+                if (EMOJI_REGEX.test(getFirstGrapheme(input))) {
                    return new EmojiPart();
                }
                return new PlainPart();
@ -650,7 +649,7 @@ export class PartCreator {
        let plainText = "";

        for (const data of graphemeSegmenter.segment(text)) {
-            if (EMOJIBASE_REGEX.test(data.segment)) {
+            if (EMOJI_REGEX.test(data.segment)) {
                if (plainText) {
                    parts.push(this.plain(plainText));
                    plainText = "";