Switch from cheerio to DOMParser (#10929)

* Add tests around feature_latex_maths
* Switch from cheerio to DOMParser
* strict
* Iterate
2023-05-23 14:31:05 +01:00 · 2023-05-23 14:31:05 +01:00 · 72d1bd910a
commit 72d1bd910a
parent 151b0efe73
7 changed files with 98 additions and 99 deletions
--- a/src/HtmlUtils.tsx
+++ b/src/HtmlUtils.tsx
@@ -19,7 +19,6 @@ limitations under the License.

 import React, { LegacyRef, ReactElement, ReactNode } from "react";
 import sanitizeHtml from "sanitize-html";
-import { load as cheerio } from "cheerio";
 import classNames from "classnames";
 import EMOJIBASE_REGEX from "emojibase-regex";
 import { merge, split } from "lodash";
@@ -549,27 +548,19 @@ export function bodyToHtml(content: IContent, highlights: Optional<string[]>, op
            }

            safeBody = sanitizeHtml(formattedBody!, sanitizeParams);
-            const phtml = cheerio(safeBody, {
-                // @ts-ignore: The `_useHtmlParser2` internal option is the
-                // simplest way to both parse and render using `htmlparser2`.
-                _useHtmlParser2: true,
-                decodeEntities: false,
-            });
-            const isPlainText = phtml.html() === phtml.root().text();
+            const phtml = new DOMParser().parseFromString(safeBody, "text/html");
+            const isPlainText = phtml.body.innerHTML === phtml.body.textContent;
            isHtmlMessage = !isPlainText;

            if (isHtmlMessage && SettingsStore.getValue("feature_latex_maths")) {
-                // @ts-ignore - The types for `replaceWith` wrongly expect
-                // Cheerio instance to be returned.
-                phtml('div, span[data-mx-maths!=""]').replaceWith(function (i, e) {
-                    return katex.renderToString(decode(phtml(e).attr("data-mx-maths")), {
+                [...phtml.querySelectorAll<HTMLElement>("div, span[data-mx-maths]")].forEach((e) => {
+                    e.outerHTML = katex.renderToString(decode(e.getAttribute("data-mx-maths")), {
                        throwOnError: false,
-                        // @ts-ignore - `e` can be an Element, not just a Node
-                        displayMode: e.name == "div",
+                        displayMode: e.tagName == "DIV",
                        output: "htmlAndMathml",
                    });
                });
-                safeBody = phtml.html();
+                safeBody = phtml.body.innerHTML;
            }
        } else if (highlighter) {
            safeBody = highlighter.applyHighlights(escapeHtml(plainBody), safeHighlights!).join("");
--- a/src/editor/serialize.ts
+++ b/src/editor/serialize.ts
@@ -16,7 +16,6 @@ limitations under the License.
 */

 import { encode } from "html-entities";
-import { load as cheerio } from "cheerio";
 import escapeHtml from "escape-html";

 import Markdown from "../Markdown";
@@ -133,8 +132,7 @@ export function htmlSerializeFromMdIfNeeded(md: string, { forceHTML = false } =
            });
        });

-        // make sure div tags always start on a new line, otherwise it will confuse
-        // the markdown parser
+        // make sure div tags always start on a new line, otherwise it will confuse the markdown parser
        md = md.replace(/(.)<div/g, function (m, p1) {
            return `${p1}\n<div`;
        });
@@ -143,39 +141,29 @@ export function htmlSerializeFromMdIfNeeded(md: string, { forceHTML = false } =
    const parser = new Markdown(md);
    if (!parser.isPlainText() || forceHTML) {
        // feed Markdown output to HTML parser
-        const phtml = cheerio(parser.toHTML(), {
-            // @ts-ignore: The `_useHtmlParser2` internal option is the
-            // simplest way to both parse and render using `htmlparser2`.
-            _useHtmlParser2: true,
-            decodeEntities: false,
-        });
+        const phtml = new DOMParser().parseFromString(parser.toHTML(), "text/html");

        if (SettingsStore.getValue("feature_latex_maths")) {
            // original Markdown without LaTeX replacements
            const parserOrig = new Markdown(orig);
-            const phtmlOrig = cheerio(parserOrig.toHTML(), {
-                // @ts-ignore: The `_useHtmlParser2` internal option is the
-                // simplest way to both parse and render using `htmlparser2`.
-                _useHtmlParser2: true,
-                decodeEntities: false,
-            });
+            const phtmlOrig = new DOMParser().parseFromString(parserOrig.toHTML(), "text/html");

            // since maths delimiters are handled before Markdown,
            // code blocks could contain mangled content.
            // replace code blocks with original content
-            phtmlOrig("code").each(function (i) {
-                phtml("code").eq(i).text(phtmlOrig("code").eq(i).text());
+            [...phtmlOrig.getElementsByTagName("code")].forEach((e, i) => {
+                phtml.getElementsByTagName("code").item(i)!.textContent = e.textContent;
            });

            // add fallback output for latex math, which should not be interpreted as markdown
-            phtml("div, span").each(function (i, e) {
-                const tex = phtml(e).attr("data-mx-maths");
+            [...phtml.querySelectorAll("div, span")].forEach((e, i) => {
+                const tex = e.getAttribute("data-mx-maths");
                if (tex) {
-                    phtml(e).html(`<code>${tex}</code>`);
+                    e.innerHTML = `<code>${tex}</code>`;
                }
            });
        }
-        return phtml.html();
+        return phtml.body.innerHTML;
    }
    // ensure removal of escape backslashes in non-Markdown messages
    if (md.indexOf("\\") > -1) {