Fix links being parsed as markdown links improperly (#7200)

* Fix links being parsed as markdown links improperly

Fixes #4674

* Fix a typo

* Fix overriding too much stuff

* Fix parsing

* Remove useless console.log

* Remove unnecessary emph function

* Properly fix tests

* Add some better docs

* Add missing license header
This commit is contained in:
Dariusz Niemczyk 2021-11-30 19:09:05 +01:00 committed by GitHub
parent 8fe582b094
commit e3187ed15c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 268 additions and 4 deletions

View file

@ -17,6 +17,8 @@ limitations under the License.
import * as commonmark from 'commonmark';
import { escape } from "lodash";
import { logger } from 'matrix-js-sdk/src/logger';
import * as linkify from 'linkifyjs';
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
link: (node: commonmark.Node, entering: boolean) => void;
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
text: (node: commonmark.Node) => void;
out: (text: string) => void;
emph: (node: commonmark.Node) => void;
}
function isAllowedHtmlTag(node: commonmark.Node): boolean {
@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
return par.firstChild != par.lastChild;
}
/**
 * Collects the text that follows `node` on the same line, stopping at the
 * first space character or at the first soft/hard line break.
 *
 * Starting at `node` and following `next` siblings, every `text` node's
 * literal is appended to the result. Nodes of any other type (including the
 * starting node itself when it is not a text node) contribute nothing and are
 * skipped over. Children of the visited nodes are not inspected.
 *
 * @param node - the node to start from; `null` yields an empty string
 * @returns the concatenated text up to, but not including, the first space
 */
function getTextUntilEndOrLinebreak(node: commonmark.Node): string {
    let text = '';
    for (let current = node; current !== null; current = current.next) {
        const { literal, type } = current;
        if (type === 'softbreak' || type === 'linebreak') {
            // End of the visual line: nothing past this point belongs to the link.
            break;
        }
        if (type !== 'text' || !literal) {
            continue;
        }
        const spaceIndex = literal.indexOf(' ');
        if (spaceIndex === -1) {
            // No space in this node, so the run of text may continue in the next sibling.
            text += literal;
            continue;
        }
        // A space terminates the run; keep only what precedes it.
        text += literal.slice(0, spaceIndex);
        break;
    }
    return text;
}
/**
* Class that wraps commonmark, adding the ability to see whether
* a given message actually uses any markdown syntax or whether
@ -70,11 +102,103 @@ export default class Markdown {
private input: string;
private parsed: commonmark.Node;
constructor(input) {
/**
 * Parses `input` with commonmark and stores the resulting AST after passing
 * it through `repairLinks`.
 *
 * @param input - the raw message text to parse
 */
constructor(input: string) {
this.input = input;
// Parse once, then hand the tree to repairLinks and keep whatever it returns.
const ast = new commonmark.Parser().parse(this.input);
this.parsed = this.repairLinks(ast);
}
/**
 * Modifies the parsed AST so that a link containing underscores is not
 * broken apart by emphasis, as happens with a link like the example below:
 *     https://my_weird-link_domain.domain.com
 * Without this step that link would be rendered as something like:
 *     <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
 * When an `emph` node directly follows a text node, and the text gathered so
 * far on the current line contains a link, the emphasised text is put back
 * into the tree as a plain `_..._` text node, provided the recombined text
 * is detected by linkify as exactly one link.
 * See: https://github.com/vector-im/element-web/issues/4674
 *
 * The tree is modified in place while it is being walked.
 *
 * @param parsed - root node of the parsed document
 * @returns the same `parsed` node that was passed in
 */
private repairLinks(parsed: commonmark.Node) {
const walker = parsed.walker();
let event: commonmark.NodeWalkingStep = null;
// Text collected from text nodes since the last reset (line break or lone-space text node).
let text = '';
// True between the entering and leaving events of a paragraph node.
let isInPara = false;
// Node seen on the previous walker step; null only before the first step.
let previousNode: commonmark.Node | null = null;
// Set when an emph node was rewritten on entering, so its leaving event unlinks it.
let shouldUnlinkEmphasisNode = false;
while ((event = walker.next())) {
const { node } = event;
if (node.type === 'paragraph') {
if (event.entering) {
isInPara = true;
} else {
isInPara = false;
}
}
if (isInPara) {
// Clear saved string when line ends
if (
node.type === 'softbreak' ||
node.type === 'linebreak' ||
// Also start calculating the text from the beginning on any spaces
// (only matches a text node whose literal is exactly one space)
(node.type === 'text' && node.literal === ' ')
) {
text = '';
}
if (node.type === 'text') {
text += node.literal;
}
// We should not do this if previous node was not a textnode, as we can't combine it then.
// NOTE(review): previousNode is null on the first iteration; this relies on the first
// walked node never being an 'emph' inside a paragraph — confirm.
if (node.type === 'emph' && previousNode.type === 'text') {
if (event.entering) {
// Look for links in the text collected on this line before the emphasis started.
const foundLinks = linkify.find(text);
for (const { value } of foundLinks) {
// Skipped when the first child has no literal; this includes later iterations of
// this loop once the literal has been blanked below.
if (node.firstChild.literal) {
/**
 * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
 * but this solution seems to work well and is hopefully slightly easier to understand too
 */
// Re-create the text as typed by wrapping the emphasised literal in underscores.
// NOTE(review): always uses `_`, even if the emphasis was written with `*` — confirm intended.
const nonEmphasizedText = `_${node.firstChild.literal}_`;
// Text that follows the emph node up to the next space or line break.
const f = getTextUntilEndOrLinebreak(node);
const newText = value + nonEmphasizedText + f;
const newLinks = linkify.find(newText);
// Should always find only one link here, if it finds more it means that the algorithm is broken
if (newLinks.length === 1) {
// Insert the `_..._` text as a plain text node right after the preceding text node.
const emphasisTextNode = new commonmark.Node('text');
emphasisTextNode.literal = nonEmphasizedText;
previousNode.insertAfter(emphasisTextNode);
// Blank the original emphasised text so it is not output a second time.
node.firstChild.literal = '';
// Overwrites the loop's `event` with the first step of a fresh walker rooted at the emph node.
// NOTE(review): presumably that step's node is the emph node itself — confirm against commonmark's walker.
event = node.walker().next();
// Remove `em` opening and closing nodes
node.unlink();
previousNode.insertAfter(event.node);
shouldUnlinkEmphasisNode = true;
} else {
logger.error(
"Markdown links escaping found too many links for following text: ",
text,
);
logger.error(
"Markdown links escaping found too many links for modified text: ",
newText,
);
}
}
}
} else {
// Leaving event of an emph node: drop it if it was rewritten on entering.
if (shouldUnlinkEmphasisNode) {
node.unlink();
shouldUnlinkEmphasisNode = false;
}
}
}
}
previousNode = node;
}
return parsed;
}
isPlainText(): boolean {
@ -120,9 +244,7 @@ export default class Markdown {
// you can nest them.
//
// Let's try sending with <p/>s anyway for now, though.
const realParagraph = renderer.paragraph;
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
// If there is only one top level node, just return the
// bare text: it's a single line of text and so should be