Fix links being parsed as markdown links improperly (#7200)
* Fix links being parsed as markdown links improperly
  Fixes #4674
* Fix a typo
* Fix overriding too much stuff
* Fix parsing
* Remove useless console.log
* Remove unnecessary emph function
* Properly fix tests
* Add some better docs
* Add missing license header
This commit is contained in:
parent
8fe582b094
commit
e3187ed15c
3 changed files with 268 additions and 4 deletions
128
src/Markdown.ts
128
src/Markdown.ts
|
@ -17,6 +17,8 @@ limitations under the License.
|
|||
|
||||
import * as commonmark from 'commonmark';
|
||||
import { escape } from "lodash";
|
||||
import { logger } from 'matrix-js-sdk/src/logger';
|
||||
import * as linkify from 'linkifyjs';
|
||||
|
||||
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
|
||||
|
||||
|
@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
|
|||
link: (node: commonmark.Node, entering: boolean) => void;
|
||||
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
||||
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
||||
text: (node: commonmark.Node) => void;
|
||||
out: (text: string) => void;
|
||||
emph: (node: commonmark.Node) => void;
|
||||
}
|
||||
|
||||
function isAllowedHtmlTag(node: commonmark.Node): boolean {
|
||||
|
@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
|
|||
return par.firstChild != par.lastChild;
|
||||
}
|
||||
|
||||
/**
 * Collects the text that follows `node` on the same line, up to (but not
 * including) the first space character.
 *
 * Starting at `node` itself, the sibling chain is followed through `next`.
 * Only nodes of type `text` with a non-empty literal contribute; every other
 * node type is stepped over without adding anything. Collection stops at the
 * end of the sibling chain, at a `softbreak` / `linebreak` node, or as soon as
 * a text literal contains a space.
 *
 * @param node the node to start from (it is inspected as well)
 * @returns the concatenated text found before the first space or line break
 */
function getTextUntilEndOrLinebreak(node: commonmark.Node): string {
    let text = '';
    for (let current: commonmark.Node | null = node; current !== null; current = current.next) {
        const { literal, type } = current;
        if (type === 'softbreak' || type === 'linebreak') {
            break;
        }
        if (type !== 'text' || !literal) {
            // Non-text nodes (and empty text nodes) add nothing; keep walking.
            continue;
        }
        const spaceIndex = literal.indexOf(' ');
        if (spaceIndex !== -1) {
            // A space ends the word we are collecting: keep only what precedes it.
            return text + literal.slice(0, spaceIndex);
        }
        text += literal;
    }
    return text;
}
|
||||
|
||||
/**
|
||||
* Class that wraps commonmark, adding the ability to see whether
|
||||
* a given message actually uses any markdown syntax or whether
|
||||
|
@ -70,11 +102,103 @@ export default class Markdown {
|
|||
private input: string;
|
||||
private parsed: commonmark.Node;
|
||||
|
||||
// Stores the raw input, parses it with a fresh commonmark Parser and then
// passes the resulting AST through repairLinks().
// NOTE(review): two signature lines appear below; presumably the untyped one
// is the removed side of the diff and the typed one replaces it - confirm.
constructor(input) {
|
||||
constructor(input: string) {
|
||||
this.input = input;
|
||||
|
||||
const parser = new commonmark.Parser();
|
||||
this.parsed = parser.parse(this.input);
|
||||
// Overwrite the parsed tree with the value returned by repairLinks().
|
||||
this.parsed = this.repairLinks(this.parsed);
|
||||
}
|
||||
|
||||
/**
 * Rewrites the parsed AST so that a link containing underscores stays one
 * piece of text instead of being partly turned into emphasis.
 *
 * For example, without this pass a link such as
 *     https://my_weird-link_domain.domain.com
 * would be parsed to something like
 *     <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
 *
 * While walking a paragraph, the text seen on the current line is accumulated.
 * When an `emph` node is entered right after a text node, every link found in
 * the accumulated text is re-joined with `_<emphasised text>_` and the text
 * that follows the emphasis. If that combined string is recognised as exactly
 * one link, the emphasis is replaced by a plain text node holding the
 * underscored text, so the link is not emphasised until it actually ends.
 *
 * See: https://github.com/vector-im/element-web/issues/4674
 *
 * @param parsed root node of the commonmark AST; it is modified in place
 * @returns the same `parsed` node
 */
private repairLinks(parsed: commonmark.Node): commonmark.Node {
    const walker = parsed.walker();
    let event: commonmark.NodeWalkingStep | null = null;
    // Text accumulated on the current line of the current paragraph.
    let text = '';
    let isInPara = false;
    let previousNode: commonmark.Node | null = null;
    // Set when an emph node was emptied on entry and must be dropped on exit.
    let shouldUnlinkEmphasisNode = false;

    while ((event = walker.next())) {
        const { node, entering } = event;

        if (node.type === 'paragraph') {
            // True between the paragraph's enter and exit events.
            isInPara = entering;
        }

        if (isInPara) {
            // Clear saved string when line ends
            if (
                node.type === 'softbreak' ||
                node.type === 'linebreak' ||
                // Also start calculating the text from the beginning on any spaces
                (node.type === 'text' && node.literal === ' ')
            ) {
                text = '';
            }
            if (node.type === 'text') {
                text += node.literal;
            }

            // We should not do this if previous node was not a textnode, as we can't combine it then.
            if (node.type === 'emph' && previousNode !== null && previousNode.type === 'text') {
                if (entering) {
                    const foundLinks = linkify.find(text);
                    for (const { value } of foundLinks) {
                        const firstChild = node.firstChild;
                        // Once the emphasis has been rewritten its literal is '',
                        // so later links in the same text are skipped here.
                        if (firstChild !== null && firstChild.literal) {
                            /**
                             * NOTE: This technically should unlink the emph node and create LINK nodes instead,
                             * adding all the next elements as siblings, but this solution seems to work well
                             * and is hopefully slightly easier to understand too
                             */
                            // NOTE(review): the delimiter is always restored as `_`; emphasis that was
                            // written with `*` would presumably come back with underscores - confirm.
                            const nonEmphasizedText = `_${firstChild.literal}_`;
                            const f = getTextUntilEndOrLinebreak(node);
                            const newText = value + nonEmphasizedText + f;
                            const newLinks = linkify.find(newText);
                            // Should always find only one link here, if it finds more it means that the algorithm is broken
                            if (newLinks.length === 1) {
                                const emphasisTextNode = new commonmark.Node('text');
                                emphasisTextNode.literal = nonEmphasizedText;
                                previousNode.insertAfter(emphasisTextNode);
                                firstChild.literal = '';
                                // Remove `em` opening and closing nodes: the emptied emph node is
                                // detached and re-attached right after previousNode, and is dropped
                                // for good when its exit event is visited (see the else branch).
                                node.unlink();
                                previousNode.insertAfter(node);
                                shouldUnlinkEmphasisNode = true;
                            } else {
                                logger.error(
                                    "Markdown links escaping found too many links for following text: ",
                                    text,
                                );
                                logger.error(
                                    "Markdown links escaping found too many links for modified text: ",
                                    newText,
                                );
                            }
                        }
                    }
                } else if (shouldUnlinkEmphasisNode) {
                    // Exit event of an emph node that was emptied above.
                    node.unlink();
                    shouldUnlinkEmphasisNode = false;
                }
            }
        }

        previousNode = node;
    }

    return parsed;
}
|
||||
|
||||
isPlainText(): boolean {
|
||||
|
@ -120,9 +244,7 @@ export default class Markdown {
|
|||
// you can nest them.
|
||||
//
|
||||
// Let's try sending with <p/>s anyway for now, though.
|
||||
|
||||
const realParagraph = renderer.paragraph;
|
||||
|
||||
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
|
||||
// If there is only one top level node, just return the
|
||||
// bare text: it's a single line of text and so should be
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue