From fc90ed1c3ac557492aef95d3d87ebe8e486a1293 Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 2 Feb 2017 11:27:07 +0000 Subject: [PATCH 1/4] Update the comments in Markdown.js so they don't claim it;s a wrapper around marked when it's now commonmark, and comment why we render markdown to plaintext which is somewhat unintuitive. --- src/Markdown.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Markdown.js b/src/Markdown.js index 2f278183a3..c342e7cceb 100644 --- a/src/Markdown.js +++ b/src/Markdown.js @@ -17,7 +17,7 @@ limitations under the License. import commonmark from 'commonmark'; /** - * Class that wraps marked, adding the ability to see whether + * Class that wraps commonmark, adding the ability to see whether * a given message actually uses any markdown syntax or whether * it's plain text. */ @@ -33,11 +33,7 @@ export default class Markdown { // running the parser on the tokens with a dummy // rendered and seeing if any of the renderer's // functions are called other than those noted below. - // In case you were wondering, no we can't just examine - // the tokens because the tokens we have are only the - // output of the *first* tokenizer: any line-based - // markdown is processed by marked within Parser by - // the 'inline lexer'... + // TODO: can't we just examine the output of the parser? let is_plain = true; function setNotPlain() { @@ -85,6 +81,12 @@ export default class Markdown { return rendered; } + /* + * Render the mrkdown message to plain text. That is, essentially + * just remove any backslashes escaping what would otherwise be + * markdown syntax + * (to fix https://github.com/vector-im/riot-web/issues/2870) + */ toPlaintext() { const real_paragraph = this.renderer.paragraph; From 72e5b2235dbd4758f1c9162c39c34b21986604f7 Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 2 Feb 2017 11:34:39 +0000 Subject: [PATCH 2/4] Parse once and re-use the parsed output Rather than re-parsing the same output in each function --- src/Markdown.js | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Markdown.js b/src/Markdown.js index c342e7cceb..126e0583d1 100644 --- a/src/Markdown.js +++ b/src/Markdown.js @@ -24,7 +24,10 @@ import commonmark from 'commonmark'; export default class Markdown { constructor(input) { this.input = input; - this.parser = new commonmark.Parser(); + + const parser = new commonmark.Parser(); + this.parsed = parser.parse(this.input); + this.renderer = new commonmark.HtmlRenderer({safe: false}); } @@ -49,8 +52,7 @@ export default class Markdown { dummy_renderer.softbreak = function(t) { return t; }; dummy_renderer.paragraph = function(t) { return t; }; - const dummy_parser = new commonmark.Parser(); - dummy_renderer.render(dummy_parser.parse(this.input)); + dummy_renderer.render(this.parsed); return is_plain; } @@ -73,8 +75,7 @@ export default class Markdown { } }; - var parsed = this.parser.parse(this.input); - var rendered = this.renderer.render(parsed); + var rendered = this.renderer.render(this.parsed); this.renderer.paragraph = real_paragraph; @@ -116,8 +117,7 @@ export default class Markdown { } }; - var parsed = this.parser.parse(this.input); - var rendered = this.renderer.render(parsed); + var rendered = this.renderer.render(this.parsed); this.renderer.paragraph = real_paragraph; From 63e47d86776769934077fa86770dd53543eff803 Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 2 Feb 2017 11:45:21 +0000 Subject: [PATCH 3/4] Make ourselves a new rendered each time Rather than keeping one in memory, abusing it in different ways each time and then craefully putting it back the way it was (and in one case, failing, because we forgot to put the `out` method back). --- src/Markdown.js | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/Markdown.js b/src/Markdown.js index 126e0583d1..af103b7ad6 100644 --- a/src/Markdown.js +++ b/src/Markdown.js @@ -27,8 +27,6 @@ export default class Markdown { const parser = new commonmark.Parser(); this.parsed = parser.parse(this.input); - - this.renderer = new commonmark.HtmlRenderer({safe: false}); } isPlainText() { @@ -58,9 +56,10 @@ export default class Markdown { } toHTML() { - const real_paragraph = this.renderer.paragraph; + const renderer = new commonmark.HtmlRenderer({safe: false}); + const real_paragraph = renderer.paragraph; - this.renderer.paragraph = function(node, entering) { + renderer.paragraph = function(node, entering) { // If there is only one top level node, just return the // bare text: it's a single line of text and so should be // 'inline', rather than unnecessarily wrapped in its own @@ -75,11 +74,7 @@ export default class Markdown { } }; - var rendered = this.renderer.render(this.parsed); - - this.renderer.paragraph = real_paragraph; - - return rendered; + return renderer.render(this.parsed); } /* @@ -89,17 +84,18 @@ export default class Markdown { * (to fix https://github.com/vector-im/riot-web/issues/2870) */ toPlaintext() { - const real_paragraph = this.renderer.paragraph; + const renderer = new commonmark.HtmlRenderer({safe: false}); + const real_paragraph = renderer.paragraph; // The default `out` function only sends the input through an XML // escaping function, which causes messages to be entity encoded, // which we don't want in this case. - this.renderer.out = function(s) { + renderer.out = function(s) { // The `lit` function adds a string literal to the output buffer. this.lit(s); }; - this.renderer.paragraph = function(node, entering) { + renderer.paragraph = function(node, entering) { // If there is only one top level node, just return the // bare text: it's a single line of text and so should be // 'inline', rather than unnecessarily wrapped in its own @@ -117,10 +113,6 @@ export default class Markdown { } }; - var rendered = this.renderer.render(this.parsed); - - this.renderer.paragraph = real_paragraph; - - return rendered; + return renderer.render(this.parsed); } } From 853c89dfdcbf283db26a1f2d822c82289b56045d Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 2 Feb 2017 14:17:07 +0000 Subject: [PATCH 4/4] Fix spurious html tags like * Only render HTML tags in markdown if they're del tags * Consider non-allowed HTML tags as plain text nodes, so a message of just '' doesn't need to be sent as HTML * Consequently rewrite isPlaintext to just look at the parse tree rather than making and gutting a renderer to walk the tree (now we're using a library that actually produces a meaningfgul parse tree). * Tweak when we put \n on text output to avoid putting \n on the end of messages. Fixes https://github.com/vector-im/riot-web/issues/3065 --- src/Markdown.js | 117 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 39 deletions(-) diff --git a/src/Markdown.js b/src/Markdown.js index af103b7ad6..d6dc979a5a 100644 --- a/src/Markdown.js +++ b/src/Markdown.js @@ -15,6 +15,45 @@ limitations under the License. */ import commonmark from 'commonmark'; +import escape from 'lodash/escape'; + +const ALLOWED_HTML_TAGS = ['del']; + +// These types of node are definitely text +const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document']; + +function is_allowed_html_tag(node) { + // Regex won't work for tags with attrs, but we only + // allow anyway. + const matches = /^<\/?(.*)>$/.exec(node.literal); + if (matches && matches.length == 2) { + const tag = matches[1]; + return ALLOWED_HTML_TAGS.indexOf(tag) > -1; + } + return false; +} + +function html_if_tag_allowed(node) { + if (is_allowed_html_tag(node)) { + this.lit(node.literal); + return; + } else { + this.lit(escape(node.literal)); + } +} + +/* + * Returns true if the parse output containing the node + * comprises multiple block level elements (ie. lines), + * or false if it is only a single line. + */ +function is_multi_line(node) { + var par = node; + while (par.parent) { + par = par.parent; + } + return par.firstChild != par.lastChild; +} /** * Class that wraps commonmark, adding the ability to see whether @@ -30,29 +69,26 @@ export default class Markdown { } isPlainText() { - // we determine if the message requires markdown by - // running the parser on the tokens with a dummy - // rendered and seeing if any of the renderer's - // functions are called other than those noted below. - // TODO: can't we just examine the output of the parser? - let is_plain = true; + const walker = this.parsed.walker(); - function setNotPlain() { - is_plain = false; + let ev; + while ( (ev = walker.next()) ) { + const node = ev.node; + if (TEXT_NODES.indexOf(node.type) > -1) { + // definitely text + continue; + } else if (node.type == 'html_inline' || node.type == 'html_block') { + // if it's an allowed html tag, we need to render it and therefore + // we will need to use HTML. If it's not allowed, it's not HTML since + // we'll just be treating it as text. + if (is_allowed_html_tag(node)) { + return false; + } + } else { + return false; + } } - - const dummy_renderer = new commonmark.HtmlRenderer(); - for (const k of Object.keys(commonmark.HtmlRenderer.prototype)) { - dummy_renderer[k] = setNotPlain; - } - // text and paragraph are just text - dummy_renderer.text = function(t) { return t; }; - dummy_renderer.softbreak = function(t) { return t; }; - dummy_renderer.paragraph = function(t) { return t; }; - - dummy_renderer.render(this.parsed); - - return is_plain; + return true; } toHTML() { @@ -65,20 +101,27 @@ export default class Markdown { // 'inline', rather than unnecessarily wrapped in its own // p tag. If, however, we have multiple nodes, each gets // its own p tag to keep them as separate paragraphs. - var par = node; - while (par.parent) { - par = par.parent; - } - if (par.firstChild != par.lastChild) { + if (is_multi_line(node)) { real_paragraph.call(this, node, entering); } }; + renderer.html_inline = html_if_tag_allowed; + renderer.html_block = function(node) { + // as with `paragraph`, we only insert line breaks + // if there are multiple lines in the markdown. + const isMultiLine = is_multi_line(node); + + if (isMultiLine) this.cr(); + html_if_tag_allowed.call(this, node); + if (isMultiLine) this.cr(); + } + return renderer.render(this.parsed); } /* - * Render the mrkdown message to plain text. That is, essentially + * Render the markdown message to plain text. That is, essentially * just remove any backslashes escaping what would otherwise be * markdown syntax * (to fix https://github.com/vector-im/riot-web/issues/2870) @@ -96,22 +139,18 @@ export default class Markdown { }; renderer.paragraph = function(node, entering) { - // If there is only one top level node, just return the - // bare text: it's a single line of text and so should be - // 'inline', rather than unnecessarily wrapped in its own - // p tag. If, however, we have multiple nodes, each gets - // its own p tag to keep them as separate paragraphs. - var par = node; - while (par.parent) { - node = par; - par = par.parent; - } - if (node != par.lastChild) { - if (!entering) { + // as with toHTML, only append lines to paragraphs if there are + // multiple paragraphs + if (is_multi_line(node)) { + if (!entering && node.next) { this.lit('\n\n'); } } }; + renderer.html_block = function(node) { + this.lit(node.literal); + if (is_multi_line(node) && node.next) this.lit('\n\n'); + } return renderer.render(this.parsed); }