Diffstat (limited to 'packages/integrations/markdoc/src/html/transform/html-token-transform.ts')
-rw-r--r-- | packages/integrations/markdoc/src/html/transform/html-token-transform.ts | 437
1 file changed, 214 insertions(+), 223 deletions(-)
diff --git a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
index 6b2838ac3..0d5dcfb81 100644
--- a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
+++ b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
@@ -1,256 +1,247 @@
-import type * as Token from 'markdown-it/lib/token';
-import { Parser } from 'htmlparser2';
 import { Tokenizer } from '@markdoc/markdoc';
-
+import { Parser } from 'htmlparser2';
+import type * as Token from 'markdown-it/lib/token';
 export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
-
-    const output: Token[] = [];
-
-    // hold a lazy buffer of text and process it only when necessary
-    let textBuffer = '';
-
-    let inCDATA = false;
-
-    const appendText = (text: string) => {
-        textBuffer += text;
-    };
-
-    // process the current text buffer w/ Markdoc's Tokenizer for tokens
-    const processTextBuffer = () => {
-
-        if (textBuffer.length > 0) {
-
-            // tokenize the text buffer to look for structural markup tokens
-            const toks = tokenizer.tokenize(textBuffer);
-
-            // when we tokenize some raw text content, it's basically treated like Markdown, and will result in a paragraph wrapper, which we don't want
-            // in this scenario, we just want to generate a text token, but, we have to tokenize it in case there's other structural markup
-            if (toks.length === 3) {
-
-                const first = toks[0];
-                const second = toks[1];
-                const third: Token | undefined = toks.at(2);
-
-                if (first.type === 'paragraph_open' && second.type === 'inline' && (third && third.type === 'paragraph_close') && Array.isArray(second.children)) {
-                    for (const tok of second.children as Token[]) {
-                        // if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
-                        // text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
-                        if (tok.type === 'text') {
-                            if (tok.content.trim() == textBuffer.trim()) {
-                                tok.content = textBuffer;
-                            }
-                        }
-                        output.push(tok);
-                    }
-                } else {
-                    // some other markup that happened to be 3 tokens, push tokens as-is
-                    for (const tok of toks) {
-                        output.push(tok);
-                    }
-                }
-            } else {
-                // some other tokenized markup, push tokens as-is
-                for (const tok of toks) {
-                    output.push(tok);
-                }
-            }
-
-            // reset the current lazy text buffer
-            textBuffer = '';
-        }
-    };
-
-    // create an incremental HTML parser that tracks HTML tag open, close and text content
-    const parser = new Parser({
-
-        oncdatastart() {
-            inCDATA = true;
-        },
-
-        oncdataend() {
-            inCDATA = false;
-        },
-
-        // when an HTML tag opens...
-        onopentag(name, attrs) {
-
-            // process any buffered text to be treated as text node before the currently opening HTML tag
-            processTextBuffer();
-
-            // push an 'html-tag' 'tag_open' Markdoc node instance for the currently opening HTML tag onto the resulting Token stack
-            output.push({
-                type: 'tag_open',
-                nesting: 1,
-                meta: {
-                    tag: 'html-tag',
-                    attributes: [
-                        { type: 'attribute', name: 'name', value: name },
-                        { type: 'attribute', name: 'attrs', value: attrs },
-                    ],
-                },
-            } as Token);
-
-        },
-
-        ontext(content: string | null | undefined) {
-
-            if (inCDATA) {
-                // ignore entirely while inside CDATA
-                return;
-            }
-
-            // only accumulate text into the buffer if we're not under an ignored HTML element
-            if (typeof content === 'string') {
-                appendText(content);
-            }
-        },
-
-        // when an HTML tag closes...
-        onclosetag(name) {
-
-            // process any buffered text to be treated as a text node inside the currently closing HTML tag
-            processTextBuffer();
-
-            // push an 'html-tag' 'tag_close' Markdoc node instance for the currently closing HTML tag onto the resulting Token stack
-            output.push({
-                type: 'tag_close',
-                nesting: -1,
-                meta: {
-                    tag: 'html-tag',
-                    attributes: [
-                        { type: 'attribute', name: 'name', value: name },
-                    ],
-                },
-            } as Token);
-
-        },
-
-    }, {
-        decodeEntities: false,
-        recognizeCDATA: true,
-        recognizeSelfClosing: true,
-    });
-
-    // for every detected token...
-    for (const token of tokens) {
-
-        // if it was an HTML token, write the HTML text into the HTML parser
-        if (token.type.startsWith('html')) {
-
-            // as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
-            parser.write(token.content);
-
-            // continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
-            continue;
-        }
-
-        // process any child content for HTML
-        if (token.type === 'inline') {
-            if (token.children) {
-                token.children = htmlTokenTransform(tokenizer, token.children);
-            }
-        }
-
-        // not an HTML Token, preserve it at the current stack location
-        output.push(token);
-    }
-
-    // process any remaining buffered text
-    processTextBuffer();
-
-    //
-    // post-process the current levels output Token[] array to un-wind this pattern:
-    //
-    // [
-    //   { type: tag_open, meta.tag: html-tag },
-    //   { type: paragraph_open },
-    //   { type: inline, children [...] },
-    //   { type: paragraph_close },
-    //   { type: tag_close, meta.tag: html-tag }
-    // ]
-    //
-    // the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
-    //
-    // this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
-    //
-
-    mutateAndCollapseExtraParagraphsUnderHtml(output);
-
-    return output;
+	const output: Token[] = [];
+
+	// hold a lazy buffer of text and process it only when necessary
+	let textBuffer = '';
+
+	let inCDATA = false;
+
+	const appendText = (text: string) => {
+		textBuffer += text;
+	};
+
+	// process the current text buffer w/ Markdoc's Tokenizer for tokens
+	const processTextBuffer = () => {
+		if (textBuffer.length > 0) {
+			// tokenize the text buffer to look for structural markup tokens
+			const toks = tokenizer.tokenize(textBuffer);
+
+			// when we tokenize some raw text content, it's basically treated like Markdown, and will result in a paragraph wrapper, which we don't want
+			// in this scenario, we just want to generate a text token, but, we have to tokenize it in case there's other structural markup
+			if (toks.length === 3) {
+				const first = toks[0];
+				const second = toks[1];
+				const third: Token | undefined = toks.at(2);
+
+				if (
+					first.type === 'paragraph_open' &&
+					second.type === 'inline' &&
+					third &&
+					third.type === 'paragraph_close' &&
+					Array.isArray(second.children)
+				) {
+					for (const tok of second.children as Token[]) {
+						// if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
+						// text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
+						if (tok.type === 'text') {
+							if (tok.content.trim() == textBuffer.trim()) {
+								tok.content = textBuffer;
+							}
+						}
+						output.push(tok);
+					}
+				} else {
+					// some other markup that happened to be 3 tokens, push tokens as-is
+					for (const tok of toks) {
+						output.push(tok);
+					}
+				}
+			} else {
+				// some other tokenized markup, push tokens as-is
+				for (const tok of toks) {
+					output.push(tok);
+				}
+			}
+
+			// reset the current lazy text buffer
+			textBuffer = '';
+		}
+	};
+
+	// create an incremental HTML parser that tracks HTML tag open, close and text content
+	const parser = new Parser(
+		{
+			oncdatastart() {
+				inCDATA = true;
+			},
+
+			oncdataend() {
+				inCDATA = false;
+			},
+
+			// when an HTML tag opens...
+			onopentag(name, attrs) {
+				// process any buffered text to be treated as text node before the currently opening HTML tag
+				processTextBuffer();
+
+				// push an 'html-tag' 'tag_open' Markdoc node instance for the currently opening HTML tag onto the resulting Token stack
+				output.push({
+					type: 'tag_open',
+					nesting: 1,
+					meta: {
+						tag: 'html-tag',
+						attributes: [
+							{ type: 'attribute', name: 'name', value: name },
+							{ type: 'attribute', name: 'attrs', value: attrs },
+						],
+					},
+				} as Token);
+			},
+
+			ontext(content: string | null | undefined) {
+				if (inCDATA) {
+					// ignore entirely while inside CDATA
+					return;
+				}
+
+				// only accumulate text into the buffer if we're not under an ignored HTML element
+				if (typeof content === 'string') {
+					appendText(content);
+				}
+			},
+
+			// when an HTML tag closes...
+			onclosetag(name) {
+				// process any buffered text to be treated as a text node inside the currently closing HTML tag
+				processTextBuffer();
+
+				// push an 'html-tag' 'tag_close' Markdoc node instance for the currently closing HTML tag onto the resulting Token stack
+				output.push({
+					type: 'tag_close',
+					nesting: -1,
+					meta: {
+						tag: 'html-tag',
+						attributes: [{ type: 'attribute', name: 'name', value: name }],
+					},
+				} as Token);
+			},
+		},
+		{
+			decodeEntities: false,
+			recognizeCDATA: true,
+			recognizeSelfClosing: true,
+		}
+	);
+
+	// for every detected token...
+	for (const token of tokens) {
+		// if it was an HTML token, write the HTML text into the HTML parser
+		if (token.type.startsWith('html')) {
+			// as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
+			parser.write(token.content);
+
+			// continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
+			continue;
+		}
+
+		// process any child content for HTML
+		if (token.type === 'inline') {
+			if (token.children) {
+				token.children = htmlTokenTransform(tokenizer, token.children);
+			}
+		}
+
+		// not an HTML Token, preserve it at the current stack location
+		output.push(token);
+	}
+
+	// process any remaining buffered text
+	processTextBuffer();
+
+	//
+	// post-process the current levels output Token[] array to un-wind this pattern:
+	//
+	// [
+	//   { type: tag_open, meta.tag: html-tag },
+	//   { type: paragraph_open },
+	//   { type: inline, children [...] },
+	//   { type: paragraph_close },
+	//   { type: tag_close, meta.tag: html-tag }
+	// ]
+	//
+	// the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
+	//
+	// this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
+	//
+
+	mutateAndCollapseExtraParagraphsUnderHtml(output);
+
+	return output;
 }
 
 function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
-    let done = false;
+	let done = false;
 
-    while (!done) {
-        const idx = findExtraParagraphUnderHtml(tokens);
-        if (typeof idx === 'number') {
-            // mutate
+	while (!done) {
+		const idx = findExtraParagraphUnderHtml(tokens);
+		if (typeof idx === 'number') {
+			// mutate
 
-            const actualChildTokens = tokens[idx + 2].children ?? [];
+			const actualChildTokens = tokens[idx + 2].children ?? [];
 
-            tokens.splice(idx, 5, ...actualChildTokens);
-        } else {
-            done = true;
-        }
-    }
+			tokens.splice(idx, 5, ...actualChildTokens);
+		} else {
+			done = true;
+		}
+	}
 }
-
 /**
- *
- * @param token
- * @returns
+ *
+ * @param token
+ * @returns
  */
 function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
-
-    if (tokens.length < 5) {
-        return null;
-    }
-
-    for (let i = 0; i < tokens.length; i++) {
-        const last = i + 4;
-        if (last > tokens.length - 1) {
-            break; // early exit, no more possible 5-long slices to search
-        }
-
-        const slice = tokens.slice(i, last + 1);
-        const isMatch = isExtraParagraphPatternMatch(slice);
-        if (isMatch) {
-            return i;
-        }
-    }
-
-    return null;
+	if (tokens.length < 5) {
+		return null;
+	}
+
+	for (let i = 0; i < tokens.length; i++) {
+		const last = i + 4;
+		if (last > tokens.length - 1) {
+			break; // early exit, no more possible 5-long slices to search
+		}
+
+		const slice = tokens.slice(i, last + 1);
+		const isMatch = isExtraParagraphPatternMatch(slice);
+		if (isMatch) {
+			return i;
+		}
+	}
+
+	return null;
 }
 
 function isExtraParagraphPatternMatch(slice: Token[]): boolean {
-    const match = isHtmlTagOpen(slice[0])
-        && isParagraphOpen(slice[1])
-        && isInline(slice[2])
-        && isParagraphClose(slice[3])
-        && isHtmlTagClose(slice[4]);
-    return match;
+	const match =
+		isHtmlTagOpen(slice[0]) &&
+		isParagraphOpen(slice[1]) &&
+		isInline(slice[2]) &&
+		isParagraphClose(slice[3]) &&
+		isHtmlTagClose(slice[4]);
+	return match;
 }
-
 function isHtmlTagOpen(token: Token): boolean {
-    return token.type === 'tag_open' && token.meta && token.meta.tag === 'html-tag';
+	return token.type === 'tag_open' && token.meta && token.meta.tag === 'html-tag';
 }
 
 function isHtmlTagClose(token: Token): boolean {
-    return token.type === 'tag_close' && token.meta && token.meta.tag === 'html-tag';
+	return token.type === 'tag_close' && token.meta && token.meta.tag === 'html-tag';
 }
 
 function isParagraphOpen(token: Token): boolean {
-    return token.type === 'paragraph_open';
+	return token.type === 'paragraph_open';
 }
 
 function isParagraphClose(token: Token): boolean {
-    return token.type === 'paragraph_close';
+	return token.type === 'paragraph_close';
}
 
 function isInline(token: Token): boolean {
-    return token.type === 'inline';
+	return token.type === 'inline';
 }
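
For context, `htmlTokenTransform` operates on the token stream produced by Markdoc's markdown-it-backed `Tokenizer`. Below is a minimal sketch of how it might be driven; the `html: true` tokenizer option and the relative import path are illustrative assumptions, not taken from this diff.

    import { Tokenizer } from '@markdoc/markdoc';
    import { htmlTokenTransform } from './html-token-transform.js'; // hypothetical path

    // Assumption: enabling `html` makes the underlying markdown-it parser emit
    // 'html_block' / 'html_inline' tokens for raw HTML instead of escaping it.
    const tokenizer = new Tokenizer({ html: true });

    // Raw HTML with Markdown emphasis nested inside it.
    const tokens = tokenizer.tokenize('<div class="note">some *emphasized* text</div>');

    // The raw 'html_*' tokens are replaced by paired 'html-tag' tag_open/tag_close
    // tokens, and the text between the tags is re-tokenized so the emphasis still
    // becomes structural tokens rather than literal '*' characters.
    const transformed = htmlTokenTransform(tokenizer, tokens);

    // Expected shape, roughly: tag_open (div), text, em_open, text, em_close,
    // text, tag_close (div) — with no paragraph_open/paragraph_close wrapper,
    // courtesy of mutateAndCollapseExtraParagraphsUnderHtml.
    console.log(transformed.map((t) => t.type));

Passing the same `tokenizer` back into the transform is deliberate: buffered text found between HTML tags is run through `tokenizer.tokenize` again, which is what lets Markdown and Markdoc syntax keep working inside raw HTML islands.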