import type { Tokenizer } from '@markdoc/markdoc';
import { Parser } from 'htmlparser2';
// @ts-expect-error This type isn't exported
// biome-ignore lint/correctness/noUnusedImports: not correctly detected because type isn't exported
import type * as Token from 'markdown-it/lib/token';

/**
 * Rewrites a token stream so that raw HTML tokens become structured
 * `html-tag` `tag_open` / `tag_close` tokens.
 *
 * Every token whose type starts with `html` is fed to an incremental HTML
 * parser and is NOT copied to the output; the parser callbacks emit the
 * replacement tokens. Text found between HTML tags is buffered and run through
 * `tokenizer` when the next tag opens or closes (and once more at the end).
 * `inline` tokens have their `children` transformed recursively (the token is
 * mutated in place). All other tokens are passed through unchanged.
 *
 * @param tokenizer Tokenizer used to tokenize text found between HTML tags.
 * @param tokens Token stream to transform.
 * @returns A new token array; the input array itself is not modified.
 */
export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
  const output: Token[] = [];

  // Lazy buffer of text seen by the HTML parser; only tokenized when a tag
  // boundary (or the end of the stream) is reached.
  let textBuffer = '';
  let inCDATA = false;

  const appendText = (text: string) => {
    textBuffer += text;
  };

  // Tokenize the buffered text and append the resulting tokens to `output`.
  const processTextBuffer = () => {
    if (textBuffer.length === 0) {
      return;
    }

    // The text must go through the tokenizer because it may contain other
    // structural markup.
    const toks: Token[] = tokenizer.tokenize(textBuffer);
    const [first, second, third] = toks;

    // Plain text is treated like Markdown and comes back wrapped in a
    // paragraph (open / inline / close). That wrapper is unwanted here, so
    // only the inline children are emitted.
    const isLoneParagraph =
      toks.length === 3 &&
      first.type === 'paragraph_open' &&
      second.type === 'inline' &&
      third.type === 'paragraph_close' &&
      Array.isArray(second.children);

    if (isLoneParagraph) {
      for (const tok of second.children as Token[]) {
        // If a text token's trimmed content equals the trimmed buffer, restore
        // the raw buffer so leading/trailing whitespace lost during
        // tokenization is preserved.
        if (tok.type === 'text' && tok.content.trim() === textBuffer.trim()) {
          tok.content = textBuffer;
        }
        output.push(tok);
      }
    } else {
      // Any other tokenized markup is pushed as-is.
      for (const tok of toks) {
        output.push(tok);
      }
    }

    // Reset the lazy text buffer.
    textBuffer = '';
  };

  // Incremental HTML parser that reports tag open, tag close and text content.
  const parser = new Parser(
    {
      oncdatastart() {
        inCDATA = true;
      },

      oncdataend() {
        inCDATA = false;
      },

      onopentag(name, attrs) {
        // Text seen so far belongs before the tag that is opening now.
        processTextBuffer();

        // Emit an 'html-tag' open token carrying the tag name and attributes.
        output.push({
          type: 'tag_open',
          nesting: 1,
          meta: {
            tag: 'html-tag',
            attributes: [
              { type: 'attribute', name: 'name', value: name },
              { type: 'attribute', name: 'attrs', value: attrs },
            ],
          },
        } as Token);
      },

      ontext(content: string | null | undefined) {
        if (inCDATA) {
          // Text inside CDATA is ignored entirely.
          return;
        }

        // Only string content is accumulated.
        if (typeof content === 'string') {
          appendText(content);
        }
      },

      onclosetag(name) {
        // Text seen so far belongs inside the tag that is closing now.
        processTextBuffer();

        // Emit an 'html-tag' close token carrying the tag name.
        output.push({
          type: 'tag_close',
          nesting: -1,
          meta: {
            tag: 'html-tag',
            attributes: [{ type: 'attribute', name: 'name', value: name }],
          },
        } as Token);
      },
    },
    {
      decodeEntities: false,
      recognizeCDATA: true,
      recognizeSelfClosing: true,
    },
  );

  for (const token of tokens) {
    if (token.type.startsWith('html')) {
      // The parser callbacks push the replacement tokens into `output`.
      parser.write(token.content);

      // IMPORTANT: the original raw-HTML token is intentionally dropped; the
      // parser has already emitted its structured equivalent.
      continue;
    }

    // Transform HTML nested inside inline content.
    if (token.type === 'inline' && token.children) {
      token.children = htmlTokenTransform(tokenizer, token.children);
    }

    // NOTE(review): text buffered from an earlier HTML token is not flushed
    // before this push, so it is emitted after this token (at the next tag
    // event or the final flush) — confirm that ordering is intended.
    output.push(token);
  }

  // Flush whatever text is still buffered.
  processTextBuffer();

  // Un-wind this pattern at the current level:
  //
  // [
  //   { type: tag_open, meta.tag: html-tag },
  //   { type: paragraph_open },
  //   { type: inline, children [...] },
  //   { type: paragraph_close },
  //   { type: tag_close, meta.tag: html-tag }
  // ]
  //
  // The paragraph_open / inline / paragraph_close triplet is replaced by the
  // children of the inline token. This is extra, unwanted paragraph wrapping
  // introduced by markdown-it when processing with HTML enabled.
  mutateAndCollapseExtraParagraphsUnderHtml(output);

  return output;
}

/**
 * Repeatedly finds an html-tag open / paragraph / html-tag close run in
 * `tokens` and replaces the paragraph triplet with the inline token's
 * children. The surrounding html-tag open and close tokens are kept, so the
 * HTML element itself is not lost.
 *
 * @param tokens Token array to rewrite in place.
 */
function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
  let idx = findExtraParagraphUnderHtml(tokens);

  while (idx !== null) {
    const actualChildTokens: Token[] = tokens[idx + 2].children ?? [];

    // Replace only paragraph_open, inline and paragraph_close (idx+1..idx+3).
    tokens.splice(idx + 1, 3, ...actualChildTokens);

    // Rescan from the start: the spliced-in children may form a new match.
    idx = findExtraParagraphUnderHtml(tokens);
  }
}

/**
 * Finds the first position where the five-token "extra paragraph under HTML"
 * pattern starts.
 *
 * @param tokens Token array to search.
 * @returns Index of the html-tag open token of the first match, or `null`
 *   when there is no match.
 */
function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
  // Stop once fewer than five tokens remain; no match is possible after that.
  for (let i = 0; i + 5 <= tokens.length; i++) {
    if (isExtraParagraphPatternMatch(tokens, i)) {
      return i;
    }
  }

  return null;
}

/**
 * Checks whether five consecutive tokens are: html-tag open, paragraph open,
 * inline, paragraph close, html-tag close.
 *
 * @param slice Token array containing the five tokens to test.
 * @param offset Index in `slice` at which the five tokens start (default 0).
 * @returns `true` when all five tokens match the pattern.
 */
function isExtraParagraphPatternMatch(slice: Token[], offset = 0): boolean {
  return (
    isHtmlTagOpen(slice[offset]) &&
    isParagraphOpen(slice[offset + 1]) &&
    isInline(slice[offset + 2]) &&
    isParagraphClose(slice[offset + 3]) &&
    isHtmlTagClose(slice[offset + 4])
  );
}

/** True for a `tag_open` token whose `meta.tag` is `'html-tag'`. */
function isHtmlTagOpen(token: Token): boolean {
  return token.type === 'tag_open' && token.meta?.tag === 'html-tag';
}

/** True for a `tag_close` token whose `meta.tag` is `'html-tag'`. */
function isHtmlTagClose(token: Token): boolean {
  return token.type === 'tag_close' && token.meta?.tag === 'html-tag';
}

/** True for a `paragraph_open` token. */
function isParagraphOpen(token: Token): boolean {
  return token.type === 'paragraph_open';
}

/** True for a `paragraph_close` token. */
function isParagraphClose(token: Token): boolean {
  return token.type === 'paragraph_close';
}

/** True for an `inline` token. */
function isInline(token: Token): boolean {
  return token.type === 'inline';
}