Diffstat (limited to 'packages/integrations/markdoc/src/html/transform/html-token-transform.ts')
-rw-r--r--  packages/integrations/markdoc/src/html/transform/html-token-transform.ts  437
1 file changed, 214 insertions(+), 223 deletions(-)
diff --git a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
index 6b2838ac3..0d5dcfb81 100644
--- a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
+++ b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
@@ -1,256 +1,247 @@
-import type * as Token from 'markdown-it/lib/token';
-import { Parser } from 'htmlparser2';
import { Tokenizer } from '@markdoc/markdoc';
-
+import { Parser } from 'htmlparser2';
+import type * as Token from 'markdown-it/lib/token';
export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
-
- const output: Token[] = [];
-
- // hold a lazy buffer of text and process it only when necessary
- let textBuffer = '';
-
- let inCDATA = false;
-
- const appendText = (text: string) => {
- textBuffer += text;
- };
-
- // process the current text buffer w/ Markdoc's Tokenizer for tokens
- const processTextBuffer = () => {
-
- if (textBuffer.length > 0) {
-
- // tokenize the text buffer to look for structural markup tokens
- const toks = tokenizer.tokenize(textBuffer);
-
- // when we tokenize raw text content, it's treated like Markdown and ends up in a paragraph wrapper, which we don't want
- // in this scenario, we just want to generate a text token, but we still have to tokenize it in case there's other structural markup
- if (toks.length === 3) {
-
- const first = toks[0];
- const second = toks[1];
- const third: Token | undefined = toks.at(2);
-
- if (first.type === 'paragraph_open' && second.type === 'inline' && (third && third.type === 'paragraph_close') && Array.isArray(second.children)) {
- for (const tok of second.children as Token[]) {
- // if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
- // text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
- if (tok.type === 'text') {
- if (tok.content.trim() == textBuffer.trim()) {
- tok.content = textBuffer;
- }
- }
- output.push(tok);
- }
- } else {
- // some other markup that happened to be 3 tokens, push tokens as-is
- for (const tok of toks) {
- output.push(tok);
- }
- }
- } else {
- // some other tokenized markup, push tokens as-is
- for (const tok of toks) {
- output.push(tok);
- }
- }
-
- // reset the current lazy text buffer
- textBuffer = '';
- }
- };
-
- // create an incremental HTML parser that tracks HTML tag open, close and text content
- const parser = new Parser({
-
- oncdatastart() {
- inCDATA = true;
- },
-
- oncdataend() {
- inCDATA = false;
- },
-
- // when an HTML tag opens...
- onopentag(name, attrs) {
-
- // process any buffered text to be treated as text node before the currently opening HTML tag
- processTextBuffer();
-
- // push an 'html-tag' 'tag_open' Markdoc node instance for the currently opening HTML tag onto the resulting Token stack
- output.push({
- type: 'tag_open',
- nesting: 1,
- meta: {
- tag: 'html-tag',
- attributes: [
- { type: 'attribute', name: 'name', value: name },
- { type: 'attribute', name: 'attrs', value: attrs },
- ],
- },
- } as Token);
-
- },
-
- ontext(content: string | null | undefined) {
-
- if (inCDATA) {
- // ignore entirely while inside CDATA
- return;
- }
-
- // only accumulate text into the buffer if we're not under an ignored HTML element
- if (typeof content === 'string') {
- appendText(content);
- }
- },
-
- // when an HTML tag closes...
- onclosetag(name) {
-
- // process any buffered text to be treated as a text node inside the currently closing HTML tag
- processTextBuffer();
-
- // push an 'html-tag' 'tag_close' Markdoc node instance for the currently closing HTML tag onto the resulting Token stack
- output.push({
- type: 'tag_close',
- nesting: -1,
- meta: {
- tag: 'html-tag',
- attributes: [
- { type: 'attribute', name: 'name', value: name },
- ],
- },
- } as Token);
-
- },
-
- }, {
- decodeEntities: false,
- recognizeCDATA: true,
- recognizeSelfClosing: true,
- });
-
- // for every detected token...
- for (const token of tokens) {
-
- // if it was an HTML token, write the HTML text into the HTML parser
- if (token.type.startsWith('html')) {
-
- // as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
- parser.write(token.content);
-
- // continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
- continue;
- }
-
- // process any child content for HTML
- if (token.type === 'inline') {
- if (token.children) {
- token.children = htmlTokenTransform(tokenizer, token.children);
- }
- }
-
- // not an HTML Token, preserve it at the current stack location
- output.push(token);
- }
-
- // process any remaining buffered text
- processTextBuffer();
-
- //
- // post-process the current level's output Token[] array to unwind this pattern:
- //
- // [
- // { type: tag_open, meta.tag: html-tag },
- // { type: paragraph_open },
- // { type: inline, children [...] },
- // { type: paragraph_close },
- // { type: tag_close, meta.tag: html-tag }
- // ]
- //
- // the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
- //
- // this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
- //
-
- mutateAndCollapseExtraParagraphsUnderHtml(output);
-
- return output;
+ const output: Token[] = [];
+
+ // hold a lazy buffer of text and process it only when necessary
+ let textBuffer = '';
+
+ let inCDATA = false;
+
+ const appendText = (text: string) => {
+ textBuffer += text;
+ };
+
+ // process the current text buffer w/ Markdoc's Tokenizer for tokens
+ const processTextBuffer = () => {
+ if (textBuffer.length > 0) {
+ // tokenize the text buffer to look for structural markup tokens
+ const toks = tokenizer.tokenize(textBuffer);
+
+ // when we tokenize raw text content, it's treated like Markdown and ends up in a paragraph wrapper, which we don't want
+ // in this scenario, we just want to generate a text token, but we still have to tokenize it in case there's other structural markup
+ if (toks.length === 3) {
+ const first = toks[0];
+ const second = toks[1];
+ const third: Token | undefined = toks.at(2);
+
+ if (
+ first.type === 'paragraph_open' &&
+ second.type === 'inline' &&
+ third &&
+ third.type === 'paragraph_close' &&
+ Array.isArray(second.children)
+ ) {
+ for (const tok of second.children as Token[]) {
+ // if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
+ // text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
+ if (tok.type === 'text') {
+ if (tok.content.trim() == textBuffer.trim()) {
+ tok.content = textBuffer;
+ }
+ }
+ output.push(tok);
+ }
+ } else {
+ // some other markup that happened to be 3 tokens, push tokens as-is
+ for (const tok of toks) {
+ output.push(tok);
+ }
+ }
+ } else {
+ // some other tokenized markup, push tokens as-is
+ for (const tok of toks) {
+ output.push(tok);
+ }
+ }
+
+ // reset the current lazy text buffer
+ textBuffer = '';
+ }
+ };
+
+ // create an incremental HTML parser that tracks HTML tag open, close and text content
+ const parser = new Parser(
+ {
+ oncdatastart() {
+ inCDATA = true;
+ },
+
+ oncdataend() {
+ inCDATA = false;
+ },
+
+ // when an HTML tag opens...
+ onopentag(name, attrs) {
+ // process any buffered text to be treated as text node before the currently opening HTML tag
+ processTextBuffer();
+
+ // push an 'html-tag' 'tag_open' Markdoc node instance for the currently opening HTML tag onto the resulting Token stack
+ output.push({
+ type: 'tag_open',
+ nesting: 1,
+ meta: {
+ tag: 'html-tag',
+ attributes: [
+ { type: 'attribute', name: 'name', value: name },
+ { type: 'attribute', name: 'attrs', value: attrs },
+ ],
+ },
+ } as Token);
+ },
+
+ ontext(content: string | null | undefined) {
+ if (inCDATA) {
+ // ignore entirely while inside CDATA
+ return;
+ }
+
+ // only accumulate text into the buffer if we're not under an ignored HTML element
+ if (typeof content === 'string') {
+ appendText(content);
+ }
+ },
+
+ // when an HTML tag closes...
+ onclosetag(name) {
+ // process any buffered text to be treated as a text node inside the currently closing HTML tag
+ processTextBuffer();
+
+ // push an 'html-tag' 'tag_close' Markdoc node instance for the currently closing HTML tag onto the resulting Token stack
+ output.push({
+ type: 'tag_close',
+ nesting: -1,
+ meta: {
+ tag: 'html-tag',
+ attributes: [{ type: 'attribute', name: 'name', value: name }],
+ },
+ } as Token);
+ },
+ },
+ {
+ decodeEntities: false,
+ recognizeCDATA: true,
+ recognizeSelfClosing: true,
+ }
+ );
+
+ // for every detected token...
+ for (const token of tokens) {
+ // if it was an HTML token, write the HTML text into the HTML parser
+ if (token.type.startsWith('html')) {
+ // as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
+ parser.write(token.content);
+
+ // continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
+ continue;
+ }
+
+ // process any child content for HTML
+ if (token.type === 'inline') {
+ if (token.children) {
+ token.children = htmlTokenTransform(tokenizer, token.children);
+ }
+ }
+
+ // not an HTML Token, preserve it at the current stack location
+ output.push(token);
+ }
+
+ // process any remaining buffered text
+ processTextBuffer();
+
+ //
+ // post-process the current level's output Token[] array to unwind this pattern:
+ //
+ // [
+ // { type: tag_open, meta.tag: html-tag },
+ // { type: paragraph_open },
+ // { type: inline, children [...] },
+ // { type: paragraph_close },
+ // { type: tag_close, meta.tag: html-tag }
+ // ]
+ //
+ // the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
+ //
+ // this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
+ //
+
+ mutateAndCollapseExtraParagraphsUnderHtml(output);
+
+ return output;
}
function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
- let done = false;
+ let done = false;
- while (!done) {
- const idx = findExtraParagraphUnderHtml(tokens);
- if (typeof idx === 'number') {
- // mutate
+ while (!done) {
+ const idx = findExtraParagraphUnderHtml(tokens);
+ if (typeof idx === 'number') {
+ // mutate
- const actualChildTokens = tokens[idx + 2].children ?? [];
+ const actualChildTokens = tokens[idx + 2].children ?? [];
- tokens.splice(idx, 5, ...actualChildTokens);
- } else {
- done = true;
- }
- }
+ tokens.splice(idx, 5, ...actualChildTokens);
+ } else {
+ done = true;
+ }
+ }
}
-
/**
- * Scans a token stream for the unwanted extra-paragraph pattern under an html-tag pair.
- * @param tokens the token stream to search
- * @returns the index of the first match, or null if no match is found
+ * Scans a token stream for the unwanted extra-paragraph pattern under an html-tag pair.
+ * @param tokens the token stream to search
+ * @returns the index of the first match, or null if no match is found
*/
function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
-
- if (tokens.length < 5) {
- return null;
- }
-
- for (let i = 0; i < tokens.length; i++) {
- const last = i + 4;
- if (last > tokens.length - 1) {
- break; // early exit, no more possible 5-long slices to search
- }
-
- const slice = tokens.slice(i, last + 1);
- const isMatch = isExtraParagraphPatternMatch(slice);
- if (isMatch) {
- return i;
- }
- }
-
- return null;
+ if (tokens.length < 5) {
+ return null;
+ }
+
+ for (let i = 0; i < tokens.length; i++) {
+ const last = i + 4;
+ if (last > tokens.length - 1) {
+ break; // early exit, no more possible 5-long slices to search
+ }
+
+ const slice = tokens.slice(i, last + 1);
+ const isMatch = isExtraParagraphPatternMatch(slice);
+ if (isMatch) {
+ return i;
+ }
+ }
+
+ return null;
}
function isExtraParagraphPatternMatch(slice: Token[]): boolean {
- const match = isHtmlTagOpen(slice[0])
- && isParagraphOpen(slice[1])
- && isInline(slice[2])
- && isParagraphClose(slice[3])
- && isHtmlTagClose(slice[4]);
- return match;
+ const match =
+ isHtmlTagOpen(slice[0]) &&
+ isParagraphOpen(slice[1]) &&
+ isInline(slice[2]) &&
+ isParagraphClose(slice[3]) &&
+ isHtmlTagClose(slice[4]);
+ return match;
}
-
function isHtmlTagOpen(token: Token): boolean {
- return token.type === 'tag_open' && token.meta && token.meta.tag === 'html-tag';
+ return token.type === 'tag_open' && token.meta && token.meta.tag === 'html-tag';
}
function isHtmlTagClose(token: Token): boolean {
- return token.type === 'tag_close' && token.meta && token.meta.tag === 'html-tag';
+ return token.type === 'tag_close' && token.meta && token.meta.tag === 'html-tag';
}
function isParagraphOpen(token: Token): boolean {
- return token.type === 'paragraph_open';
+ return token.type === 'paragraph_open';
}
function isParagraphClose(token: Token): boolean {
- return token.type === 'paragraph_close';
+ return token.type === 'paragraph_close';
}
function isInline(token: Token): boolean {
- return token.type === 'inline';
+ return token.type === 'inline';
}