// packages/integrations/markdoc/src/html/transform/html-token-transform.ts

import type { Tokenizer } from '@markdoc/markdoc';
import { Parser } from 'htmlparser2';
// @ts-expect-error This type isn't exported
import type * as Token from 'markdown-it/lib/token';

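/**
 * Replace raw HTML tokens emitted by markdown-it (with HTML enabled) with paired
 * 'tag_open'/'tag_close' tokens for a shared 'html-tag' Markdoc tag, re-tokenizing
 * the text between tags so Markdown nested inside HTML keeps working.
 *
 * Illustrative (token shapes simplified, not exact markdown-it output):
 *   in:  [ html_inline '<em class="x">', text 'hi', html_inline '</em>' ]
 *   out: [ tag_open(html-tag, name='em', attrs={ class: 'x' }), text 'hi', tag_close(html-tag) ]
 */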
export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
	const output: Token[] = [];

	// hold a lazy buffer of text and process it only when necessary
	let textBuffer = '';

	let inCDATA = false;

	const appendText = (text: string) => {
		textBuffer += text;
	};

	// process the current text buffer w/ Markdoc's Tokenizer for tokens
	const processTextBuffer = () => {
		if (textBuffer.length > 0) {
			// tokenize the text buffer to look for structural markup tokens
			const toks = tokenizer.tokenize(textBuffer);

			// when raw text content is tokenized, it's treated like Markdown and ends up in a paragraph wrapper we don't want;
			// in that case we only need a plain text token, but we still have to tokenize in case the text contains other structural markup
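			// illustrative: tokenizing 'some **bold** text' yields roughly
			//   [ paragraph_open, inline { children: [ text, strong_open, text, strong_close, text ] }, paragraph_close ]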
			if (toks.length === 3) {
				const first = toks[0];
				const second = toks[1];
				const third: Token | undefined = toks.at(2);

				if (
					first.type === 'paragraph_open' &&
					second.type === 'inline' &&
					third &&
					third.type === 'paragraph_close' &&
					Array.isArray(second.children)
				) {
					for (const tok of second.children as Token[]) {
						// if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
						// text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
						if (tok.type === 'text') {
							if (tok.content.trim() === textBuffer.trim()) {
								tok.content = textBuffer;
							}
						}
						output.push(tok);
					}
				} else {
					// some other markup that happened to be 3 tokens, push tokens as-is
					for (const tok of toks) {
						output.push(tok);
					}
				}
			} else {
				// some other tokenized markup, push tokens as-is
				for (const tok of toks) {
					output.push(tok);
				}
			}

			// reset the current lazy text buffer
			textBuffer = '';
		}
	};

	// create an incremental HTML parser that tracks HTML tag open, close and text content
	const parser = new Parser(
		{
			oncdatastart() {
				inCDATA = true;
			},

			oncdataend() {
				inCDATA = false;
			},

			// when an HTML tag opens...
			onopentag(name, attrs) {
				// process any buffered text so it's treated as a text node before the currently opening HTML tag
				processTextBuffer();

				// push a 'tag_open' token for the 'html-tag' Markdoc tag representing the currently opening HTML tag onto the output token stack
				output.push({
					type: 'tag_open',
					nesting: 1,
					meta: {
						tag: 'html-tag',
						attributes: [
							{ type: 'attribute', name: 'name', value: name },
							{ type: 'attribute', name: 'attrs', value: attrs },
						],
					},
				} as Token);
			},

			ontext(content: string | null | undefined) {
				if (inCDATA) {
					// ignore entirely while inside CDATA
					return;
				}

				// only accumulate actual string content into the text buffer
				if (typeof content === 'string') {
					appendText(content);
				}
			},

			// when an HTML tag closes...
			onclosetag(name) {
				// process any buffered text to be treated as a text node inside the currently closing HTML tag
				processTextBuffer();

				// push a 'tag_close' token for the 'html-tag' Markdoc tag representing the currently closing HTML tag onto the output token stack
				output.push({
					type: 'tag_close',
					nesting: -1,
					meta: {
						tag: 'html-tag',
						attributes: [{ type: 'attribute', name: 'name', value: name }],
					},
				} as Token);
			},
		},
		{
			decodeEntities: false,
			recognizeCDATA: true,
			recognizeSelfClosing: true,
		}
	);
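	// illustrative: parser.write('<em class="x">hi</em>') fires onopentag('em', { class: 'x' }),
	// then ontext('hi'), then onclosetag('em'), producing tag_open / buffered text / tag_close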

	// for every detected token...
	for (const token of tokens) {
		// if it was an HTML token, write the HTML text into the HTML parser
		if (token.type.startsWith('html')) {
			// as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
			parser.write(token.content);

			// continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
			continue;
		}

		// process any child content for HTML
		if (token.type === 'inline') {
			if (token.children) {
				token.children = htmlTokenTransform(tokenizer, token.children);
			}
		}

		// not an HTML Token, preserve it at the current stack location
		output.push(token);
	}

	// process any remaining buffered text
	processTextBuffer();

	//
	// post-process the current level's output Token[] array to unwind this pattern:
	//
	// [
	//   { type: tag_open, meta.tag: html-tag },
	//   { type: paragraph_open },
	//   { type: inline, children [...] },
	//   { type: paragraph_close },
	//   { type: tag_close, meta.tag: html-tag }
	// ]
	//
	// the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
	//
	// this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
	//

	mutateAndCollapseExtraParagraphsUnderHtml(output);

	return output;
}

function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
	let done = false;

	while (!done) {
		const idx = findExtraParagraphUnderHtml(tokens);
		if (typeof idx === 'number') {
			// mutate: replace only the paragraph_open/inline/paragraph_close triplet
			// (starting at idx + 1) with the inline node's children, keeping the
			// surrounding 'html-tag' tag_open/tag_close tokens intact
			const actualChildTokens = tokens[idx + 2].children ?? [];

			tokens.splice(idx + 1, 3, ...actualChildTokens);
		} else {
			done = true;
		}
	}
}
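
// illustrative collapse (token shapes simplified):
//   [ tag_open(html-tag), paragraph_open, inline { children: [ text 'hi' ] }, paragraph_close, tag_close(html-tag) ]
// becomes
//   [ tag_open(html-tag), text 'hi', tag_close(html-tag) ]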

/**
 * Scan a flat token array for the extra-paragraph pattern described above.
 *
 * @param tokens the tokens to search
 * @returns the index of the first matching 5-token window, or null if none is found
 */
function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
	if (tokens.length < 5) {
		return null;
	}

	// check every index that still has a full 5-token window remaining
	for (let i = 0; i + 4 < tokens.length; i++) {
		const slice = tokens.slice(i, i + 5);
		if (isExtraParagraphPatternMatch(slice)) {
			return i;
		}
	}

	return null;
}

function isExtraParagraphPatternMatch(slice: Token[]): boolean {
	const match =
		isHtmlTagOpen(slice[0]) &&
		isParagraphOpen(slice[1]) &&
		isInline(slice[2]) &&
		isParagraphClose(slice[3]) &&
		isHtmlTagClose(slice[4]);
	return match;
}

function isHtmlTagOpen(token: Token): boolean {
	return token.type === 'tag_open' && token.meta?.tag === 'html-tag';
}

function isHtmlTagClose(token: Token): boolean {
	return token.type === 'tag_close' && token.meta?.tag === 'html-tag';
}

function isParagraphOpen(token: Token): boolean {
	return token.type === 'paragraph_open';
}

function isParagraphClose(token: Token): boolean {
	return token.type === 'paragraph_close';
}

function isInline(token: Token): boolean {
	return token.type === 'inline';
}
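
/*
 * Usage sketch (hypothetical wiring, not part of this module; assumes Markdoc's
 * exported Tokenizer, configured with markdown-it's `html: true` option):
 *
 *   import Markdoc from '@markdoc/markdoc';
 *
 *   const tokenizer = new Markdoc.Tokenizer({ html: true });
 *   const tokens = tokenizer.tokenize('<div>**hi**</div>');
 *   const transformed = htmlTokenTransform(tokenizer, tokens);
 *   // raw 'html' tokens are now 'html-tag' tag_open/tag_close pairs wrapping Markdown tokens
 */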