Add "allowHTML" option for Markdoc with HTML parsing/processing (#7597)

* 7576 - initial support for HTML inside Markdoc. This uses htmlparser2 to perform a pure token transform/mutation on the markdown-it tokens, replacing the original raw HTML string tokens with a richer set of tokens per HTML node, and in the process Markdoc tags are interleaved in the resulting token graph at the appropriate locations. This removes the legacy config of the @astrojs/markdoc integration entirely (suggested by @bholmesdev) and introduces a new type for options to be specified in the astro config, initially, with just the new "enableHTML" option. When "enableHTML" is *not* enabled (the default), the behavior of the entire @astrojs/markdoc integration should remain functionally equivalent to before this change * 7576 - fixed issues with whitespace preservation also: * cleaned up " to ' for astro project preferred linting * made the html rendering test fixture use a dynamic path * 7576 - detailed nested HTML test coverage * 7576 - component + HTML interleaved tests * 7576 - fix lint problems from previous changes * 7576 - some commentary * 7576 - file naming, refactor html under imports, package.json exports definition for html * 7576 * move out of extensions dir, remove export * cdata handling changes * 7576 * inline license from third party code * cleanup test class copy of HTML output * remove // third party indicators for imports (clarification: not third party code, just an indicator this group of imports is third party) * 7576 - fixed test before/after for DRY'ness * 7576 - no need to React-ify HTML attribute case * 7576 - rename "enableHTML" option to "allowHTML" * Added Markdoc allowHTML feature changeset * 7576 - updated README with allowHTML info * 7576 - fixed changeset typo * 7576 - minor edits based on PR feedback for docs * 7576 - minor edits based on PR feedback for docs
author: Alex Sherwin <alex-sherwin@users.noreply.github.com> 2023-07-24 19:34:06 -0400
committer: GitHub <noreply@github.com> 2023-07-24 19:34:06 -0400
commit: 7461e82c81438df956861197536f9ceeaf63d6b3 (patch)
tree: 5d5318b0f8fd61eba466cbccb00c8e5035914b57 /packages/integrations/markdoc/src
parent: 81c460e30b3318b9727a609390243f42e112ad24 (diff)
download: astro-7461e82c81438df956861197536f9ceeaf63d6b3.tar.gz
astro-7461e82c81438df956861197536f9ceeaf63d6b3.tar.zst
astro-7461e82c81438df956861197536f9ceeaf63d6b3.zip
11 files changed, 752 insertions, 28 deletions
diff --git a/packages/integrations/markdoc/src/content-entry-type.ts b/packages/integrations/markdoc/src/content-entry-type.ts
index bb62a2035..8cbe38c92 100644
--- a/packages/integrations/markdoc/src/content-entry-type.ts
+++ b/packages/integrations/markdoc/src/content-entry-type.ts
@@ -13,13 +13,19 @@ import path from 'node:path';
 import type * as rollup from 'rollup';
 import type { MarkdocConfigResult } from './load-config.js';
 import { setupConfig } from './runtime.js';
+import { getMarkdocTokenizer } from './tokenizer.js';
+import type { MarkdocIntegrationOptions } from './options.js';
+import { htmlTokenTransform } from './html/transform/html-token-transform.js';
 
 export async function getContentEntryType({
 	markdocConfigResult,
 	astroConfig,
+  options,
 }: {
 	astroConfig: AstroConfig;
 	markdocConfigResult?: MarkdocConfigResult;
+  options?: MarkdocIntegrationOptions,
+
 }): Promise<ContentEntryType> {
 	return {
 		extensions: ['.mdoc'],
@@ -27,7 +33,13 @@ export async function getContentEntryType({
 		handlePropagation: true,
 		async getRenderModule({ contents, fileUrl, viteId }) {
 			const entry = getEntryInfo({ contents, fileUrl });
-			const tokens = markdocTokenizer.tokenize(entry.body);
+      const tokenizer = getMarkdocTokenizer(options);
+			let tokens = tokenizer.tokenize(entry.body);
+
+      if (options?.allowHTML) {
+        tokens = htmlTokenTransform(tokenizer, tokens);
+      }
+
 			const ast = Markdoc.parse(tokens);
 			const usedTags = getUsedTags(ast);
 			const userMarkdocConfig = markdocConfigResult?.config ?? {};
@@ -51,7 +63,7 @@ export async function getContentEntryType({
 			}
 
 			const pluginContext = this;
-			const markdocConfig = await setupConfig(userMarkdocConfig);
+			const markdocConfig = await setupConfig(userMarkdocConfig, options);
 
 			const filePath = fileURLToPath(fileUrl);
 
@@ -113,15 +125,18 @@ ${getStringifiedImports(componentConfigByNodeMap, 'Node', astroConfig.root)}
 const tagComponentMap = ${getStringifiedMap(componentConfigByTagMap, 'Tag')};
 const nodeComponentMap = ${getStringifiedMap(componentConfigByNodeMap, 'Node')};
 
+const options = ${JSON.stringify(options)};
+
 const stringifiedAst = ${JSON.stringify(
 				/* Double stringify to encode *as* stringified JSON */ JSON.stringify(ast)
 			)};
 
-export const getHeadings = createGetHeadings(stringifiedAst, markdocConfig);
+export const getHeadings = createGetHeadings(stringifiedAst, markdocConfig, options);
 export const Content = createContentComponent(
 	Renderer,
 	stringifiedAst,
 	markdocConfig,
+  options,
 	tagComponentMap,
 	nodeComponentMap,
 )`;
@@ -134,12 +149,6 @@ export const Content = createContentComponent(
 	};
 }
 
-const markdocTokenizer = new Markdoc.Tokenizer({
-	// Strip <!-- comments --> from rendered output
-	// Without this, they're rendered as strings!
-	allowComments: true,
-});
-
 function getUsedTags(markdocAst: Node) {
 	const tags = new Set<string>();
 	const validationErrors = Markdoc.validate(markdocAst);
diff --git a/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts b/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts
new file mode 100644
index 000000000..3b67f9a32
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts
@@ -0,0 +1,23 @@
+
+import { styleToObject } from "./style-to-object.js";
+
+export function parseInlineCSSToReactLikeObject(css: string | undefined | null): React.CSSProperties | undefined {
+  if (typeof css === "string") {
+    const cssObject: Record<string, string> = {};
+    styleToObject(css, (originalCssDirective: string, value: string) => {
+      const reactCssDirective = convertCssDirectiveNameToReactCamelCase(originalCssDirective);
+      cssObject[reactCssDirective] = value;
+    });
+    return cssObject;
+  }
+
+  return undefined;
+}
+
+// Converts a kebab-case CSS property name (e.g. `line-height`) to the camelCase key used in style objects (`lineHeight`).
+function convertCssDirectiveNameToReactCamelCase(original: string): string {
+  // CSS custom properties (`--my-var`) are case-sensitive identifiers, not kebab-case names; camel-casing would rename them
+  if (original.startsWith('--')) return original;
+  // capture group 1 is the character to capitalize; the hyphen is dropped because it sits outside the capture group
+  return original.replace(/-([a-z0-9])/ig, (_match, char: string) => char.toUpperCase());
+}
diff --git a/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts b/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts
new file mode 100644
index 000000000..084ce546a
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts
@@ -0,0 +1,278 @@
+// @ts-nocheck
+// https://github.com/remarkablemark/inline-style-parser
+
+/**
+ * @license MIT
+ * 
+ * (The MIT License)
+ * 
+ * Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// http://www.w3.org/TR/CSS21/grammar.html
+// https://github.com/visionmedia/css-parse/pull/49#issuecomment-30088027
+const COMMENT_REGEX = /\/\*[^*]*\*+([^/*][^*]*\*+)*\//g;
+
+const NEWLINE_REGEX = /\n/g;
+const WHITESPACE_REGEX = /^\s*/;
+
+// declaration
+const PROPERTY_REGEX = /^(\*?[-#/*\\\w]+(\[[0-9a-z_-]+\])?)\s*/;
+const COLON_REGEX = /^:\s*/;
+const VALUE_REGEX = /^((?:'(?:\\'|.)*?'|"(?:\\"|.)*?"|\([^)]*?\)|[^};])+)/;
+const SEMICOLON_REGEX = /^[;\s]*/;
+
+// https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String/Trim#Polyfill
+const TRIM_REGEX = /^\s+|\s+$/g;
+
+// strings
+const NEWLINE = '\n';
+const FORWARD_SLASH = '/';
+const ASTERISK = '*';
+const EMPTY_STRING = '';
+
+// types
+const TYPE_COMMENT = 'comment';
+const TYPE_DECLARATION = 'declaration';
+
+/**
+ * @param {String} style
+ * @param {Object} [options]
+ * @return {Object[]}
+ * @throws {TypeError}
+ * @throws {Error}
+ */
+export function parseInlineStyles(style, options) {
+  if (typeof style !== 'string') {
+    throw new TypeError('First argument must be a string');
+  }
+
+  if (!style) return [];
+
+  options = options || {};
+
+  /**
+   * Positional.
+   */
+  let lineno = 1;
+  let column = 1;
+
+  /**
+   * Update lineno and column based on `str`.
+   *
+   * @param {String} str
+   */
+  function updatePosition(str) {
+    let lines = str.match(NEWLINE_REGEX);
+    if (lines) lineno += lines.length;
+    let i = str.lastIndexOf(NEWLINE);
+    column = ~i ? str.length - i : column + str.length;
+  }
+
+  /**
+   * Mark position and patch `node.position`.
+   *
+   * @return {Function}
+   */
+  function position() {
+    let start = { line: lineno, column: column };
+    return function (node) {
+      node.position = new Position(start);
+      whitespace();
+      return node;
+    };
+  }
+
+  /**
+   * Store position information for a node.
+   *
+   * @constructor
+   * @property {Object} start
+   * @property {Object} end
+   * @property {undefined|String} source
+   */
+  function Position(start) {
+    this.start = start;
+    this.end = { line: lineno, column: column };
+    this.source = options.source;
+  }
+
+  /**
+   * Non-enumerable source string.
+   */
+  Position.prototype.content = style;
+
+  const errorsList = [];
+
+  /**
+   * Error `msg`.
+   *
+   * @param {String} msg
+   * @throws {Error}
+   */
+  function error(msg) {
+    const err = new Error(
+      options.source + ':' + lineno + ':' + column + ': ' + msg
+    );
+    err.reason = msg;
+    err.filename = options.source;
+    err.line = lineno;
+    err.column = column;
+    err.source = style;
+
+    if (options.silent) {
+      errorsList.push(err);
+    } else {
+      throw err;
+    }
+  }
+
+  /**
+   * Match `re` and return captures.
+   *
+   * @param {RegExp} re
+   * @return {undefined|Array}
+   */
+  function match(re) {
+    const m = re.exec(style);
+    if (!m) return;
+    const str = m[0];
+    updatePosition(str);
+    style = style.slice(str.length);
+    return m;
+  }
+
+  /**
+   * Parse whitespace.
+   */
+  function whitespace() {
+    match(WHITESPACE_REGEX);
+  }
+
+  /**
+   * Parse comments.
+   *
+   * @param {Object[]} [rules]
+   * @return {Object[]}
+   */
+  function comments(rules) {
+    let c;
+    rules = rules || [];
+    while ((c = comment())) {
+      if (c !== false) {
+        rules.push(c);
+      }
+    }
+    return rules;
+  }
+
+  /**
+   * Parse comment.
+   *
+   * @return {Object}
+   * @throws {Error}
+   */
+  function comment() {
+    const pos = position();
+    if (FORWARD_SLASH != style.charAt(0) || ASTERISK != style.charAt(1)) return;
+
+    let i = 2;
+    while (
+      EMPTY_STRING != style.charAt(i) &&
+      (ASTERISK != style.charAt(i) || FORWARD_SLASH != style.charAt(i + 1))
+    ) {
+      ++i;
+    }
+    i += 2;
+
+    if (EMPTY_STRING === style.charAt(i - 1)) {
+      return error('End of comment missing');
+    }
+
+    const str = style.slice(2, i - 2);
+    column += 2;
+    updatePosition(str);
+    style = style.slice(i);
+    column += 2;
+
+    return pos({
+      type: TYPE_COMMENT,
+      comment: str
+    });
+  }
+
+  /**
+   * Parse declaration.
+   *
+   * @return {Object}
+   * @throws {Error}
+   */
+  function declaration() {
+    const pos = position();
+
+    // prop
+    const prop = match(PROPERTY_REGEX);
+    if (!prop) return;
+    comment();
+
+    // :
+    if (!match(COLON_REGEX)) return error("property missing ':'");
+
+    // val
+    const val = match(VALUE_REGEX);
+
+    const ret = pos({
+      type: TYPE_DECLARATION,
+      property: trim(prop[0].replace(COMMENT_REGEX, EMPTY_STRING)),
+      value: val
+        ? trim(val[0].replace(COMMENT_REGEX, EMPTY_STRING))
+        : EMPTY_STRING
+    });
+
+    // ;
+    match(SEMICOLON_REGEX);
+
+    return ret;
+  }
+
+  /**
+   * Parse declarations.
+   *
+   * @return {Object[]}
+   */
+  function declarations() {
+    const decls = [];
+
+    comments(decls);
+
+    // declarations
+    let decl;
+    while ((decl = declaration())) {
+      if (decl !== false) {
+        decls.push(decl);
+        comments(decls);
+      }
+    }
+
+    return decls;
+  }
+
+  whitespace();
+  return declarations();
+};
+
+/**
+ * Trim `str`.
+ *
+ * @param {String} str
+ * @return {String}
+ */
+function trim(str) {
+  return str ? str.replace(TRIM_REGEX, EMPTY_STRING) : EMPTY_STRING;
+}
+\ No newline at end of file
diff --git a/packages/integrations/markdoc/src/html/css/style-to-object.ts b/packages/integrations/markdoc/src/html/css/style-to-object.ts
new file mode 100644
index 000000000..7febe3152
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/style-to-object.ts
@@ -0,0 +1,70 @@
+// @ts-nocheck
+// https://github.com/remarkablemark/style-to-object
+
+/**
+ * @license MIT
+ * 
+ * The MIT License (MIT)
+ * 
+ * Copyright (c) 2017 Menglin "Mark" Xu <mark@remarkablemark.org>
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+import { parseInlineStyles } from "./parse-inline-styles.js";
+
+/**
+ * Parses inline style to object.
+ *
+ * @example
+ * // returns { 'line-height': '42' }
+ * styleToObject('line-height: 42;');
+ *
+ * @param  {String}      style      - The inline style.
+ * @param  {Function}    [iterator] - The iterator function.
+ * @return {null|Object}
+ */
+export function styleToObject(style, iterator) {
+  let output = null;
+  if (!style || typeof style !== 'string') {
+    return output;
+  }
+
+  let declaration;
+  let declarations = parseInlineStyles(style);
+  let hasIterator = typeof iterator === 'function';
+  let property;
+  let value;
+
+  for (let i = 0, len = declarations.length; i < len; i++) {
+    declaration = declarations[i];
+    property = declaration.property;
+    value = declaration.value;
+
+    if (hasIterator) {
+      iterator(property, value, declaration);
+    } else if (value) {
+      output || (output = {});
+      output[property] = value;
+    }
+  }
+
+  return output;
+}
diff --git a/packages/integrations/markdoc/src/html/index.ts b/packages/integrations/markdoc/src/html/index.ts
new file mode 100644
index 000000000..a456777f3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/index.ts
@@ -0,0 +1,2 @@
+export { htmlTokenTransform } from "./transform/html-token-transform.js";
+export { htmlTag } from "./tagdefs/html.tag.js";
diff --git a/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts b/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts
new file mode 100644
index 000000000..ecbeddbc3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts
@@ -0,0 +1,32 @@
+import type { Config, Schema } from "@markdoc/markdoc";
+import Markdoc from "@markdoc/markdoc";
+
+// local
+import { parseInlineCSSToReactLikeObject } from "../css/parse-inline-css-to-react.js";
+
+// a Markdoc tag that renders a given HTML element and its attributes, as produced by the htmlTokenTransform function
+export const htmlTag: Schema<Config, never> = {
+
+  attributes: {
+    name: { type: String, required: true },
+    attrs: { type: Object },
+  },
+
+  transform(node, config) {
+
+    const { name, attrs: unsafeAttributes } = node.attributes;
+    const children = node.transformChildren(config);
+
+    // `attrs` is not a required attribute, so default to {} instead of destructuring undefined (which throws a TypeError)
+    const { style, ...safeAttributes } = (unsafeAttributes ?? {}) as Record<string, unknown>;
+
+    // an inline "style" string is CSS text; parse it into a react-like object keyed by camelCase property names
+    if (typeof style === "string") {
+      const styleObject = parseInlineCSSToReactLikeObject(style);
+      safeAttributes.style = styleObject;
+    }
+
+    // create a Markdoc Tag for the given HTML node with the HTML attributes and children
+    return new Markdoc.Tag(name, safeAttributes, children);
+  },
+};
diff --git a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
new file mode 100644
index 000000000..6b2838ac3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
@@ -0,0 +1,256 @@
+import type * as Token from 'markdown-it/lib/token';
+import { Parser } from 'htmlparser2';
+import { Tokenizer } from '@markdoc/markdoc';
+
+
+export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
+
+  const output: Token[] = [];
+
+  // hold a lazy buffer of text and process it only when necessary
+  let textBuffer = '';
+
+  let inCDATA = false;
+
+  const appendText = (text: string) => {
+    textBuffer += text;
+  };
+
+  // process the current text buffer w/ Markdoc's Tokenizer for tokens
+  const processTextBuffer = () => {
+
+    if (textBuffer.length > 0) {
+
+      // tokenize the text buffer to look for structural markup tokens
+      const toks = tokenizer.tokenize(textBuffer);
+
+      // when we tokenize some raw text content, it's basically treated like Markdown, and will result in a paragraph wrapper, which we don't want
+      // in this scenario, we just want to generate a text token, but, we have to tokenize it in case there's other structural markup
+      if (toks.length === 3) {
+
+        const first = toks[0];
+        const second = toks[1];
+        const third: Token | undefined = toks.at(2);
+
+        if (first.type === 'paragraph_open' && second.type === 'inline' && (third && third.type === 'paragraph_close') && Array.isArray(second.children)) {
+          for (const tok of second.children as Token[]) {
+            // if the given token is a 'text' token and its trimmed content is the same as the pre-tokenized text buffer, use the original
+            // text buffer instead to preserve leading/trailing whitespace that is lost during tokenization of pure text content
+            if (tok.type === 'text') {
+              if (tok.content.trim() == textBuffer.trim()) {
+                tok.content = textBuffer;
+              }
+            }
+            output.push(tok);
+          }
+        } else {
+          // some other markup that happened to be 3 tokens, push tokens as-is
+          for (const tok of toks) {
+            output.push(tok);
+          }
+        }
+      } else {
+        // some other tokenized markup, push tokens as-is
+        for (const tok of toks) {
+          output.push(tok);
+        }
+      }
+
+      // reset the current lazy text buffer
+      textBuffer = '';
+    }
+  };
+
+  // create an incremental HTML parser that tracks HTML tag open, close and text content
+  const parser = new Parser({
+
+    oncdatastart() {
+      inCDATA = true;
+    },
+
+    oncdataend() {
+      inCDATA = false;
+    },
+
+    // when an HTML tag opens...
+    onopentag(name, attrs) {
+
+      // process any buffered text to be treated as text node before the currently opening HTML tag
+      processTextBuffer();
+
+      // push an  'html-tag' 'tag_open' Markdoc node instance for the currently opening HTML tag onto the resulting Token stack
+      output.push({
+        type: 'tag_open',
+        nesting: 1,
+        meta: {
+          tag: 'html-tag',
+          attributes: [
+            { type: 'attribute', name: 'name', value: name },
+            { type: 'attribute', name: 'attrs', value: attrs },
+          ],
+        },
+      } as Token);
+
+    },
+
+    ontext(content: string | null | undefined) {
+
+      if (inCDATA) {
+        // ignore entirely while inside CDATA
+        return;
+      }
+
+      // only accumulate text into the buffer if we're not under an ignored HTML element
+      if (typeof content === 'string') {
+        appendText(content);
+      }
+    },
+
+    // when an HTML tag closes...
+    onclosetag(name) {
+
+      // process any buffered text to be treated as a text node inside the currently closing HTML tag
+      processTextBuffer();
+
+      // push an 'html-tag' 'tag_close' Markdoc node instance for the currently closing HTML tag onto the resulting Token stack
+      output.push({
+        type: 'tag_close',
+        nesting: -1,
+        meta: {
+          tag: 'html-tag',
+          attributes: [
+            { type: 'attribute', name: 'name', value: name },
+          ],
+        },
+      } as Token);
+
+    },
+
+  }, {
+    decodeEntities: false,
+    recognizeCDATA: true,
+    recognizeSelfClosing: true,
+  });
+
+  // for every detected token...
+  for (const token of tokens) {
+
+    // if it was an HTML token, write the HTML text into the HTML parser
+    if (token.type.startsWith('html')) {
+
+      // as the parser encounters opening/closing HTML tags, it will push Markdoc Tag nodes into the output stack
+      parser.write(token.content);
+
+      // continue loop... IMPORTANT! we're throwing away the original 'html' tokens here (raw HTML strings), since the parser is inserting new ones based on the parsed HTML
+      continue;
+    }
+
+    // process any child content for HTML
+    if (token.type === 'inline') {
+      if (token.children) {
+        token.children = htmlTokenTransform(tokenizer, token.children);
+      }
+    }
+
+    // not an HTML Token, preserve it at the current stack location
+    output.push(token);
+  }
+
+  // process any remaining buffered text
+  processTextBuffer();
+
+  //
+  // post-process the current levels output Token[] array to un-wind this pattern:
+  // 
+  // [
+  //   { type: tag_open, meta.tag: html-tag },
+  //   { type: paragraph_open },
+  //   { type: inline, children [...] },
+  //   { type: paragraph_close },
+  //   { type: tag_close, meta.tag: html-tag }
+  // ]
+  // 
+  // the paragraph_open, inline, paragraph_close triplet needs to be replaced by the children of the inline node
+  // 
+  // this is extra, unwanted paragraph wrapping unfortunately introduced by markdown-it during processing w/ HTML enabled
+  //
+
+  mutateAndCollapseExtraParagraphsUnderHtml(output);
+
+  return output;
+}
+
+function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
+  let done = false;
+  // mutates `tokens` in place, re-scanning from index 0 after every splice until no matching 5-token run remains
+  while (!done) {
+    const idx = findExtraParagraphUnderHtml(tokens);
+    if (typeof idx === 'number') {
+      // idx points at the html-tag open token, so idx + 2 is the inline token holding the real child tokens
+      // (an inline token without children contributes nothing, hence the empty-array fallback)
+      const actualChildTokens = tokens[idx + 2].children ?? [];
+      // NOTE(review): this replaces all 5 matched tokens, dropping the surrounding html-tag open/close too — confirm intended
+      tokens.splice(idx, 5, ...actualChildTokens);
+    } else {
+      done = true;
+    }
+  }
+}
+
+
+/**
+ * Finds the first run of 5 consecutive tokens matching: html-tag open, paragraph_open, inline, paragraph_close, html-tag close.
+ * @param tokens the token list to scan (only read here, never modified)
+ * @returns the index of the first token of the first matching run, or null when no run matches
+ */
+function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
+
+  if (tokens.length < 5) {
+    return null;
+  }
+
+  for (let i = 0; i < tokens.length; i++) {
+    const last = i + 4;
+    if (last > tokens.length - 1) {
+      break; // early exit, no more possible 5-long slices to search
+    }
+
+    const slice = tokens.slice(i, last + 1);
+    const isMatch = isExtraParagraphPatternMatch(slice);
+    if (isMatch) {
+      return i;
+    }
+  }
+
+  return null;
+}
+
+function isExtraParagraphPatternMatch(slice: Token[]): boolean {
+  const match = isHtmlTagOpen(slice[0])
+    && isParagraphOpen(slice[1])
+    && isInline(slice[2])
+    && isParagraphClose(slice[3])
+    && isHtmlTagClose(slice[4]);
+  return match;
+}
+
+
+function isHtmlTagOpen(token: Token): boolean {
+  return token.type === 'tag_open' && token.meta && token.meta.tag === 'html-tag';
+}
+
+function isHtmlTagClose(token: Token): boolean {
+  return token.type === 'tag_close' && token.meta && token.meta.tag === 'html-tag';
+}
+
+function isParagraphOpen(token: Token): boolean {
+  return token.type === 'paragraph_open';
+}
+
+function isParagraphClose(token: Token): boolean {
+  return token.type === 'paragraph_close';
+}
+
+function isInline(token: Token): boolean {
+  return token.type === 'inline';
+}
diff --git a/packages/integrations/markdoc/src/index.ts b/packages/integrations/markdoc/src/index.ts
index 0e2aa52dc..ecb402165 100644
--- a/packages/integrations/markdoc/src/index.ts
+++ b/packages/integrations/markdoc/src/index.ts
@@ -1,12 +1,11 @@
-/* eslint-disable no-console */
 import type { AstroConfig, AstroIntegration, ContentEntryType, HookParameters } from 'astro';
-import { bold, red } from 'kleur/colors';
 import { getContentEntryType } from './content-entry-type.js';
 import {
 	SUPPORTED_MARKDOC_CONFIG_FILES,
 	loadMarkdocConfig,
 	type MarkdocConfigResult,
 } from './load-config.js';
+import type { MarkdocIntegrationOptions } from './options.js';
 
 type SetupHookParams = HookParameters<'astro:config:setup'> & {
 	// `contentEntryType` is not a public API
@@ -14,15 +13,7 @@ type SetupHookParams = HookParameters<'astro:config:setup'> & {
 	addContentEntryType: (contentEntryType: ContentEntryType) => void;
 };
 
-export default function markdocIntegration(legacyConfig?: any): AstroIntegration {
-	if (legacyConfig) {
-		console.log(
-			`${red(
-				bold('[Markdoc]')
-			)} Passing Markdoc config from your \`astro.config\` is no longer supported. Configuration should be exported from a \`markdoc.config.mjs\` file. See the configuration docs for more: https://docs.astro.build/en/guides/integrations-guide/markdoc/#configuration`
-		);
-		process.exit(0);
-	}
+export default function markdocIntegration(options?: MarkdocIntegrationOptions): AstroIntegration {
 	let markdocConfigResult: MarkdocConfigResult | undefined;
 	let astroConfig: AstroConfig;
 	return {
@@ -34,7 +25,7 @@ export default function markdocIntegration(legacyConfig?: any): AstroIntegration
 
 				markdocConfigResult = await loadMarkdocConfig(astroConfig);
 
-				addContentEntryType(await getContentEntryType({ markdocConfigResult, astroConfig }));
+				addContentEntryType(await getContentEntryType({ markdocConfigResult, astroConfig, options }));
 
 				updateConfig({
 					vite: {
diff --git a/packages/integrations/markdoc/src/options.ts b/packages/integrations/markdoc/src/options.ts
new file mode 100644
index 000000000..df54cf9f6
--- /dev/null
+++ b/packages/integrations/markdoc/src/options.ts
@@ -0,0 +1,3 @@
+export interface MarkdocIntegrationOptions {
+  allowHTML?: boolean;
+}
diff --git a/packages/integrations/markdoc/src/runtime.ts b/packages/integrations/markdoc/src/runtime.ts
index 6a20a8740..4b93349b8 100644
--- a/packages/integrations/markdoc/src/runtime.ts
+++ b/packages/integrations/markdoc/src/runtime.ts
@@ -10,13 +10,15 @@ import type { AstroInstance } from 'astro';
 import { createComponent, renderComponent } from 'astro/runtime/server/index.js';
 import type { AstroMarkdocConfig } from './config.js';
 import { setupHeadingConfig } from './heading-ids.js';
+import type { MarkdocIntegrationOptions } from './options.js';
+import { htmlTag } from './html/tagdefs/html.tag.js';
 
 /**
  * Merge user config with default config and set up context (ex. heading ID slugger)
  * Called on each file's individual transform.
  * TODO: virtual module to merge configs per-build instead of per-file?
  */
-export async function setupConfig(userConfig: AstroMarkdocConfig = {}): Promise<MergedConfig> {
+export async function setupConfig(userConfig: AstroMarkdocConfig = {}, options: MarkdocIntegrationOptions | undefined): Promise<MergedConfig> {
 	let defaultConfig: AstroMarkdocConfig = setupHeadingConfig();
 
 	if (userConfig.extends) {
@@ -29,14 +31,26 @@ export async function setupConfig(userConfig: AstroMarkdocConfig = {}): Promise<
 		}
 	}
 
-	return mergeConfig(defaultConfig, userConfig);
+	let merged = mergeConfig(defaultConfig, userConfig);
+
+  if (options?.allowHTML) {
+    merged = mergeConfig(merged, HTML_CONFIG);
+  }
+
+  return merged;
 }
 
 /** Used for synchronous `getHeadings()` function */
-export function setupConfigSync(userConfig: AstroMarkdocConfig = {}): MergedConfig {
+export function setupConfigSync(userConfig: AstroMarkdocConfig = {}, options: MarkdocIntegrationOptions | undefined): MergedConfig {
 	const defaultConfig: AstroMarkdocConfig = setupHeadingConfig();
 
-	return mergeConfig(defaultConfig, userConfig);
+	let merged = mergeConfig(defaultConfig, userConfig);
+
+  if (options?.allowHTML) {
+    merged = mergeConfig(merged, HTML_CONFIG);
+  }
+
+  return merged;
 }
 
 type MergedConfig = Required<Omit<AstroMarkdocConfig, 'extends'>>;
@@ -146,12 +160,12 @@ export function collectHeadings(
 	}
 }
 
-export function createGetHeadings(stringifiedAst: string, userConfig: AstroMarkdocConfig) {
+export function createGetHeadings(stringifiedAst: string, userConfig: AstroMarkdocConfig, options: MarkdocIntegrationOptions | undefined) {
 	return function getHeadings() {
 		/* Yes, we are transforming twice (once from `getHeadings()` and again from <Content /> in case of variables).
 			TODO: propose new `render()` API to allow Markdoc variable passing to `render()` itself,
 			instead of the Content component. Would remove double-transform and unlock variable resolution in heading slugs. */
-		const config = setupConfigSync(userConfig);
+		const config = setupConfigSync(userConfig, options);
 		const ast = Markdoc.Ast.fromJSON(stringifiedAst);
 		const content = Markdoc.transform(ast as Node, config as ConfigType);
 		let collectedHeadings: MarkdownHeading[] = [];
@@ -164,6 +178,7 @@ export function createContentComponent(
 	Renderer: AstroInstance['default'],
 	stringifiedAst: string,
 	userConfig: AstroMarkdocConfig,
+  options: MarkdocIntegrationOptions | undefined,
 	tagComponentMap: Record<string, AstroInstance['default']>,
 	nodeComponentMap: Record<NodeType, AstroInstance['default']>
 ) {
@@ -171,7 +186,7 @@ export function createContentComponent(
 		async factory(result: any, props: Record<string, any>) {
 			const withVariables = mergeConfig(userConfig, { variables: props });
 			const config = resolveComponentImports(
-				await setupConfig(withVariables),
+				await setupConfig(withVariables, options),
 				tagComponentMap,
 				nodeComponentMap
 			);
@@ -181,3 +196,10 @@ export function createContentComponent(
 		propagation: 'self',
 	});
 }
+
+// statically define a partial MarkdocConfig which registers the required "html-tag" Markdoc tag when the "allowHTML" feature is enabled
+const HTML_CONFIG: AstroMarkdocConfig = {
+  tags: {
+    "html-tag": htmlTag,
+  },
+};
diff --git a/packages/integrations/markdoc/src/tokenizer.ts b/packages/integrations/markdoc/src/tokenizer.ts
new file mode 100644
index 000000000..11135c18e
--- /dev/null
+++ b/packages/integrations/markdoc/src/tokenizer.ts
@@ -0,0 +1,38 @@
+import type { Tokenizer } from '@markdoc/markdoc';
+import Markdoc from '@markdoc/markdoc';
+import type { MarkdocIntegrationOptions } from './options.js';
+
+type TokenizerOptions = ConstructorParameters<typeof Tokenizer>[0];
+
+export function getMarkdocTokenizer(options: MarkdocIntegrationOptions | undefined): Tokenizer {
+
+  const key = cacheKey(options);
+
+  if (!_cachedMarkdocTokenizers[key]) {
+
+    const tokenizerOptions: TokenizerOptions = {
+      // Strip <!-- comments --> from rendered output
+      // Without this, they're rendered as strings!
+      allowComments: true,
+    }
+
+    if (options?.allowHTML) {
+      // we want to allow indentation for Markdoc tags that are interleaved inside HTML block elements
+      tokenizerOptions.allowIndentation = true;
+      // enable HTML token detection in markdown-it
+      tokenizerOptions.html = true;
+    }
+
+    _cachedMarkdocTokenizers[key] = new Markdoc.Tokenizer(tokenizerOptions);
+  }
+
+  return _cachedMarkdocTokenizers[key];
+};
+
+// create this on-demand when needed since it relies on the runtime MarkdocIntegrationOptions and may change during
+// the life of module in certain scenarios (unit tests, etc.)
+let _cachedMarkdocTokenizers: Record<string, Tokenizer> = {};
+
+function cacheKey(options: MarkdocIntegrationOptions | undefined): string {
+  return JSON.stringify(options ?? {}); // JSON.stringify(undefined) returns undefined, not a string; treat "no options" as {}
+}
author	Alex Sherwin <alex-sherwin@users.noreply.github.com>	2023-07-24 19:34:06 -0400
committer	GitHub <noreply@github.com>	2023-07-24 19:34:06 -0400
commit	7461e82c81438df956861197536f9ceeaf63d6b3 (patch)
tree	5d5318b0f8fd61eba466cbccb00c8e5035914b57 /packages/integrations/markdoc/src
parent	81c460e30b3318b9727a609390243f42e112ad24 (diff)
download	astro-7461e82c81438df956861197536f9ceeaf63d6b3.tar.gz astro-7461e82c81438df956861197536f9ceeaf63d6b3.tar.zst astro-7461e82c81438df956861197536f9ceeaf63d6b3.zip