summary refs log tree commit diff
path: root/packages/integrations/markdoc/src/html
diff options
context:
space:
mode:
Diffstat (limited to 'packages/integrations/markdoc/src/html')
-rw-r--r-- packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts 23
-rw-r--r-- packages/integrations/markdoc/src/html/css/parse-inline-styles.ts 278
-rw-r--r-- packages/integrations/markdoc/src/html/css/style-to-object.ts 70
-rw-r--r-- packages/integrations/markdoc/src/html/index.ts 2
-rw-r--r-- packages/integrations/markdoc/src/html/tagdefs/html.tag.ts 32
-rw-r--r-- packages/integrations/markdoc/src/html/transform/html-token-transform.ts 256
6 files changed, 661 insertions, 0 deletions
diff --git a/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts b/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts
new file mode 100644
index 000000000..3b67f9a32
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/parse-inline-css-to-react.ts
@@ -0,0 +1,23 @@
+
+import { styleToObject } from "./style-to-object.js";
+
+/**
+ * Parses an inline CSS string (the value of an HTML `style` attribute) into an object keyed by
+ * camelCased property names.
+ *
+ * @param css the inline CSS text
+ * @returns the parsed declarations, or `undefined` when `css` is not a string
+ */
+export function parseInlineCSSToReactLikeObject(css: string | undefined | null): React.CSSProperties | undefined {
+  // anything other than a string carries no styles to parse
+  if (typeof css !== "string") {
+    return undefined;
+  }
+
+  const declarations: Record<string, string> = {};
+  styleToObject(css, (cssName: string, cssValue: string) => {
+    // a later declaration of the same property overwrites an earlier one
+    declarations[convertCssDirectiveNameToReactCamelCase(cssName)] = cssValue;
+  });
+  return declarations;
+}
+
+/**
+ * Converts a hyphenated CSS property name to camelCase, e.g. `background-color` becomes
+ * `backgroundColor` and `-webkit-transition` becomes `WebkitTransition`.
+ *
+ * CSS custom properties (names starting with `--`) are case-sensitive and must keep their
+ * hyphens, so they are returned unchanged.
+ *
+ * @param original the CSS property name as written in the stylesheet
+ * @returns the camelCased name, or `original` itself for a custom property
+ */
+function convertCssDirectiveNameToReactCamelCase(original: string): string {
+  if (original.startsWith("--")) {
+    return original;
+  }
+
+  // capture group 1 is the character to capitalize, the hyphen is omitted by virtue of being outside the capture group
+  return original.replace(/-([a-z0-9])/gi, (_match, char: string) => char.toUpperCase());
+}
diff --git a/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts b/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts
new file mode 100644
index 000000000..084ce546a
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/parse-inline-styles.ts
@@ -0,0 +1,278 @@
+// @ts-nocheck
+// https://github.com/remarkablemark/inline-style-parser
+
+/**
+ * @license MIT
+ *
+ * (The MIT License)
+ *
+ * Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// CSS block comment pattern; grammar reference: http://www.w3.org/TR/CSS21/grammar.html
+// regex origin: https://github.com/visionmedia/css-parse/pull/49#issuecomment-30088027
+const COMMENT_REGEX = /\/\*[^*]*\*+([^/*][^*]*\*+)*\//g;
+
+const NEWLINE_REGEX = /\n/g; // global so that String#match returns every newline
+const WHITESPACE_REGEX = /^\s*/; // leading whitespace only (may match the empty string)
+
+// declaration: each pattern is anchored with ^ because it is applied to the start of the not-yet-consumed input
+const PROPERTY_REGEX = /^(\*?[-#/*\\\w]+(\[[0-9a-z_-]+\])?)\s*/;
+const COLON_REGEX = /^:\s*/;
+const VALUE_REGEX = /^((?:'(?:\\'|.)*?'|"(?:\\"|.)*?"|\([^)]*?\)|[^};])+)/;
+const SEMICOLON_REGEX = /^[;\s]*/;
+
+// leading or trailing whitespace; see https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String/Trim#Polyfill
+const TRIM_REGEX = /^\s+|\s+$/g;
+
+// strings compared against single characters of the input
+const NEWLINE = '\n';
+const FORWARD_SLASH = '/';
+const ASTERISK = '*';
+const EMPTY_STRING = '';
+
+// values of the `type` field on the nodes the parser returns
+const TYPE_COMMENT = 'comment';
+const TYPE_DECLARATION = 'declaration';
+
+/**
+ * Parses the text of an inline style (e.g. `color: red; /* note *\/ margin: 0`) into a list of nodes.
+ *
+ * Each node is either `{ type: 'declaration', property, value, position }` or
+ * `{ type: 'comment', comment, position }`, in source order.
+ *
+ * The parser works by repeatedly matching an anchored pattern against the start of `style` and
+ * slicing the matched text off, while `lineno` / `column` track where in the original text it is.
+ *
+ * @param {String} style
+ * @param {Object} [options]
+ *   `options.source` is copied onto every position and prefixed to error messages;
+ *   `options.silent`, when truthy, makes parse errors get collected in a local list instead of thrown
+ *   (that list is not returned by this function).
+ * @return {Object[]}
+ * @throws {TypeError} when `style` is not a string
+ * @throws {Error} on a parse error, unless `options.silent` is set
+ */
+export function parseInlineStyles(style, options) {
+ if (typeof style !== 'string') {
+ throw new TypeError('First argument must be a string');
+ }
+
+ if (!style) return [];
+
+ options = options || {};
+
+ /**
+ * Positional.
+ * Both are 1-based and describe the start of the not-yet-consumed input.
+ */
+ let lineno = 1;
+ let column = 1;
+
+ /**
+ * Update lineno and column based on `str`.
+ * `str` is text that has just been consumed from the input.
+ *
+ * @param {String} str
+ */
+ function updatePosition(str) {
+ let lines = str.match(NEWLINE_REGEX);
+ if (lines) lineno += lines.length;
+ let i = str.lastIndexOf(NEWLINE);
+ column = ~i ? str.length - i : column + str.length; // ~i is 0 only when i === -1, i.e. str has no newline
+ }
+
+ /**
+ * Mark position and patch `node.position`.
+ * The start is captured when position() is called; the returned function captures the end,
+ * attaches both to the node and then consumes any whitespace that follows.
+ *
+ * @return {Function}
+ */
+ function position() {
+ let start = { line: lineno, column: column };
+ return function (node) {
+ node.position = new Position(start);
+ whitespace();
+ return node;
+ };
+ }
+
+ /**
+ * Store position information for a node.
+ * `end` is read from the current lineno/column at construction time.
+ *
+ * @constructor
+ * @property {Object} start
+ * @property {Object} end
+ * @property {undefined|String} source
+ */
+ function Position(start) {
+ this.start = start;
+ this.end = { line: lineno, column: column };
+ this.source = options.source;
+ }
+
+ /**
+ * Non-enumerable source string.
+ * It lives on the prototype, so it is shared by every Position; it is assigned before any
+ * input has been consumed and therefore holds the complete original `style` text.
+ */
+ Position.prototype.content = style;
+
+ const errorsList = [];
+
+ /**
+ * Error `msg`.
+ * Builds an Error annotated with the current position. In silent mode the error is stored in
+ * `errorsList` and the function returns undefined; otherwise the error is thrown.
+ *
+ * @param {String} msg
+ * @throws {Error}
+ */
+ function error(msg) {
+ const err = new Error(
+ options.source + ':' + lineno + ':' + column + ': ' + msg
+ );
+ err.reason = msg;
+ err.filename = options.source;
+ err.line = lineno;
+ err.column = column;
+ err.source = style;
+
+ if (options.silent) {
+ errorsList.push(err);
+ } else {
+ throw err;
+ }
+ }
+
+ /**
+ * Match `re` and return captures.
+ * On a match the matched text is removed from the front of `style` and the position is advanced.
+ *
+ * @param {RegExp} re
+ * @return {undefined|Array}
+ */
+ function match(re) {
+ const m = re.exec(style);
+ if (!m) return;
+ const str = m[0];
+ updatePosition(str);
+ style = style.slice(str.length);
+ return m;
+ }
+
+ /**
+ * Parse whitespace.
+ * Consumes any leading whitespace; the match itself is discarded.
+ */
+ function whitespace() {
+ match(WHITESPACE_REGEX);
+ }
+
+ /**
+ * Parse comments.
+ * Calls comment() until it returns a falsy value, appending every comment node to `rules`.
+ *
+ * @param {Object[]} [rules]
+ * @return {Object[]}
+ */
+ function comments(rules) {
+ let c;
+ rules = rules || [];
+ while ((c = comment())) {
+ if (c !== false) { // always true here: the loop condition already required c to be truthy
+ rules.push(c);
+ }
+ }
+ return rules;
+ }
+
+ /**
+ * Parse comment.
+ * Returns undefined when the input does not start with a slash followed by an asterisk, and
+ * also when an unterminated comment is reported in silent mode.
+ *
+ * @return {Object}
+ * @throws {Error}
+ */
+ function comment() {
+ const pos = position();
+ if (FORWARD_SLASH != style.charAt(0) || ASTERISK != style.charAt(1)) return;
+
+ let i = 2; // index of the first character after the opening delimiter
+ while (
+ EMPTY_STRING != style.charAt(i) &&
+ (ASTERISK != style.charAt(i) || FORWARD_SLASH != style.charAt(i + 1))
+ ) {
+ ++i;
+ }
+ i += 2; // step over the closing delimiter, if there was one
+
+ if (EMPTY_STRING === style.charAt(i - 1)) { // charAt past the end is '', so the closing delimiter was never found
+ return error('End of comment missing');
+ }
+
+ const str = style.slice(2, i - 2);
+ column += 2; // opening delimiter
+ updatePosition(str);
+ style = style.slice(i);
+ column += 2; // closing delimiter
+
+ return pos({
+ type: TYPE_COMMENT,
+ comment: str
+ });
+ }
+
+ /**
+ * Parse declaration.
+ * Returns undefined when no property name is found at the current position, and also when
+ * a missing colon is reported in silent mode. A declaration without a value gets the empty string.
+ *
+ * @return {Object}
+ * @throws {Error}
+ */
+ function declaration() {
+ const pos = position();
+
+ // prop
+ const prop = match(PROPERTY_REGEX);
+ if (!prop) return;
+ comment(); // consumes a comment between the property and the colon; its node is discarded
+
+ // :
+ if (!match(COLON_REGEX)) return error("property missing ':'");
+
+ // val
+ const val = match(VALUE_REGEX);
+
+ const ret = pos({
+ type: TYPE_DECLARATION,
+ property: trim(prop[0].replace(COMMENT_REGEX, EMPTY_STRING)),
+ value: val
+ ? trim(val[0].replace(COMMENT_REGEX, EMPTY_STRING))
+ : EMPTY_STRING
+ });
+
+ // ; (also consumes any further semicolons and whitespace)
+ match(SEMICOLON_REGEX);
+
+ return ret;
+ }
+
+ /**
+ * Parse declarations.
+ * Collects leading comments, then alternates declaration() and comments() until
+ * declaration() returns a falsy value.
+ *
+ * @return {Object[]}
+ */
+ function declarations() {
+ const decls = [];
+
+ comments(decls);
+
+ // declarations
+ let decl;
+ while ((decl = declaration())) {
+ if (decl !== false) { // always true here: the loop condition already required decl to be truthy
+ decls.push(decl);
+ comments(decls);
+ }
+ }
+
+ return decls;
+ }
+
+ whitespace();
+ return declarations();
+};
+
+/**
+ * Removes leading and trailing whitespace from `str`.
+ *
+ * @param {String} str - the text to trim; any falsy value yields the empty string
+ * @return {String}
+ */
+function trim(str) {
+  if (!str) {
+    return EMPTY_STRING;
+  }
+  return str.replace(TRIM_REGEX, EMPTY_STRING);
+} \ No newline at end of file
diff --git a/packages/integrations/markdoc/src/html/css/style-to-object.ts b/packages/integrations/markdoc/src/html/css/style-to-object.ts
new file mode 100644
index 000000000..7febe3152
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/css/style-to-object.ts
@@ -0,0 +1,70 @@
+// @ts-nocheck
+// https://github.com/remarkablemark/style-to-object
+
+/**
+ * @license MIT
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2017 Menglin "Mark" Xu <mark@remarkablemark.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+import { parseInlineStyles } from "./parse-inline-styles.js";
+
+/**
+ * Turns the text of an inline style into a plain object, or feeds every declaration to a callback.
+ *
+ * Without `iterator`, each declaration that has a non-empty value is stored under its property
+ * name. With `iterator`, nothing is collected: the callback receives
+ * `(property, value, declaration)` for every parsed node and the function returns `null`.
+ *
+ * @example
+ * // returns { 'line-height': '42' }
+ * styleToObject('line-height: 42;');
+ *
+ * @param {String} style - The inline style.
+ * @param {Function} [iterator] - The iterator function.
+ * @return {null|Object} `null` when `style` is empty or not a string, or when nothing was collected.
+ */
+export function styleToObject(style, iterator) {
+  if (!style || typeof style !== 'string') {
+    return null;
+  }
+
+  const parsed = parseInlineStyles(style);
+  const useIterator = typeof iterator === 'function';
+  let collected = null;
+
+  for (const node of parsed) {
+    const { property, value } = node;
+
+    if (useIterator) {
+      iterator(property, value, node);
+      continue;
+    }
+
+    if (value) {
+      // the result object is only created once there is something to put in it
+      if (!collected) {
+        collected = {};
+      }
+      collected[property] = value;
+    }
+  }
+
+  return collected;
+}
diff --git a/packages/integrations/markdoc/src/html/index.ts b/packages/integrations/markdoc/src/html/index.ts
new file mode 100644
index 000000000..a456777f3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/index.ts
@@ -0,0 +1,2 @@
+// public entry point of the HTML support; relative specifiers carry the `.js` extension, as in the other modules of this directory
+export { htmlTokenTransform } from "./transform/html-token-transform.js";
+export { htmlTag } from "./tagdefs/html.tag.js";
diff --git a/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts b/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts
new file mode 100644
index 000000000..ecbeddbc3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/tagdefs/html.tag.ts
@@ -0,0 +1,32 @@
+import type { Config, Schema } from "@markdoc/markdoc";
+import Markdoc from "@markdoc/markdoc";
+
+// local
+import { parseInlineCSSToReactLikeObject } from "../css/parse-inline-css-to-react.js";
+
+/**
+ * A Markdoc tag that will render a given HTML element and its attributes, as produced by the htmlTokenTransform function.
+ *
+ * - `name` (required) is the HTML element name
+ * - `attrs` (optional) is an object holding the element's HTML attributes
+ */
+export const htmlTag: Schema<Config, never> = {
+  attributes: {
+    name: { type: String, required: true },
+    attrs: { type: Object },
+  },
+
+  transform(node, config) {
+    const { name, attrs: unsafeAttributes } = node.attributes;
+    const children = node.transformChildren(config);
+
+    // `attrs` is not required by the schema above, so fall back to an empty object rather than
+    // destructuring `undefined`; "style" is pulled out because it needs additional processing
+    const { style, ...safeAttributes } = (unsafeAttributes ?? {}) as Record<string, unknown>;
+
+    // an inline "style" string is parsed into an object keyed by camelCased property names;
+    // a "style" value that is not a string is dropped
+    if (typeof style === "string") {
+      safeAttributes.style = parseInlineCSSToReactLikeObject(style);
+    }
+
+    // create a Markdoc Tag for the given HTML node with the HTML attributes and children
+    return new Markdoc.Tag(name, safeAttributes, children);
+  },
+};
diff --git a/packages/integrations/markdoc/src/html/transform/html-token-transform.ts b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
new file mode 100644
index 000000000..6b2838ac3
--- /dev/null
+++ b/packages/integrations/markdoc/src/html/transform/html-token-transform.ts
@@ -0,0 +1,256 @@
+import type * as Token from 'markdown-it/lib/token';
+import { Parser } from 'htmlparser2';
+import { Tokenizer } from '@markdoc/markdoc';
+
+
+/**
+ * Rewrites a token stream so that raw HTML tokens become Markdoc `html-tag` tag tokens.
+ *
+ * The content of every token whose type starts with `html` is written to an incremental HTML
+ * parser. Each element the parser opens or closes is emitted as a `tag_open` / `tag_close` token
+ * for the `html-tag` tag, and text the parser reports is re-tokenized with `tokenizer`. All other
+ * tokens are passed through in order; the children of `inline` tokens are transformed recursively.
+ *
+ * @param tokenizer the Markdoc tokenizer used for text found inside the HTML
+ * @param tokens the tokens to transform; `inline` tokens get their `children` replaced in place
+ * @returns a new token array
+ */
+export function htmlTokenTransform(tokenizer: Tokenizer, tokens: Token[]): Token[] {
+  const result: Token[] = [];
+
+  // text reported by the HTML parser is buffered here and only tokenized when it has to be flushed
+  let pendingText = '';
+
+  let insideCDATA = false;
+
+  // tokenize the buffered text with Markdoc's Tokenizer and append the outcome to the result
+  const flushPendingText = () => {
+    if (pendingText.length === 0) {
+      return;
+    }
+
+    // the text has to be tokenized because it may contain structural markup
+    const parsed = tokenizer.tokenize(pendingText);
+
+    // plain text is tokenized like Markdown and comes back wrapped in a single paragraph, which
+    // is unwanted here: only the paragraph's inline children should be emitted
+    const isLoneParagraph =
+      parsed.length === 3 &&
+      parsed[0].type === 'paragraph_open' &&
+      parsed[1].type === 'inline' &&
+      parsed[2].type === 'paragraph_close' &&
+      Array.isArray(parsed[1].children);
+
+    if (isLoneParagraph) {
+      for (const child of parsed[1].children as Token[]) {
+        // tokenizing pure text loses leading/trailing whitespace; when a text token differs from
+        // the buffer only by that whitespace, restore the original buffer
+        if (child.type === 'text' && child.content.trim() === pendingText.trim()) {
+          child.content = pendingText;
+        }
+        result.push(child);
+      }
+    } else {
+      // any other markup (even if it happens to be three tokens long) is emitted as-is
+      for (const parsedToken of parsed) {
+        result.push(parsedToken);
+      }
+    }
+
+    // start over with an empty buffer
+    pendingText = '';
+  };
+
+  // incremental HTML parser that reports element open, element close and text content
+  const parser = new Parser(
+    {
+      oncdatastart: () => {
+        insideCDATA = true;
+      },
+
+      oncdataend: () => {
+        insideCDATA = false;
+      },
+
+      // an HTML element opens
+      onopentag: (name, attrs) => {
+        // buffered text belongs before the element that is opening
+        flushPendingText();
+
+        // emit the 'tag_open' token of the 'html-tag' Markdoc tag for this element
+        result.push({
+          type: 'tag_open',
+          nesting: 1,
+          meta: {
+            tag: 'html-tag',
+            attributes: [
+              { type: 'attribute', name: 'name', value: name },
+              { type: 'attribute', name: 'attrs', value: attrs },
+            ],
+          },
+        } as Token);
+      },
+
+      ontext: (content: string | null | undefined) => {
+        // text inside CDATA is ignored entirely, and so is anything that is not a string
+        if (!insideCDATA && typeof content === 'string') {
+          pendingText += content;
+        }
+      },
+
+      // an HTML element closes
+      onclosetag: (name) => {
+        // buffered text belongs inside the element that is closing
+        flushPendingText();
+
+        // emit the 'tag_close' token of the 'html-tag' Markdoc tag for this element
+        result.push({
+          type: 'tag_close',
+          nesting: -1,
+          meta: {
+            tag: 'html-tag',
+            attributes: [{ type: 'attribute', name: 'name', value: name }],
+          },
+        } as Token);
+      },
+    },
+    {
+      decodeEntities: false,
+      recognizeCDATA: true,
+      recognizeSelfClosing: true,
+    }
+  );
+
+  for (const token of tokens) {
+    if (token.type.startsWith('html')) {
+      // the parser callbacks above push the replacement tokens; the original raw-HTML token is
+      // deliberately NOT copied to the result
+      parser.write(token.content);
+      continue;
+    }
+
+    // inline tokens can carry HTML in their children
+    if (token.type === 'inline' && token.children) {
+      token.children = htmlTokenTransform(tokenizer, token.children);
+    }
+
+    // not an HTML token: keep it at its current place in the stream
+    result.push(token);
+  }
+
+  // whatever text is still buffered comes last
+  flushPendingText();
+
+  //
+  // post-process this level's result to un-wind the following pattern:
+  //
+  // [
+  //   { type: tag_open, meta.tag: html-tag },
+  //   { type: paragraph_open },
+  //   { type: inline, children [...] },
+  //   { type: paragraph_close },
+  //   { type: tag_close, meta.tag: html-tag }
+  // ]
+  //
+  // the intent is for the children of the inline token to take the place of the paragraph
+  // wrapping, which markdown-it introduces when it processes content with HTML enabled
+  //
+  mutateAndCollapseExtraParagraphsUnderHtml(result);
+
+  return result;
+}
+
+/**
+ * Removes, in place, the paragraph wrapping that sits directly inside an HTML element.
+ *
+ * Whenever `tokens` contains the five-token run
+ * `html-tag tag_open, paragraph_open, inline, paragraph_close, html-tag tag_close`,
+ * the middle three tokens are replaced by the children of the `inline` token. The surrounding
+ * `html-tag` open/close tokens are kept, so the HTML element itself survives.
+ *
+ * @param tokens the token list to rewrite; it is mutated
+ */
+function mutateAndCollapseExtraParagraphsUnderHtml(tokens: Token[]): void {
+  let idx = findExtraParagraphUnderHtml(tokens);
+
+  while (typeof idx === 'number') {
+    // tokens[idx + 2] is the inline token of the matched run
+    const actualChildTokens = tokens[idx + 2].children ?? [];
+
+    // replace only paragraph_open / inline / paragraph_close (idx + 1 .. idx + 3);
+    // tokens[idx] and tokens[idx + 4] are the element's own open and close tokens
+    tokens.splice(idx + 1, 3, ...actualChildTokens);
+
+    idx = findExtraParagraphUnderHtml(tokens);
+  }
+}
+
+
+/**
+ * Looks for the first run of five consecutive tokens accepted by isExtraParagraphPatternMatch.
+ *
+ * @param tokens the token list to scan
+ * @returns the index at which the first matching run starts, or null when there is none
+ */
+function findExtraParagraphUnderHtml(tokens: Token[]): number | null {
+  const windowSize = 5;
+
+  // last index at which a complete window still fits; negative when the list is too short
+  const lastStart = tokens.length - windowSize;
+
+  for (let start = 0; start <= lastStart; start++) {
+    const candidate = tokens.slice(start, start + windowSize);
+    if (isExtraParagraphPatternMatch(candidate)) {
+      return start;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Tells whether the first five tokens of `slice` are, in order: an `html-tag` open, a paragraph
+ * open, an inline token, a paragraph close and an `html-tag` close.
+ */
+function isExtraParagraphPatternMatch(slice: Token[]): boolean {
+  const [htmlOpen, paragraphOpen, inline, paragraphClose, htmlClose] = slice;
+
+  return (
+    isHtmlTagOpen(htmlOpen) &&
+    isParagraphOpen(paragraphOpen) &&
+    isInline(inline) &&
+    isParagraphClose(paragraphClose) &&
+    isHtmlTagClose(htmlClose)
+  );
+}
+
+
+/**
+ * Tells whether `token` is the `tag_open` token of an `html-tag` Markdoc tag.
+ * A token without `meta` is simply not a match, and the result is always a real boolean.
+ */
+function isHtmlTagOpen(token: Token): boolean {
+  return token.type === 'tag_open' && token.meta?.tag === 'html-tag';
+}
+
+/**
+ * Tells whether `token` is the `tag_close` token of an `html-tag` Markdoc tag.
+ * A token without `meta` is simply not a match, and the result is always a real boolean.
+ */
+function isHtmlTagClose(token: Token): boolean {
+  return token.type === 'tag_close' && token.meta?.tag === 'html-tag';
+}
+
+/** Tells whether `token` has the type `paragraph_open`. */
+function isParagraphOpen(token: Token): boolean {
+  const { type } = token;
+  return type === 'paragraph_open';
+}
+
+/** Tells whether `token` has the type `paragraph_close`. */
+function isParagraphClose(token: Token): boolean {
+  const { type } = token;
+  return type === 'paragraph_close';
+}
+
+/** Tells whether `token` has the type `inline`. */
+function isInline(token: Token): boolean {
+  const { type } = token;
+  return type === 'inline';
+}