diff --git a/lib/index.js b/lib/index.js
index 1a5b95d..ce6587c 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -1,11 +1,12 @@
 /**
  * @typedef {import('vfile').VFile} VFile
- * @typedef {import('parse5').Document} P5Document
- * @typedef {import('parse5').DocumentFragment} P5Fragment
- * @typedef {Omit<import('parse5').Element, 'parentNode'>} P5Element
- * @typedef {import('parse5').Attribute} P5Attribute
- * @typedef {Omit<import('parse5').Location, 'startOffset' | 'endOffset'> & {startOffset: number|undefined, endOffset: number|undefined}} P5Location
- * @typedef {import('parse5').ParserOptions} P5ParserOptions
+ * @typedef {import('parse5').DefaultTreeAdapterMap} P5Tree
+ * @typedef {P5Tree['document']} P5Document
+ * @typedef {P5Tree['documentFragment']} P5Fragment
+ * @typedef {import('parse5').Token.Attribute} P5Attribute
+ * @typedef {Omit<import('parse5').Token.Location, 'startOffset' | 'endOffset'> & {startOffset: number|undefined, endOffset: number|undefined}} P5Location
+ * @typedef {import('parse5').ParserOptions<P5Tree>} P5ParserOptions
+ * @typedef {import('unist').Point} Point
  * @typedef {import('hast').Root} Root
  * @typedef {import('hast').DocType} Doctype
  * @typedef {import('hast').Element} Element
@@ -18,76 +19,21 @@
  * @typedef {Omit<Comment, 'value'> & {value: {stitch: Node}}} Stitch
  *
  * @typedef Options
+ *   Configuration (optional).
  * @property {Array<string>} [passThrough]
  *   List of custom hast node types to pass through (keep) in hast.
  *   If the passed through nodes have children, those children are expected to
  *   be hast and will be handled.
- *
- * @typedef HiddenTokenizer
- * @property {Array<HiddenLocationTracker>} __mixins
- *   Way too simple, but works for us.
- * @property {HiddenPreprocessor} preprocessor
- * @property {(value: string) => void} write
- * @property {() => number} _consume
- * @property {Array<HiddenToken>} tokenQueue
- * @property {string} state
- * @property {string} returnState
- * @property {number} charRefCode
- * @property {Array<number>} tempBuff
- * @property {Function} _flushCodePointsConsumedAsCharacterReference
- * @property {string} lastStartTagName
- * @property {number} consumedAfterSnapshot
- * @property {boolean} active
- * @property {HiddenToken|undefined} currentCharacterToken
- * @property {HiddenToken|undefined} currentToken
- * @property {unknown} currentAttr
- * @property {Function} NAMED_CHARACTER_REFERENCE_STATE
- * @property {Function} NUMERIC_CHARACTER_REFERENCE_END_STATE
- *
- * @typedef {Record<string, unknown> & {location: P5Location}} HiddenToken
- *
- * @typedef HiddenPreprocessor
- * @property {string|undefined} html
- * @property {number} pos
- * @property {number} lastGapPos
- * @property {number} lastCharPos
- * @property {Array<number>} gapStack
- * @property {boolean} skipNextNewLine
- * @property {boolean} lastChunkWritten
- * @property {boolean} endOfChunkHit
- *
- * @typedef HiddenLocationTracker
- * @property {P5Location|undefined} currentAttrLocation
- * @property {P5Location} ctLoc
- * @property {HiddenPosTracker} posTracker
- *
- * @typedef HiddenPosTracker
- * @property {boolean} isEol
- * @property {number} lineStartPos
- * @property {number} droppedBufferSize
- * @property {number} offset
- * @property {number} col
- * @property {number} line
  */

-// @ts-expect-error: untyped.
-import Parser from 'parse5/lib/parser/index.js'
+import {Parser, Token, TokenizerMode, html} from 'parse5'
 import {pointStart, pointEnd} from 'unist-util-position'
 import {visit} from 'unist-util-visit'
 import {fromParse5} from 'hast-util-from-parse5'
 import {toParse5} from 'hast-util-to-parse5'
 import {htmlVoidElements} from 'html-void-elements'
-import {webNamespaces} from 'web-namespaces'
 import {zwitch} from 'zwitch'

-const inTemplateMode = 'IN_TEMPLATE_MODE'
-const dataState = 'DATA_STATE'
-const characterToken = 'CHARACTER_TOKEN'
-const startTagToken = 'START_TAG_TOKEN'
-const endTagToken = 'END_TAG_TOKEN'
-const commentToken = 'COMMENT_TOKEN'
-const doctypeToken = 'DOCTYPE_TOKEN'
-
 /** @type {P5ParserOptions} */
 const parseOptions = {sourceCodeLocationInfo: true, scriptingEnabled: false}

@@ -117,7 +63,7 @@ export const raw =
      */
     function (tree, file, options) {
       let index = -1
-      const parser = new Parser(parseOptions)
+      /** @type {(node: Node, parser: Parser<P5Tree>) => void} */
       const one = zwitch('type', {
         handlers: {root, element, text, comment, doctype, raw: handleRaw},
         // @ts-expect-error: hush.
@@ -125,14 +71,6 @@ export const raw =
       })
       /** @type {boolean|undefined} */
       let stitches
-      /** @type {HiddenTokenizer|undefined} */
-      let tokenizer
-      /** @type {HiddenPreprocessor|undefined} */
-      let preprocessor
-      /** @type {HiddenPosTracker|undefined} */
-      let posTracker
-      /** @type {HiddenLocationTracker|undefined} */
-      let locationTracker

       if (isOptions(file)) {
         options = file
@@ -146,10 +84,8 @@ export const raw =
         }
       }

-      const result = fromParse5(
-        documentMode(tree) ? document() : fragment(),
-        file
-      )
+      const p5 = documentMode(tree) ? document() : fragment()
+      const result = fromParse5(p5, file)

       if (stitches) {
         visit(result, 'comment', (node, index, parent) => {
@@ -177,215 +113,213 @@ export const raw =
        * @returns {P5Fragment}
        */
      function fragment() {
-        /** @type {P5Element} */
-        const context = {
-          nodeName: 'template',
-          tagName: 'template',
-          attrs: [],
-          namespaceURI: webNamespaces.html,
-          childNodes: []
-        }
-        /** @type {P5Element} */
-        const mock = {
-          nodeName: 'documentmock',
-          tagName: 'documentmock',
-          attrs: [],
-          namespaceURI: webNamespaces.html,
-          childNodes: []
-        }
-        /** @type {P5Fragment} */
-        const doc = {nodeName: '#document-fragment', childNodes: []}
-
-        parser._bootstrap(mock, context)
-        parser._pushTmplInsertionMode(inTemplateMode)
-        parser._initTokenizerForFragmentParsing()
-        parser._insertFakeRootElement()
-        parser._resetInsertionMode()
-        parser._findFormInFragmentContext()
-
-        tokenizer = parser.tokenizer
-        /* c8 ignore next */
-        if (!tokenizer) throw new Error('Expected `tokenizer`')
-        preprocessor = tokenizer.preprocessor
-        locationTracker = tokenizer.__mixins[0]
-        posTracker = locationTracker.posTracker
-
-        one(tree)
-
-        resetTokenizer()
-
-        parser._adoptNodes(mock.childNodes[0], doc)
-
-        return doc
+        /** @type {Parser<P5Tree>} */
+        const p = Parser.getFragmentParser(null, parseOptions)
+        one(tree, p)
+        resetTokenizer(p, pointStart(undefined))
+        return p.getFragment()
      }

      /**
       * @returns {P5Document}
       */
      function document() {
-        /** @type {P5Document} */
-        const doc = parser.treeAdapter.createDocument()
-
-        parser._bootstrap(doc, undefined)
-        tokenizer = parser.tokenizer
-        /* c8 ignore next */
-        if (!tokenizer) throw new Error('Expected `tokenizer`')
-        preprocessor = tokenizer.preprocessor
-        locationTracker = tokenizer.__mixins[0]
-        posTracker = locationTracker.posTracker
-
-        one(tree)
-
-        resetTokenizer()
-
-        return doc
+        /** @type {Parser<P5Tree>} */
+        const p = new Parser(parseOptions)
+        one(tree, p)
+        resetTokenizer(p, pointStart(undefined))
+        return p.document
      }

      /**
       * @param {Array<Node>} nodes
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function all(nodes) {
+     function all(nodes, p) {
        let index = -1

        /* istanbul ignore else - invalid nodes, see rehypejs/rehype-raw#7. */
        if (nodes) {
          while (++index < nodes.length) {
-            one(nodes[index])
+            one(nodes[index], p)
          }
        }
      }

      /**
       * @param {Root} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function root(node) {
-       all(node.children)
+     function root(node, p) {
+       all(node.children, p)
      }

      /**
       * @param {Element} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function element(node) {
-       resetTokenizer()
-       parser._processInputToken(startTag(node))
+     function element(node, p) {
+       resetTokenizer(p, pointStart(node))
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.currentToken = startTag(node)
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p._processToken(p.currentToken)

-       all(node.children)
+       all(node.children, p)

        if (!htmlVoidElements.includes(node.tagName)) {
-         resetTokenizer()
-         parser._processInputToken(endTag(node))
+         resetTokenizer(p, pointEnd(node))
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.currentToken = endTag(node)
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p._processToken(p.currentToken)
        }
      }

      /**
       * @param {Text} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function text(node) {
-       resetTokenizer()
-       parser._processInputToken({
-         type: characterToken,
+     function text(node, p) {
+       /** @type {import('parse5/dist/common/token').CharacterToken} */
+       const token = {
+         type: Token.TokenType.CHARACTER,
         chars: node.value,
+         // @ts-expect-error: fine.
         location: createParse5Location(node)
-       })
+       }
+
+       resetTokenizer(p, pointStart(node))
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.currentToken = token
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p._processToken(p.currentToken)
      }

      /**
       * @param {Doctype} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function doctype(node) {
-       resetTokenizer()
-       parser._processInputToken({
-         type: doctypeToken,
+     function doctype(node, p) {
+       /** @type {import('parse5/dist/common/token').DoctypeToken} */
+       const token = {
+         type: Token.TokenType.DOCTYPE,
         name: 'html',
         forceQuirks: false,
         publicId: '',
         systemId: '',
+         // @ts-expect-error: fine.
         location: createParse5Location(node)
-       })
+       }
+
+       resetTokenizer(p, pointStart(node))
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.currentToken = token
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p._processToken(p.currentToken)
      }

      /**
       * @param {Comment|Stitch} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function comment(node) {
-       resetTokenizer()
-       parser._processInputToken({
-         type: commentToken,
+     function comment(node, p) {
+       /** @type {import('parse5/dist/common/token').CommentToken} */
+       const token = {
+         type: Token.TokenType.COMMENT,
+         // @ts-expect-error: yeah, we’re passing stitches through.
         data: node.value,
+         // @ts-expect-error: fine.
         location: createParse5Location(node)
-       })
+       }
+       resetTokenizer(p, pointStart(node))
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.currentToken = token
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p._processToken(p.currentToken)
      }

      /**
       * @param {Raw} node
+      * @param {Parser<P5Tree>} p
       * @returns {void}
       */
-     function handleRaw(node) {
-       const start = pointStart(node)
-       const line = start.line || 1
-       const column = start.column || 1
-       const offset = start.offset || 0
-
-       /* c8 ignore next 4 */
-       if (!preprocessor) throw new Error('Expected `preprocessor`')
-       if (!tokenizer) throw new Error('Expected `tokenizer`')
-       if (!posTracker) throw new Error('Expected `posTracker`')
-       if (!locationTracker) throw new Error('Expected `locationTracker`')
-
+     function handleRaw(node, p) {
        // Reset preprocessor:
-       // See: .
-       preprocessor.html = undefined
-       preprocessor.pos = -1
-       preprocessor.lastGapPos = -1
-       preprocessor.lastCharPos = -1
-       preprocessor.gapStack = []
-       preprocessor.skipNextNewLine = false
-       preprocessor.lastChunkWritten = false
-       preprocessor.endOfChunkHit = false
-
-       // Reset preprocessor mixin:
-       // See: .
-       posTracker.isEol = false
-       posTracker.lineStartPos = -column + 1 // Looks weird, but ensures we get correct positional info.
-       posTracker.droppedBufferSize = offset
-       posTracker.offset = 0
-       posTracker.col = 1
-       posTracker.line = line
-
-       // Reset location tracker:
-       // See: .
-       locationTracker.currentAttrLocation = undefined
-       locationTracker.ctLoc = createParse5Location(node)
-
-       // See the code for `parse` and `parseFragment`:
-       // See: .
-       tokenizer.write(node.value)
-       parser._runParsingLoop(null)
+       // See: .
+       p.tokenizer.preprocessor.html = ''
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.pos = -1
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.lastGapPos = -2
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.gapStack = []
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.skipNextNewLine = false
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.lastChunkWritten = false
+       p.tokenizer.preprocessor.endOfChunkHit = false
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.preprocessor.isEol = false
+
+       // Now pass `node.value`.
+       setPoint(p, pointStart(node))
+       p.tokenizer.write(node.value, false)
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer._runParsingLoop()

        // Character references hang, so if we ended there, we need to flush
        // those too.
        // We reset the preprocessor as if the document ends here.
        // Then one single call to the relevant state does the trick, parse5
        // consumes the whole token.
+
+       // Note: `State` is not exposed by `parse5`, so these numbers are fragile.
        if (
-         tokenizer.state === 'NAMED_CHARACTER_REFERENCE_STATE' ||
-         tokenizer.state === 'NUMERIC_CHARACTER_REFERENCE_END_STATE'
+         p.tokenizer.state === 72 /* NAMED_CHARACTER_REFERENCE */ ||
+         p.tokenizer.state === 78 /* NUMERIC_CHARACTER_REFERENCE_END */
        ) {
-         preprocessor.lastChunkWritten = true
-         tokenizer[tokenizer.state](tokenizer._consume())
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.tokenizer.preprocessor.lastChunkWritten = true
+         /** @type {number} */
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         const cp = p.tokenizer._consume()
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.tokenizer._callState(cp)
        }
      }

      /**
       * @param {Node} node
+      * @param {Parser<P5Tree>} p
       */
-     function stitch(node) {
+     function stitch(node, p) {
        stitches = true

        /** @type {Node} */
@@ -410,64 +344,124 @@ export const raw =
        // Hack: `value` is supposed to be a string, but as none of the tools
        // (`parse5` or `hast-util-from-parse5`) looks at it, we can pass nodes
        // through.
-       comment({type: 'comment', value: {stitch: clone}})
+       comment({type: 'comment', value: {stitch: clone}}, p)
      }

-     function resetTokenizer() {
-       /* c8 ignore next 2 */
-       if (!tokenizer) throw new Error('Expected `tokenizer`')
-       if (!posTracker) throw new Error('Expected `posTracker`')
+     /**
+      * @param {Parser<P5Tree>} p
+      * @param {Point} point
+      */
+     function setPoint(p, point) {
+       if (
+         point.line &&
+         point.column &&
+         point.offset !== null &&
+         point.offset !== undefined
+       ) {
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.tokenizer.preprocessor.lineStartPos = -point.column + 1 // Looks weird, but ensures we get correct positional info.
+         p.tokenizer.preprocessor.droppedBufferSize = point.offset
+         p.tokenizer.preprocessor.line = point.line
+
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.tokenizer.currentLocation = {
+           startLine: point.line,
+           startCol: point.column,
+           startOffset: point.offset,
+           endLine: -1,
+           endCol: -1,
+           endOffset: -1
+         }
+       }
+     }
+
+     /**
+      * @param {Parser<P5Tree>} p
+      * @param {Point} point
+      */
+     function resetTokenizer(p, point) {
+       setPoint(p, point)

        // Process final characters if they’re still there after hibernating.
        // Similar to:
        // See: .
-       const token = tokenizer.currentCharacterToken
-
-       if (token) {
-         token.location.endLine = posTracker.line
-         token.location.endCol = posTracker.col + 1
-         token.location.endOffset = posTracker.offset + 1
-         parser._processInputToken(token)
+       /** @type {import('parse5/dist/common/token').CharacterToken} */
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       const token = p.tokenizer.currentCharacterToken
+
+       if (token && token.location) {
+         token.location.endLine = p.tokenizer.preprocessor.line
+         token.location.endCol = p.tokenizer.preprocessor.col + 1
+         token.location.endOffset = p.tokenizer.preprocessor.offset + 1
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p.currentToken = token
+         // @ts-expect-error: private.
+         // type-coverage:ignore-next-line
+         p._processToken(p.currentToken)
        }

        // Reset tokenizer:
-       // See: .
+       // See: .
        // Especially putting it back in the `data` state is useful: some elements,
        // like textareas and iframes, change the state.
        // See GH-7.
        // But also if broken HTML is in `raw`, and then a correct element is given.
        // See GH-11.
-       tokenizer.tokenQueue = []
-       tokenizer.state = dataState
-       tokenizer.returnState = ''
-       tokenizer.charRefCode = -1
-       tokenizer.tempBuff = []
-       tokenizer.lastStartTagName = ''
-       tokenizer.consumedAfterSnapshot = -1
-       tokenizer.active = false
-       tokenizer.currentCharacterToken = undefined
-       tokenizer.currentToken = undefined
-       tokenizer.currentAttr = undefined
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.paused = false
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.inLoop = false
+
+       // Note: don’t reset `inForeignNode` so that the state of HTML in SVG
+       // in HTML etc is kept.
+
+       p.tokenizer.lastStartTagName = ''
+       p.tokenizer.active = false
+       p.tokenizer.state = TokenizerMode.DATA
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.returnState = TokenizerMode.DATA
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.charRefCode = -1
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.consumedAfterSnapshot = -1
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.currentLocation = null
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.currentCharacterToken = null
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.currentToken = null
+       // @ts-expect-error: private.
+       // type-coverage:ignore-next-line
+       p.tokenizer.currentAttr = {name: '', value: ''}
      }
    }
  )
+
 /**
  * @param {Element} node
- * @returns {HiddenToken}
+ * @returns {import('parse5/dist/common/token').TagToken}
  */
 function startTag(node) {
-  /** @type {P5Location} */
-  const location = Object.assign(createParse5Location(node))
-  // @ts-expect-error extra positional info.
-  location.startTag = Object.assign({}, location)
-
-  // Untyped token.
   return {
-    type: startTagToken,
+    type: Token.TokenType.START_TAG,
     tagName: node.tagName,
+    tagID: html.getTagID(node.tagName),
     selfClosing: false,
+    ackSelfClosing: false,
     attrs: attributes(node),
-    location
+    // @ts-expect-error: fine.
+    location: createParse5Location(node)
   }
 }

@@ -476,31 +470,31 @@ function startTag(node) {
 /**
  * @param {Element} node
  * @returns {Array<P5Attribute>}
  */
 function attributes(node) {
-  return toParse5({
+  const result = toParse5({
     tagName: node.tagName,
     type: 'element',
     properties: node.properties,
     children: []
-    // @ts-expect-error Assume element.
-  }).attrs
+  })
+  // Always element.
+  /* c8 ignore next */
+  return 'attrs' in result ? result.attrs : []
 }

 /**
  * @param {Element} node
- * @returns {HiddenToken}
+ * @returns {import('parse5/dist/common/token').TagToken}
  */
 function endTag(node) {
-  /** @type {P5Location} */
-  const location = Object.assign(createParse5Location(node))
-  // @ts-expect-error extra positional info.
-  location.startTag = Object.assign({}, location)
-
-  // Untyped token.
   return {
-    type: endTagToken,
+    type: Token.TokenType.END_TAG,
     tagName: node.tagName,
+    tagID: html.getTagID(node.tagName),
+    selfClosing: false,
+    ackSelfClosing: false,
     attrs: [],
-    location
+    // @ts-expect-error: fine.
+    location: createParse5Location(node)
   }
 }
diff --git a/package.json b/package.json
index 01a6e1b..7afffc0 100644
--- a/package.json
+++ b/package.json
@@ -36,15 +36,13 @@
   ],
   "dependencies": {
     "@types/hast": "^2.0.0",
-    "@types/parse5": "^6.0.0",
     "hast-util-from-parse5": "^7.0.0",
     "hast-util-to-parse5": "^7.0.0",
     "html-void-elements": "^2.0.0",
-    "parse5": "^6.0.0",
+    "parse5": "^7.0.0",
     "unist-util-position": "^4.0.0",
     "unist-util-visit": "^4.0.0",
     "vfile": "^5.0.0",
-    "web-namespaces": "^2.0.0",
     "zwitch": "^2.0.0"
   },
   "devDependencies": {
diff --git a/test.js b/test.js
index b20841c..3b91713 100644
--- a/test.js
+++ b/test.js
@@ -359,7 +359,7 @@ test('integration', (t) => {
               value: 'Hello, world!\n',
               position: {
                 start: {line: 3, column: 4, offset: 22},
-                end: null
+                end: undefined
               }
             }
           ],
@@ -645,13 +645,13 @@ test('integration', (t) => {
               value: 'Hello, world!',
               position: {
                 start: {line: 20, column: 4, offset: 256},
-                end: {line: 20, column: 17, offset: 270}
+                end: {line: 20, column: 17, offset: 269}
               }
             }
           ],
           position: {
             start: {line: 20, column: 1, offset: 253},
-            end: {line: 20, column: 17, offset: 270}
+            end: {line: 20, column: 17, offset: 269}
           }
         }
       ],
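
Usage sketch (not part of the patch): the public API of `raw` is unchanged by
this migration; only the internals now drive parse5 v7. A minimal example,
assuming `hastscript` and `unist-builder` to build the input tree:

    import {raw} from 'hast-util-raw'
    import {h} from 'hastscript'
    import {u} from 'unist-builder'

    // A hast tree with an embedded `raw` node holding unparsed HTML.
    const tree = h('div', [u('raw', '<h1>Hello, <em>world</em>!</h1>')])

    // `raw` re-tokenizes the embedded HTML (with parse5 v7 after this patch)
    // and folds the resulting element nodes back into the surrounding tree.
    const reparsed = raw(tree)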