/** * Set of nodes that have already been processed, used to avoid duplicating text extraction. * * @type {Set} */ #processedNodes = new Set(); /** * The text-extraction options, provided at initialization. * * @type {GetTextOptions} */ #options; /** * The accumulated text content that has been extracted from the DOM. * * @type {string} */ #textContent = ""; /** * When extracting content just from the viewport, this value will be set. * * @type {{ top: number; left: number; right: number; bottom: number } | null} */ #viewportRect = null; /** * Constructs a new extraction context with the provided options. * * @param {Document} document * @param {GetTextOptions} options */ constructor(document, options) { this.#options = options; if (options.justViewport) { const { visualViewport } = document.defaultView; const { offsetTop, offsetLeft, width, height } = visualViewport; this.#viewportRect = { top: offsetTop, left: offsetLeft, right: offsetLeft + width, bottom: offsetTop + height, }; } } /** * Accumulated text content produced during traversal. * * @returns {string} */ get textContent() { return this.#textContent; } /** * Returns true if a condition has been met such that the text * extraction should stop early, otherwise false. * * @returns {boolean} */ shouldStopExtraction() { const { sufficientLength } = this.#options; if ( sufficientLength !== undefined && this.#textContent.length >= sufficientLength ) { return true; } return false; } /** * Returns true if this node or its ancestor's text content has * already been extracted from the DOM. * * @param {Node} node */ #isNodeProcessed(node) { if (this.#processedNodes.has(node)) { return true; } for (const ancestor of getAncestorsIterator(node)) { if (this.#processedNodes.has(ancestor)) { return true; } } return false; } /** * When capturing content only in the viewport, skip nodes that are outside of it. * * @param {Node} node */ maybeOutOfViewport(node) { if (!this.#viewportRect) { // We don't have a viewport rect, so skip this check. return false; } const element = getHTMLElementForStyle(node); if (!element) { return false; } const rect = element.getBoundingClientRect(); if (!rect) { return false; } return ( rect.bottom <= this.#viewportRect.top || rect.top >= this.#viewportRect.bottom || rect.right <= this.#viewportRect.left || rect.left >= this.#viewportRect.right ); } /** * Append the node's text content to the accumulated text only if the node * itself as well as no ancestor of the node has already been processed. * * @param {Node} node */ maybeAppendTextContent(node) { if (this.#isNodeProcessed(node)) { return; } this.#processedNodes.add(node); if (isNodeHidden(node)) { return; } if (this.maybeOutOfViewport(node)) { // This only can return true when we're capturing just the viewport nodes. return; } const element = asHTMLElement(node); const text = asTextNode(node); let innerText = ""; if (element) { innerText = element.innerText.trim(); } else if (text?.nodeValue) { innerText = text.nodeValue.trim(); } if (innerText) { this.#textContent += "\n" + innerText; } } } /** * Extracts visible text content from the DOM. * * By default, this extracts content from the entire page. * * Callers may specify filters for the extracted text via * the supported options @see {GetTextOptions}. * * @param {Document} document * @param {GetTextOptions} options * * @returns {string} */ export function extractTextFromDOM(document, options) { const context = new ExtractionContext(document, options); subdivideAndExtractText(document.body, context); return context.textContent.trim(); } /** * Tags excluded from text extraction. */ const CONTENT_EXCLUDED_TAGS = new Set([ // TODO - We should add this and write some tests. "CODE", // The following are deprecated tags. "DIR", "APPLET", // The following are embedded elements, and are not supported (yet). "MATH", "EMBED", "OBJECT", "IFRAME", // This is an SVG tag that can contain arbitrary XML, ignore it. "METADATA", // These are elements that are treated as opaque by Firefox which causes their // innerHTML property to be just the raw text node behind it. Any text that is sent as // HTML must be valid, and there is no guarantee that the innerHTML is valid. "NOSCRIPT", "NOEMBED", "NOFRAMES", // Do not parse the HEAD tag. "HEAD", // These are not user-visible tags. "STYLE", "SCRIPT", "TEMPLATE", ]); const CONTENT_EXCLUDED_NODE_SELECTOR = [...CONTENT_EXCLUDED_TAGS].join(","); /** * Get the ShadowRoot from the chrome-only openOrClosedShadowRoot API. * This allows for extracting the content from WebComponents, which is not * normally feasible in non-privileged contexts. * * @param {Node} node * * @returns {ShadowRoot | null} */ function getShadowRoot(node) { return asElement(node)?.openOrClosedShadowRoot ?? null; } /** * Determines if a node is ready for text extraction, or if it should be subdivided * further. It doesn't check if the node has already been processed. This id done * at the block level. * * @param {Node} node * @returns {number} - NodeFilter acceptance status. */ function determineBlockStatus(node) { if (!node) { return NodeFilter.FILTER_REJECT; } if (getShadowRoot(node)) { return NodeFilter.FILTER_ACCEPT; } if (isExcludedNode(node)) { // This is an explicit. return NodeFilter.FILTER_REJECT; } if ( containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) && !hasNonWhitespaceTextNodes(node) ) { // Skip this node, and dig deeper into its tree to cut off smaller pieces to extract. return NodeFilter.FILTER_SKIP; } if (nodeNeedsSubdividing(node)) { // Skip this node, and dig deeper into its tree to cut off smaller pieces // to extract. It is presumed to be a wrapper of block elements. return NodeFilter.FILTER_SKIP; } // This textContent call is fairly expensive. if (!node.textContent?.trim().length) { // Do not use subtrees that are empty of text. return !node.hasChildNodes() ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_SKIP; } // This node can be treated as entire block and is ready for text extraction. return NodeFilter.FILTER_ACCEPT; } /** * Determine if this element is an inline element or a block element. * * @param {Node} node * @returns {boolean} */ function nodeNeedsSubdividing(node) { const element = asElement(node); if (!element) { // Only elements need to be further subdivided. return false; } for (let childNode of element.childNodes) { if (!childNode) { continue; } switch (childNode.nodeType) { case Node.TEXT_NODE: { // Keep checking for more inline or text nodes. continue; } case Node.ELEMENT_NODE: { if (getIsBlockLike(childNode)) { // This node is a block node, so it needs further subdividing. return true; } else if (nodeNeedsSubdividing(childNode)) { // This non-block-like node may contain other block-like nodes. return true; } // Keep checking for more inline or text nodes. continue; } default: { return true; } } } return false; } /** * Returns true if a node is hidden based on factors such as collapsed state and * computed style, otherwise false. * * @param {Node} node * @returns {boolean} */ function isNodeHidden(node) { const element = getHTMLElementForStyle(node); if (!element) { return true; } // This is a cheap and easy check that will not compute style or force reflow. if (element.hidden) { // The element is explicitly hidden. return true; } // Handle open/closed
elements. This will also not compute style or force reflow. // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details if ( // The element is within a closed
element.closest("details:not([open])") && // The element is not part of the of the
, which is always visible, even when closed. !element.closest("summary") ) { // The element is within a closed
and is not part of the , therefore it is not visible. return true; } // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible. // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10 if ( !( element.offsetWidth || element.offsetHeight || element.getClientRects().length ) ) { return true; } // The element may still have a zero-sized bounding client rectangle. const boundingClientRect = element.getBoundingClientRect(); if ( boundingClientRect && (boundingClientRect.width === 0 || boundingClientRect.height === 0) ) { return true; } const { ownerGlobal } = element; if (!ownerGlobal) { // We cannot compute the style without ownerGlobal, so we will assume it is not visible. return true; } // This flushes the style, which is a performance cost. const style = ownerGlobal.getComputedStyle(element); if (!style) { // We were unable to compute the style, so we will assume it is not visible. return true; } // This is an issue with the DOM library generation. const { display, visibility, opacity } = style; return ( display === "none" || visibility === "hidden" || visibility === "collapse" || opacity === "0" ); } /** * @param {Node} node */ function isExcludedNode(node) { // Property access be expensive, so destructure required properties so they are // not accessed multiple times. const { nodeType } = node; if (nodeType === Node.TEXT_NODE) { // Text nodes are never excluded. return false; } const element = asElement(node); if (!element) { // Only elements and and text nodes should be considered. return true; } const { nodeName } = element; if (CONTENT_EXCLUDED_TAGS.has(nodeName.toUpperCase())) { // SVG tags can be lowercased, so ensure everything is uppercased. // This is an excluded tag. return true; } return false; } /** * Like `#isExcludedNode` but looks at the full subtree. Used to see whether * we can consider a subtree, or whether we should split it into smaller * branches first to try to exclude more of the content. * * @param {Node} node * @param {string} excludedNodeSelector * * @returns {boolean} */ function containsExcludedNode(node, excludedNodeSelector) { return Boolean(asElement(node)?.querySelector(excludedNodeSelector)); } /** * Test whether any of the direct child text nodes of are non-whitespace text nodes. * * For example: * - `

test

`: yes * - `

`: no * - `

test

`: no * * @param {Node} node * * @returns {boolean} */ function hasNonWhitespaceTextNodes(node) { if (node.nodeType !== Node.ELEMENT_NODE) { // Only check element nodes. return false; } for (const child of node.childNodes) { const textNode = asTextNode(child); if (textNode) { if (!textNode.textContent?.trim()) { // This is just whitespace. continue; } // A text node with content was found. return true; } } // No text nodes were found. return false; } /** * Start walking down through a node's subtree and decide which nodes to extract content * from. This first node is the root of the page. * * The nodes go through a process of subdivision until an appropriate sized chunk * of inline text can be found. * * @param {Node} node * @param {ExtractionContext} context */ function subdivideAndExtractText(node, context) { if (context.shouldStopExtraction()) { return; } switch (determineBlockStatus(node)) { case NodeFilter.FILTER_REJECT: { // This node is rejected as it shouldn't be used for text extraction. return; } // Either a shadow host or a block element case NodeFilter.FILTER_ACCEPT: { const shadowRoot = getShadowRoot(node); if (shadowRoot) { processSubdivide(shadowRoot, context); } else { context.maybeAppendTextContent(node); } break; } case NodeFilter.FILTER_SKIP: { // This node may have text to extract, but it needs to be subdivided into smaller // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes // that contain enough inline elements to extract. processSubdivide(node, context); break; } } } /** * Add qualified nodes to have their text content extracted by recursively walking * through the DOM tree of nodes, including elements in the Shadow DOM. * * @param {Node} node * @param {ExtractionContext} context */ function processSubdivide(node, context) { if (context.shouldStopExtraction()) { return; } const { ownerDocument } = node; if (!ownerDocument) { return; } // This iterator will contain each node that has been subdivided enough to have its // text extracted. const nodeIterator = ownerDocument.createTreeWalker( node, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, determineBlockStatus ); let currentNode; while ((currentNode = nodeIterator.nextNode())) { const shadowRoot = getShadowRoot(currentNode); if (shadowRoot) { processSubdivide(shadowRoot, context); } else { context.maybeAppendTextContent(currentNode); } if (context.shouldStopExtraction()) { return; } } } /** * Returns an iterator of a node's ancestors. * * @param {Node} node * * @yields {Node} */ function* getAncestorsIterator(node) { const document = node.ownerDocument; if (!document) { return; } for ( let parent = node.parentNode; parent && parent !== document.documentElement; parent = parent.parentNode ) { yield parent; } } /** * Reads the elements computed style and determines if the element is a block-like * element or not. Every element that lays out like a block should be used as a unit * for text extraction. * * @param {Node} node * @returns {boolean} */ function getIsBlockLike(node) { const element = asElement(node); if (!element) { return false; } const { ownerGlobal } = element; if (!ownerGlobal) { return false; } if (element.namespaceURI === "http://www.w3.org/2000/svg") { // SVG elements will report as inline, but there is no block layout in SVG. // Treat every SVG element as being block so that every node will be subdivided. return true; } /** @type {Record} */ // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable. const style = ownerGlobal.getComputedStyle(element) ?? { display: null }; return style.display !== "inline" && style.display !== "none"; } /** * Use TypeScript to determine if the Node is an Element. * * @param {Node | null | undefined} node * @returns {Element | null} */ function asElement(node) { if (node?.nodeType === Node.ELEMENT_NODE) { return /** @type {HTMLElement} */ (node); } return null; } /** * Use TypeScript to determine if the Node is an Element. * * @param {Node | null} node * * @returns {Text | null} */ function asTextNode(node) { if (node?.nodeType === Node.TEXT_NODE) { return /** @type {Text} */ (node); } return null; } /** * Use TypeScript to determine if the Node is an HTMLElement. * * @param {Node | null} node * * @returns {HTMLElement | null} */ function asHTMLElement(node) { if (HTMLElement.isInstance(node)) { return node; } return null; } /** * This function returns the correct element to determine the * style of node. * * @param {Node} node * * @returns {HTMLElement | null} */ function getHTMLElementForStyle(node) { const element = asHTMLElement(node); if (element) { return element; } if (node.parentElement) { return asHTMLElement(node.parentElement); } // For cases like text node where its parent is ShadowRoot, // we'd like to use flattenedTreeParentNode if (node.flattenedTreeParentNode) { return asHTMLElement(node.flattenedTreeParentNode); } // If the text node is not connected or doesn't have a frame. return null; } PK