/**
   * Set of nodes that have already been processed, used to avoid duplicating text extraction.
   *
   * A node is recorded here as soon as maybeAppendTextContent() reaches it, before the
   * hidden/viewport checks run, so nodes that contributed no text are tracked as well.
   * #isNodeProcessed() also consults this set for each ancestor of the node it is asked about.
   *
   * @type {Set<Node>}
   */
  #processedNodes = new Set();

  /**
   * The text-extraction options, provided at initialization.
   *
   * Stored as-is by the constructor; `sufficientLength` is read from it each time
   * shouldStopExtraction() is called.
   *
   * @type {GetTextOptions}
   */
  #options;

  /**
   * The accumulated text content that has been extracted from the DOM.
   *
   * Every extracted chunk is appended prefixed with a "\n", so a non-empty value
   * always starts with a newline.
   *
   * @type {string}
   */
  #textContent = "";

  /**
   * When extracting content just from the viewport, this value will be set.
   *
   * It is computed once in the constructor from the window's visualViewport when
   * `options.justViewport` is truthy, and stays null otherwise, which disables the
   * check in maybeOutOfViewport().
   *
   * @type {{ top: number; left: number; right: number; bottom: number } | null}
   */
  #viewportRect = null;

  /**
   * Constructs a new extraction context with the provided options.
   *
   * @param {Document} document
   * @param {GetTextOptions} options
   */
  constructor(document, options) {
    this.#options = options;

    if (options.justViewport) {
      const { visualViewport } = document.defaultView;
      const { offsetTop, offsetLeft, width, height } = visualViewport;
      this.#viewportRect = {
        top: offsetTop,
        left: offsetLeft,
        right: offsetLeft + width,
        bottom: offsetTop + height,
      };
    }
  }

  /**
   * Accumulated text content produced during traversal.
   *
   * The value is returned exactly as accumulated: it is not trimmed here, and since
   * each chunk is appended with a leading "\n", a non-empty result begins with a
   * newline. Callers that want clean edges must trim it themselves.
   *
   * @returns {string}
   */
  get textContent() {
    return this.#textContent;
  }

  /**
   * Returns true if a condition has been met such that the text
   * extraction should stop early, otherwise false.
   *
   * @returns {boolean}
   */
  shouldStopExtraction() {
    const { sufficientLength } = this.#options;

    if (
      sufficientLength !== undefined &&
      this.#textContent.length >= sufficientLength
    ) {
      return true;
    }

    return false;
  }

  /**
   * Returns true if this node or its ancestor's text content has
   * already been extracted from the DOM.
   *
   * @param {Node} node
   */
  #isNodeProcessed(node) {
    if (this.#processedNodes.has(node)) {
      return true;
    }

    for (const ancestor of getAncestorsIterator(node)) {
      if (this.#processedNodes.has(ancestor)) {
        return true;
      }
    }
    return false;
  }

  /**
   * When capturing content only in the viewport, skip nodes that are outside of it.
   *
   * @param {Node} node
   */
  maybeOutOfViewport(node) {
    if (!this.#viewportRect) {
      // We don't have a viewport rect, so skip this check.
      return false;
    }
    const element = getHTMLElementForStyle(node);
    if (!element) {
      return false;
    }

    const rect = element.getBoundingClientRect();
    if (!rect) {
      return false;
    }

    return (
      rect.bottom <= this.#viewportRect.top ||
      rect.top >= this.#viewportRect.bottom ||
      rect.right <= this.#viewportRect.left ||
      rect.left >= this.#viewportRect.right
    );
  }

  /**
   * Append the node's text content to the accumulated text only if the node
   * itself as well as no ancestor of the node has already been processed.
   *
   * @param {Node} node
   */
  maybeAppendTextContent(node) {
    if (this.#isNodeProcessed(node)) {
      return;
    }

    this.#processedNodes.add(node);

    if (isNodeHidden(node)) {
      return;
    }

    if (this.maybeOutOfViewport(node)) {
      // This only can return true when we're capturing just the viewport nodes.
      return;
    }

    const element = asHTMLElement(node);
    const text = asTextNode(node);
    let innerText = "";

    if (element) {
      innerText = element.innerText.trim();
    } else if (text?.nodeValue) {
      innerText = text.nodeValue.trim();
    }

    if (innerText) {
      this.#textContent += "\n" + innerText;
    }
  }
}

/**
 * Extracts visible text content from the DOM.
 *
 * By default, this extracts content from the entire page. The traversal starts
 * at `document.body` and the accumulated text is trimmed before it is returned.
 *
 * Callers may specify filters for the extracted text via
 * the supported options @see {GetTextOptions}.
 *
 * @param {Document} document
 * @param {GetTextOptions} [options] - Optional; when omitted, an empty options
 *   object is used so that the whole page is extracted with no early stop.
 *
 * @returns {string}
 */
export function extractTextFromDOM(document, options = {}) {
  const context = new ExtractionContext(document, options);

  subdivideAndExtractText(document.body, context);

  return context.textContent.trim();
}

/**
 * Tags excluded from text extraction.
 *
 * The names are stored uppercased. isExcludedNode() uppercases a node's name
 * before looking it up here, so lowercased SVG/MathML names are matched too.
 */
const CONTENT_EXCLUDED_TAGS = new Set([
  // TODO - We should add this and write some tests.
  "CODE",

  // The following are deprecated tags.
  "DIR",
  "APPLET",

  // The following are embedded elements, and are not supported (yet).
  "MATH",
  "EMBED",
  "OBJECT",
  "IFRAME",

  // This is an SVG tag that can contain arbitrary XML, ignore it.
  "METADATA",

  // These are elements that are treated as opaque by Firefox which causes their
  // innerHTML property to be just the raw text node behind it. Any text that is sent as
  // HTML must be valid, and there is no guarantee that the innerHTML is valid.
  "NOSCRIPT",
  "NOEMBED",
  "NOFRAMES",

  // Do not parse the HEAD tag.
  "HEAD",

  // These are not user-visible tags.
  "STYLE",
  "SCRIPT",
  "TEMPLATE",
]);

/**
 * A selector list matching any excluded tag, for use with querySelector().
 *
 * The tag names are lowercased on purpose. In an HTML document a type selector is
 * matched case-insensitively only against HTML-namespace elements; for elements in
 * other namespaces (SVG `<metadata>`, MathML `<math>`) it is compared
 * case-sensitively against the element's local name, which is lowercase. An
 * uppercased selector would therefore never match those elements.
 */
const CONTENT_EXCLUDED_NODE_SELECTOR = [...CONTENT_EXCLUDED_TAGS]
  .map(tagName => tagName.toLowerCase())
  .join(",");

/**
 * Looks up the ShadowRoot attached to a node through the chrome-only
 * `openOrClosedShadowRoot` property. This allows for extracting the content from
 * WebComponents, which is not normally feasible in non-privileged contexts.
 *
 * @param {Node} node
 *
 * @returns {ShadowRoot | null} The shadow root, or null when the node is not an
 *   element or has no shadow root.
 */
function getShadowRoot(node) {
  const host = asElement(node);
  if (host === null || host === undefined) {
    return null;
  }

  const { openOrClosedShadowRoot } = host;
  return openOrClosedShadowRoot ?? null;
}

/**
 * Determines if a node is ready for text extraction, or if it should be subdivided
 * further. It doesn't check if the node has already been processed. This is done
 * at the block level.
 *
 * The checks run in this order:
 *   1. A missing node is rejected.
 *   2. A shadow host is accepted (this happens before the exclusion check).
 *   3. An excluded node is rejected.
 *   4. A node that contains an excluded descendant and has no non-whitespace text
 *      directly inside it, or that needs subdividing, is skipped.
 *   5. A node with no text is rejected when it has no children, skipped otherwise.
 *   6. Anything else is accepted as a whole block.
 *
 * @param {Node} node
 * @returns {number} - NodeFilter acceptance status.
 */
function determineBlockStatus(node) {
  if (!node) {
    return NodeFilter.FILTER_REJECT;
  }

  if (getShadowRoot(node)) {
    return NodeFilter.FILTER_ACCEPT;
  }

  if (isExcludedNode(node)) {
    // This is an explicitly excluded node.
    return NodeFilter.FILTER_REJECT;
  }

  const hasExcludedDescendantWithoutOwnText =
    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
    !hasNonWhitespaceTextNodes(node);

  // Either way, skip this node and dig deeper into its tree to cut off smaller
  // pieces to extract. A node that needs subdividing is presumed to be a wrapper
  // of block elements.
  if (hasExcludedDescendantWithoutOwnText || nodeNeedsSubdividing(node)) {
    return NodeFilter.FILTER_SKIP;
  }

  // This textContent call is fairly expensive, so it is checked last.
  const hasText = Boolean(node.textContent?.trim().length);
  if (hasText) {
    // This node can be treated as an entire block and is ready for text extraction.
    return NodeFilter.FILTER_ACCEPT;
  }

  // Do not use subtrees that are empty of text.
  return node.hasChildNodes()
    ? NodeFilter.FILTER_SKIP
    : NodeFilter.FILTER_REJECT;
}
/**
 * Determine if a node must be broken into smaller pieces before its text can be
 * extracted as a single block.
 *
 * Only elements can need subdividing. An element needs it when any child is
 * block-like, when any non-block-like child element itself needs subdividing, or
 * when it has a child that is neither a text node nor an element.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function nodeNeedsSubdividing(node) {
  const element = asElement(node);
  if (!element) {
    // Only elements need to be further subdivided.
    return false;
  }

  for (const child of element.childNodes) {
    if (!child) {
      continue;
    }

    const { nodeType } = child;

    if (nodeType === Node.TEXT_NODE) {
      // Keep checking for more inline or text nodes.
      continue;
    }

    if (nodeType !== Node.ELEMENT_NODE) {
      // Any other kind of child forces subdivision.
      return true;
    }

    // Either the child is a block node, or it is a non-block-like node that may
    // contain other block-like nodes.
    if (getIsBlockLike(child) || nodeNeedsSubdividing(child)) {
      return true;
    }
  }

  return false;
}

/**
 * Returns true if a node is hidden based on factors such as collapsed state and
 * computed style, otherwise false.
 *
 * The checks are ordered from cheapest to most expensive. A node for which no
 * HTML element, owner global, or computed style can be found is treated as hidden.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isNodeHidden(node) {
  const element = getHTMLElementForStyle(node);

  // Without an element there is nothing to inspect. The `hidden` check is cheap
  // and will not compute style or force reflow.
  if (!element || element.hidden) {
    return true;
  }

  // Handle open/closed <details> elements. This will also not compute style or force reflow.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details
  const closedDetails = element.closest("details:not([open])");
  if (closedDetails && !element.closest("summary")) {
    // Inside a closed <details> and not part of its <summary>, which is the only
    // part that stays visible while the <details> is closed.
    return true;
  }

  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
  // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10
  const hasLayoutBox =
    element.offsetWidth ||
    element.offsetHeight ||
    element.getClientRects().length;
  if (!hasLayoutBox) {
    return true;
  }

  // The element may still have a zero-sized bounding client rectangle.
  const rect = element.getBoundingClientRect();
  if (rect && (rect.width === 0 || rect.height === 0)) {
    return true;
  }

  const win = element.ownerGlobal;
  if (!win) {
    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
    return true;
  }

  // This flushes the style, which is a performance cost.
  const style = win.getComputedStyle(element);
  if (!style) {
    // We were unable to compute the style, so we will assume it is not visible.
    return true;
  }

  // This is an issue with the DOM library generation.
  const { display, visibility, opacity } = style;

  if (display === "none" || opacity === "0") {
    return true;
  }

  return visibility === "hidden" || visibility === "collapse";
}

/**
 * Reports whether a node itself is excluded from text extraction.
 *
 * Text nodes are never excluded. Any other non-element node is always excluded.
 * Elements are excluded when their uppercased node name is listed in
 * CONTENT_EXCLUDED_TAGS.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isExcludedNode(node) {
  if (node.nodeType === Node.TEXT_NODE) {
    // Text nodes are never excluded.
    return false;
  }

  const element = asElement(node);
  if (!element) {
    // Only elements and text nodes should be considered.
    return true;
  }

  // SVG tags can be lowercased, so ensure everything is uppercased before the lookup.
  const upperCasedName = element.nodeName.toUpperCase();
  return CONTENT_EXCLUDED_TAGS.has(upperCasedName);
}

/**
 * Like `isExcludedNode` but looks at the node's descendants. Used to see whether
 * we can consider a subtree as a whole, or whether we should split it into smaller
 * branches first to try to exclude more of the content.
 *
 * Non-element nodes never contain an excluded node.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector - Selector passed to querySelector().
 *
 * @returns {boolean}
 */
function containsExcludedNode(node, excludedNodeSelector) {
  const element = asElement(node);
  if (!element) {
    return false;
  }

  const firstMatch = element.querySelector(excludedNodeSelector);
  return Boolean(firstMatch);
}

/**
 * Test whether any of the direct child text nodes of a node are non-whitespace
 * text nodes. Text nested inside child elements is not considered.
 *
 * For example:
 *   - `<p>test</p>`: yes
 *   - `<p> </p>`: no
 *   - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 *
 * @returns {boolean} Always false for non-element nodes.
 */
function hasNonWhitespaceTextNodes(node) {
  // Only element nodes are checked.
  if (node.nodeType === Node.ELEMENT_NODE) {
    for (const child of node.childNodes) {
      // Non-text children and whitespace-only text nodes both yield a falsy value.
      if (asTextNode(child)?.textContent?.trim()) {
        return true;
      }
    }
  }

  // No text nodes with content were found.
  return false;
}

/**
 * Start walking down through a node's subtree and decide which nodes to extract content
 * from. This first node is the root of the page.
 *
 * The nodes go through a process of subdivision until an appropriate sized chunk
 * of inline text can be found. Nothing happens when the context already reports
 * that extraction should stop, or when the node is rejected.
 *
 * @param {Node} node
 * @param {ExtractionContext} context
 */
function subdivideAndExtractText(node, context) {
  if (context.shouldStopExtraction()) {
    return;
  }

  const status = determineBlockStatus(node);

  if (status === NodeFilter.FILTER_REJECT) {
    // This node is rejected as it shouldn't be used for text extraction.
    return;
  }

  if (status === NodeFilter.FILTER_ACCEPT) {
    // Either a shadow host or a block element.
    const shadowRoot = getShadowRoot(node);
    if (shadowRoot) {
      processSubdivide(shadowRoot, context);
    } else {
      context.maybeAppendTextContent(node);
    }
    return;
  }

  if (status === NodeFilter.FILTER_SKIP) {
    // This node may have text to extract, but it needs to be subdivided into smaller
    // pieces. Walk the subtree to find the subtrees/nodes that contain enough
    // inline elements to extract.
    processSubdivide(node, context);
  }
}

/**
 * Add qualified nodes to have their text content extracted by recursively walking
 * through the DOM tree of nodes, including elements in the Shadow DOM.
 *
 * A TreeWalker filtered by determineBlockStatus() visits the elements and text
 * nodes below `node`. Shadow hosts are recursed into through their shadow root;
 * every other visited node is handed to the context. The walk ends as soon as the
 * context reports that extraction should stop. Nodes without an ownerDocument are
 * ignored.
 *
 * @param {Node} node
 * @param {ExtractionContext} context
 */
function processSubdivide(node, context) {
  if (context.shouldStopExtraction()) {
    return;
  }

  const doc = node.ownerDocument;
  if (!doc) {
    return;
  }

  // The walker yields each node that has been subdivided enough to have its
  // text extracted.
  const walker = doc.createTreeWalker(
    node,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    determineBlockStatus
  );

  for (
    let current = walker.nextNode();
    current;
    current = walker.nextNode()
  ) {
    const shadowRoot = getShadowRoot(current);
    if (shadowRoot) {
      processSubdivide(shadowRoot, context);
    } else {
      context.maybeAppendTextContent(current);
    }

    if (context.shouldStopExtraction()) {
      return;
    }
  }
}

/**
 * Returns an iterator of a node's ancestors, nearest first.
 *
 * The walk follows `parentNode` and ends at the first missing parent or at the
 * owner document's documentElement, which is itself not yielded. Nothing is
 * yielded for a node without an ownerDocument.
 *
 * @param {Node} node
 *
 * @yields {Node}
 */
function* getAncestorsIterator(node) {
  const { ownerDocument } = node;
  if (!ownerDocument) {
    return;
  }

  let ancestor = node.parentNode;
  while (ancestor && ancestor !== ownerDocument.documentElement) {
    yield ancestor;
    ancestor = ancestor.parentNode;
  }
}

/**
 * Reads the element's computed style and determines if the element is a block-like
 * element or not. Every element that lays out like a block should be used as a unit
 * for text extraction.
 *
 * Non-elements and elements without an ownerGlobal are never block-like. SVG
 * elements are always block-like. Otherwise, anything whose computed display is
 * neither "inline" nor "none" counts as block-like.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function getIsBlockLike(node) {
  const element = asElement(node);
  const ownerGlobal = element ? element.ownerGlobal : null;
  if (!element || !ownerGlobal) {
    return false;
  }

  // SVG elements will report as inline, but there is no block layout in SVG.
  // Treat every SVG element as being block so that every node will be subdivided.
  const isSVG = element.namespaceURI === "http://www.w3.org/2000/svg";
  if (isSVG) {
    return true;
  }

  /** @type {Record<string, string>} */
  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };

  const { display } = style;
  return !(display === "inline" || display === "none");
}

/**
 * Narrows a Node to an Element for TypeScript, based on its `nodeType`.
 *
 * @param {Node | null | undefined} node
 * @returns {Element | null} The same node when it is an element, otherwise null.
 */
function asElement(node) {
  const isElement = node?.nodeType === Node.ELEMENT_NODE;
  return isElement ? /** @type {Element} */ (node) : null;
}

/**
 * Narrows a Node to a Text node for TypeScript, based on its `nodeType`.
 *
 * @param {Node | null} node
 *
 * @returns {Text | null} The same node when it is a text node, otherwise null.
 */
function asTextNode(node) {
  const isText = node?.nodeType === Node.TEXT_NODE;
  return isText ? /** @type {Text} */ (node) : null;
}

/**
 * Narrows a Node to an HTMLElement for TypeScript, using `HTMLElement.isInstance`.
 *
 * @param {Node | null} node
 *
 * @returns {HTMLElement | null} The same node when it is an HTMLElement,
 *   otherwise null.
 */
function asHTMLElement(node) {
  return HTMLElement.isInstance(node) ? node : null;
}

/**
 * Returns the element whose style should be consulted for a node.
 *
 * An HTMLElement is used directly. For any other node the parent element is used,
 * falling back to `flattenedTreeParentNode` only when there is no parent element
 * (for cases like a text node whose parent is a ShadowRoot). If the chosen parent
 * is not an HTMLElement the result is null; no further fallback is attempted.
 *
 * @param {Node} node
 *
 * @returns {HTMLElement | null}
 */
function getHTMLElementForStyle(node) {
  const ownElement = asHTMLElement(node);
  if (ownElement) {
    return ownElement;
  }

  const styleParent = node.parentElement || node.flattenedTreeParentNode;

  // With no parent at all, the text node is not connected or doesn't have a frame.
  return styleParent ? asHTMLElement(styleParent) : null;
}
PK