import * as cheerio from "cheerio";
import he from "he";

import { sanitizeLink } from "@/crawler/sanitizeLinks";
import { ElementType } from "domelementtype";
/** Tags that are re-emitted as markup (rather than reduced to bare text) in the output. */
const keepTags = [
  // headings
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  // structured / preformatted content
  "table",
  "code",
  "pre",
  "blockquote",
  "time",
];

/**
 * Parses an HTML string into a Cheerio document.
 *
 * @param bodyHTML - HTML markup to parse
 * @returns the Cheerio API bound to the parsed document
 */
export function htmlToDom(bodyHTML: string): cheerio.CheerioAPI {
  return cheerio.load(bodyHTML);
}

/**
 * Converts HTML content to plain text while intelligently cleaning and structuring the output.
 * This function performs several key operations:
 * 1. Removes unwanted sections like ads, modals, navigation, and hidden content
 * 2. Preserves specific HTML elements including:
 *    - Headers (h1-h6)
 *    - Tables
 *    - Code blocks and pre tags (note: on larger pages `code` elements are pruned
 *      together with other chrome when they make up less than 30% of the body text)
 *    - Blockquotes
 *    - Time elements
 *    - Links and images (when keepUrls=true)
 * 3. Only saves specific good attributes for each element
 * 4. Cleans up spacing and formatting for readability
 *
 * When a Cheerio document is passed in, it is modified in place.
 *
 * @param htmlOrDoc - Either an HTML string or a Cheerio document to process
 * @param options - Configuration options
 * @param options.keepUrls - If true, preserves URLs and links in the output
 * @param options.skipShortBodyCheck - If true, skips the body length check for removing certain elements
 * @returns A cleaned, formatted plain text representation of the HTML content
 */
export function htmlToText(
  htmlOrDoc: cheerio.CheerioAPI | string,
  { keepUrls, skipShortBodyCheck }: { keepUrls?: boolean; skipShortBodyCheck?: boolean } = {},
): string {
  const $ = typeof htmlOrDoc === "string" ? htmlToDom(htmlOrDoc) : htmlOrDoc;

  // pre-process by removing unwanted sections.
  // This has to happen BEFORE cleanCommonInteractiveElements: that helper strips
  // `data-*` attributes such as `data-module`, which would make the
  // `[data-module=sidebar]` selector below impossible to match.
  [
    "head",
    "iframe",
    "style",
    "noscript",
    ".modal",
    ".seo",
    "[ad-id]",
    "#msg-overlay",
    ".post-block--unread", // techcrunch collapsed articles
    "[data-test-id=browse-jobs]",
    "[data-test-id=similar-pages]",
    "[data-test-id=sidebarColumn]",
    "[class*=top-stories]",
    "[data-module=sidebar]",
    ".iris-recommend",
    ".article-bottom-section",
    ".ad-unit",
  ].forEach((selector) => {
    $(selector).remove();
  });

  // Remove script tags, except those with type="application/json"
  $("script").each((_, el) => {
    const $el = $(el);
    if ($el.attr("type") !== "application/json") {
      $el.remove();
    }
  });

  // Normalize attributes, dates and table cells. Running this after the removals
  // above also keeps <script>/<style> source out of flattened table cells.
  // NOTE(review): this also strips every inline `style` attribute, so the inline
  // display/visibility/opacity/position checks in htmlElemToText will not see any
  // styles on documents processed here -- confirm that this is intended.
  cleanCommonInteractiveElements($);

  // When both the collapsed and expanded variants are present, keep only the expanded one
  $(".show-more-less-text__text--more").each((_, el) => {
    const collapsed = $(el).prev();
    if (collapsed.hasClass("show-more-less-text__text--less")) {
      collapsed.remove();
    }
  });

  $("[data-testid='UserJoinDate']").each((_, el) => {
    const element = $(el);
    element.text("\nJoined Twitter: " + element.text());
  });

  // if the page is pretty small, skip removing further elements
  const bodyLength = $("body").text().length || 1;
  if (skipShortBodyCheck || bodyLength > 1000) {
    [
      "form[action]",
      "[role=navigation]",
      "page-header",
      "page-footer",
      "nav",
      ".footer, #footer",
      ".footer-contacts",
      ".header, #header",
      "code",
      "button",
      "input",
      "textarea",
      "select",
      "ins",
      ".cn-fundraising",
      "header[role=banner]",
      "[aria-label=Trending]",
      "[role=complementary]",
      "[class*=recirc]",
      "[aria-hidden=true]", // this could be too aggressive but in general probably helps
    ].forEach((selector) => {
      const matches = $(selector);
      const contentLength = matches.text().length;
      // only remove the tag if it's less than 30% of the body
      // (this is a heuristic to avoid removing important content)
      if (contentLength && contentLength / bodyLength < 0.3) {
        matches.remove();
      }
    });
  }

  return htmlElemToText($, $("body"), { keepUrls });
}

/** Tags treated as block-level: their text is separated from neighbours by newlines. */
const blockElems = [
  "div",
  "p",
  "br",
  "tr",
  "table",
  "form",
  "ul",
  "ol",
  "li",
];

/**
 * Renders the contents of a Cheerio selection as cleaned-up plain text.
 *
 * Child nodes are walked recursively:
 * - text nodes are entity-decoded and kept unless they exactly match a known boilerplate phrase
 * - elements with the class `hidden`, `hide` or `navigation`, or whose inline style hides them
 *   (`display: none`, `visibility: hidden`, `opacity: 0`, `position: fixed`), are skipped
 * - tags in `keepTags` (plus `a`, `img` and `iframe` when `keepUrls` is set) are re-emitted as
 *   markup carrying only a whitelist of attributes
 * - any other element contributes only its text, with newlines around block-level elements
 *
 * The joined text is then run through `sanitizeUnicode` and stripped of blank lines,
 * repeated spaces and per-line leading/trailing whitespace.
 *
 * @param $ - Cheerio API of the document that `elem` belongs to
 * @param elem - selection whose contents are rendered
 * @param options.keepUrls - keep link-bearing tags and URL/size attributes in the output
 * @param options.skipDecorateLists - do not prefix list items with `- `
 * @returns the cleaned text
 */
export function htmlElemToText(
  $: cheerio.CheerioAPI,
  elem: cheerio.Cheerio<cheerio.AnyNode>,
  {
    keepUrls,
    skipDecorateLists,
  }: {
    keepUrls?: boolean;
    /** don't add `-` before <ul> elements */
    skipDecorateLists?: boolean;
  } = {},
): string {
  const linkTags = keepUrls ? ["a", "img", "iframe"] : [];
  // Attributes copied onto re-emitted tags; everything else is dropped.
  const goodAttributes =
    keepUrls ?
      ["href", "src", "title", "alt", "width", "height", "aria-label"]
    : ["title", "alt"];

  // Helper function to process a node and its children
  const processNode = (node: cheerio.Cheerio<cheerio.AnyNode>): string => {
    const lines: string[] = [];
    // true once a non-skipped, non-kept element sibling has been rendered
    let hasPrev = false;
    node.contents().each((_, el) => {
      const element = $(el);

      // Process text node
      if (el.type === ElementType.Text) {
        const text = he.decode(element.text());
        if (!uselessContent.has(text.trim().toLowerCase())) {
          lines.push(text);
        }
        return;
      }

      // Anything that is not an element (comments, directives, ...) produces no output
      const tagName = "tagName" in el ? el.tagName.toLowerCase() : undefined;
      if (!tagName) return;

      // split on any whitespace so tab/newline separated class lists are handled too
      const classNames = element.attr("class")?.split(/\s+/) ?? [];
      if (
        classNames.includes("hidden") ||
        classNames.includes("hide") ||
        classNames.includes("navigation")
      )
        return;

      // ignore invisible elements
      const displaySetting = element.css("display");
      if (
        displaySetting === "none" ||
        element.css("visibility") === "hidden" ||
        element.css("opacity") === "0"
      )
        return;

      // heuristic: positioned elements are probably not part of the main content
      // -- except for some reason twitter puts their tweets in an absolute div
      if (element.css("position") === "fixed") return;

      // Preserve the links and images if keepUrls is true
      if (keepTags.includes(tagName) || linkTags.includes(tagName)) {
        const textContent = processNode(element).trim();
        if (!textContent && tagName !== "img") return;

        let newElement = `<${tagName}`;

        // only keep elements with valid links
        let hasLink = false;
        for (const attr of goodAttributes) {
          let val = element.attr(attr);
          if (!val) continue;
          if (attr === "href" || attr === "src") {
            val = sanitizeLink(val, { allowRelative: true }) ?? undefined;
            // a rejected link is omitted instead of being written out as attr="undefined"
            if (!val) continue;
            hasLink = true;
          }
          newElement += ` ${attr}="${val}"`;
        }
        if ((tagName === "a" || tagName === "img") && !hasLink) return;
        newElement += textContent ? `>${textContent}</${tagName}>` : ` />`;
        lines.push(newElement);
        return;
      }

      // Recurse into the children
      const contents = processNode(element);

      const isInlineElem = displaySetting === "inline" || !blockElems.includes(tagName);
      const decorateAsListItem =
        tagName === "li" && !skipDecorateLists && contents.trim() !== "";

      // The separating newline has to come before the bullet, otherwise the
      // bullet is cut off from its item ("- \nitem").
      if (hasPrev && !isInlineElem) lines.push("\n");

      if (decorateAsListItem) {
        // drop leading whitespace so the bullet stays on the same line as the item text
        lines.push("- ", contents.trimStart());
      } else if (contents) {
        lines.push(contents);
      }

      // Add extra text depending on the tag
      if (!isInlineElem) {
        lines.push("\n");
      } else if (tagName === "hr") {
        lines.push("\n---\n");
      } else if (tagName === "td") {
        lines.push("\t");
      }

      hasPrev = true;
    });

    return lines.join("");
  };

  let result = processNode(elem);

  // sanitize unicode
  result = sanitizeUnicode(result);

  // remove blank lines
  result = result.replace(/\n\s*\n/g, "\n");

  // remove extra spaces
  result = result.replace(/ +/g, " ");

  // remove leading and trailing spaces for each line
  result = result.replace(/\n\s+|\s+\n/g, "\n");

  return result.trim();
}

/**
 * Commonly occurring boilerplate phrases on websites. All entries are lowercase;
 * a text node is dropped when its trimmed, lowercased content equals one of them.
 */
const uselessContent = new Set<string>([
  // players, scripts and skip links
  "video player",
  "javascript is required",
  "skip to main content",
  "continue reading",
  "join now",
  "welcome back",

  // legal
  "user agreement",
  "privacy policy",
  "terms of service",
  "cookie policy",
  "gdpr notice",

  // social buttons and app / account prompts
  "follow",
  "like",
  "get the app",
  "sign up for free",
  "continue with google",
  "continue with facebook",
  "sign in",
  "sign up",
  "sign in with google",
  "sign in with facebook",
  "sign up with google",
  "sign up with facebook",
  "all rights reserved",
  "status is offline",
  "click to upgrade to premium",

  // cookie banners
  "accept cookies",
  "accept all cookies",
  "cookie settings",
  "cookie preferences",
  "manage cookies",
  "we use cookies",

  // sharing
  "share this article",
  "share this page",
  "share this post",
  "share on",
  "share via",

  // expanders
  "read more",
  "load more",
  "view more",
  "show more",
  "see more",
  "read full article",

  // newsletter and follow prompts
  "subscribe to our newsletter",
  "sign up for our newsletter",
  "stay updated",
  "stay connected",
  "follow us",
  "connect with us",

  // recommendations and ads
  "related articles",
  "recommended for you",
  "you might also like",
  "popular posts",
  "trending now",
  "sponsored content",
  "advertisement",
  "powered by",
  "back to top",

  // comments
  "leave a comment",
  "comments",
  "post a comment",
  "write a comment",
]);

/**
 * Strips tag-like markup from a string with a regular expression and collapses whitespace.
 *
 * A `<` that is directly followed by a space is left alone, so text such as `a < b` survives.
 *
 * @param html - markup (or plain text) to strip
 * @param replaceUrls - when true, every http(s) URL in the result is replaced by `<url>`
 * @returns the stripped, whitespace-collapsed and trimmed text
 */
export function stripAllTags(html: string, replaceUrls?: boolean): string {
  const withoutTags = html.replace(/<[^ ][^>]*>?/gm, "");
  const collapsed = withoutTags.replace(/\s+/g, " ").trim();

  return replaceUrls ? collapsed.replace(/https?:\/\/\S+/g, "<url>") : collapsed;
}

/**
 * Removes or normalizes characters that tend to break downstream text handling.
 *
 * - lone UTF-16 surrogates, NUL, C1 control characters and soft hyphens are removed
 * - zero-width characters (U+200B-U+200D, U+2060, U+FEFF) are removed
 * - no-break spaces, the U+2000-U+200F and U+2028-U+202F ranges, and tabs become a plain space
 * - runs of spaces are collapsed into one
 *
 * Printable Latin-1 letters and symbols (e.g. "é", "ñ", "ß", "£") are kept.
 *
 * @param str - text to sanitize
 * @returns the sanitized text
 */
export const sanitizeUnicode = (str: string): string => {
  return (
    str
      // the `u` flag makes this match only unpaired surrogates; valid pairs (emoji etc.) stay intact
      .replace(/[\u{D800}-\u{DFFF}]/gu, "")
      // NUL, C1 controls and the invisible soft hyphen -- but not the printable
      // Latin-1 range U+00A1-U+00FF, which holds accented letters
      .replace(/[\x00\x80-\x9F\xAD]/g, "")
      // zero-width space / non-joiner / joiner, word joiner, zero-width no-break space
      .replace(/[\u200B-\u200D\u2060\uFEFF]/g, "")
      // no-break space, various spaces and control chars, line/paragraph separators
      .replace(/[\xA0\u2000-\u200F\u2028-\u202F]/g, " ")
      // Replace tabs with spaces
      .replace(/\t+/g, " ")
      // Replace multiple spaces with single space
      .replace(/ {2,}/g, " ")
  );
};

/**
 * Normalizes noisy markup in place before text extraction:
 * - removes every inline `style` attribute
 * - removes `data-*` attributes whose name contains none of "id", "value" or "content"
 * - replaces the content of elements that carry a parseable `datetime` attribute
 *   with the date formatted as YYYY-MM-DD (taken from the UTC ISO string)
 * - flattens every table cell to its trimmed text content
 *
 * Generated by Claude
 *
 * @param $ - Cheerio document to clean; it is mutated
 */
function cleanCommonInteractiveElements($: cheerio.CheerioAPI): void {
  // Remove all inline styles
  $("[style]").removeAttr("style");

  // Remove data attributes which often contain tracking/styling info
  $("*").each((_, el) => {
    const $el = $(el);
    for (const attr of Object.keys($el.attr() ?? {})) {
      const isDisposableDataAttr =
        attr.startsWith("data-") &&
        !attr.includes("id") &&
        !attr.includes("value") &&
        !attr.includes("content");
      if (isDisposableDataAttr) {
        $el.removeAttr(attr);
      }
    }
  });

  // Replace complex formatted dates with simple text.
  // <ins>/<del> may also carry `datetime`, but their content is the edited text
  // itself, so they are left untouched instead of being overwritten with a date.
  $("time, [datetime]")
    .not("ins, del")
    .each((_, el) => {
      const $el = $(el);
      const datetime = $el.attr("datetime");
      if (!datetime) return;

      const date = new Date(datetime);
      // Keep original content if parsing fails (toISOString would throw on an invalid date)
      if (Number.isNaN(date.getTime())) return;

      $el.text(date.toISOString().split("T")[0]);
    });

  // Clean up tables - tables often have lots of styling/formatting.
  // Each cell's children are replaced by the cell's trimmed text content
  // (the cell's own attributes are not touched).
  $("table")
    .find("td, th")
    .each((_, cell) => {
      const $cell = $(cell);
      const cellText = $cell.text().trim();
      $cell.empty().text(cellText);
    });
}
