import { loggerWithPrefix } from "@/lib/logger";
import { CrawlableSiteList, parseCrawlableUrl } from "@/lib/sites";

const logger = loggerWithPrefix("[sanitizeLinks]");

/**
 * Substrings that disqualify a link wherever they appear in it:
 * auth pages, LinkedIn sections other than profiles/companies, a few
 * aggregator domains, legal boilerplate pages and X/Twitter utility routes.
 */
const excludedSnippets: string[] = [
  // sign-up / sign-in pages
  "/signup",
  "/signin",
  "/login",
  // LinkedIn sections
  "linkedin.com/jobs",
  "linkedin.com/uas",
  "linkedin.com/legal",
  "linkedin.com/learning",
  "linkedin.com/hubs",
  "linkedin.com/directory",
  "linkedin.com/games",
  "linkedin.com/products",
  "linkedin.com/pulse",
  // specific domains
  "chrome.google.com",
  "about.crunchbase.com",
  "latka.com",
  "cbinsights.com",
  "founderled.com",
  // legal / policy pages
  "/policies",
  "/terms",
  "/privacy",
  "/accessibility",
  // X / Twitter utility routes
  "twitter.com/intent",
  "x.com/intent",
  "x.com/search?",
  "x.com/i/flow/login",
  "x.com/i/keyboard_shortcuts",
];

/** A link is excluded when it begins with one of these. */
const excludePrefixes = ["#", "/buy"];
/** A link is excluded when it ends with one of these. */
const excludeSuffixes = ["news.crunchbase.com/", "/crunchbase"];

/**
 * Decide whether a link should be dropped.
 *
 * All comparisons are plain, case-sensitive string tests against the tables above.
 *
 * @param link - the link to test.
 * @returns `true` when the link contains an excluded snippet, starts with an excluded
 *   prefix, or ends with an excluded suffix; `false` otherwise.
 */
export function excludeLink(link: string) {
  const hasSnippet = (snippet: string) => link.includes(snippet);
  if (excludedSnippets.some(hasSnippet)) {
    return true;
  }

  const hasPrefix = (prefix: string) => link.startsWith(prefix);
  if (excludePrefixes.some(hasPrefix)) {
    return true;
  }

  const hasSuffix = (suffix: string) => link.endsWith(suffix);
  return excludeSuffixes.some(hasSuffix);
}

/**
 * Clean a URL for the purpose of deduplication. This removes tracking parameters
 * (`utm_*`, `cta_*`, `trk`, `gi` and any parameter whose name contains `refer`),
 * `#fragment`s and a trailing slash, unwraps `redirect?url=` links, and swaps in the
 * canonical form reported by `parseCrawlableUrl` when there is one.
 *
 * @param link - raw, possibly percent-encoded link.
 * @param opts - `replaceWWW` strips a leading `www.` from the host; `allowRelative` keeps
 *   scheme-less links as they are instead of prefixing `https://`.
 * @returns the cleaned link, or `null` when the link cannot be decoded, uses a scheme
 *   other than http(s)/mailto, or is rejected by `excludeLink`.
 */
export function sanitizeLink(
  link: string,
  opts: { replaceWWW?: boolean; allowRelative?: boolean } = {},
): string | null {
  let sanitizedLink: string;
  try {
    // decode any encoded characters
    sanitizedLink = decodeURIComponent(link);
  } catch (e) {
    logger.warn("Error decoding link", link, e);
    return null;
  }

  const hasScheme = sanitizedLink.includes("://");

  // Only http(s) and mailto links are supported when a scheme is present
  if (hasScheme && !sanitizedLink.startsWith("http") && !sanitizedLink.startsWith("mailto:")) {
    return null;
  }

  if (!hasScheme) {
    // Prefix rules such as "#..." or "/buy..." can only match the raw relative link:
    // once a scheme is prepended or the fragment is stripped they no longer apply.
    if (excludeLink(sanitizedLink)) {
      return null;
    }

    // NOTE(review): a `mailto:` link without "://" also gets "https://" prefixed here,
    // as it always has — confirm whether that is intended.
    if (!opts.allowRelative) {
      sanitizedLink = "https://" + sanitizedLink;
    }
  }

  // Remove UTM parameters, trk, and refer. The `refer` alternative is limited to a single
  // parameter name ([^&=]) so that it cannot swallow neighbouring parameters.
  sanitizedLink = sanitizedLink.replace(
    /[?&](cta_[^&]+|utm_[^&]+|[^&=]*refer[^&=]*|trk|gi)=[^&]+/g,
    "",
  );

  if (sanitizedLink.startsWith("https://x.com")) {
    // X-specific presentation parameters that do not identify the resource
    sanitizedLink = sanitizedLink.replace(/[?&](?:mx|lang|ref_src)=[^&]+/g, "");
  }

  // If the leading "?param" was removed above, promote the first "&" to "?"
  if (sanitizedLink.includes("&") && !sanitizedLink.includes("?")) {
    sanitizedLink = sanitizedLink.replace("&", "?");
  }

  // remove #blah links
  sanitizedLink = sanitizedLink.replace(/#.*/, "");

  // Check if it's a redirect URL
  const redirectMatch = sanitizedLink.match(/redirect\?url=(.+)/);
  if (redirectMatch) {
    const target = redirectMatch[1];
    try {
      // The target may have been encoded twice
      sanitizedLink = decodeURIComponent(target);
    } catch (e) {
      // The whole link was already decoded once above, so a failure here means the target
      // holds a literal "%": keep it as it is rather than throwing out of the sanitizer.
      sanitizedLink = target;
    }
  }

  // Check whether the cleaned link matches any exclusion rule
  if (excludeLink(sanitizedLink)) {
    return null;
  }

  if (sanitizedLink.endsWith("/") && !sanitizedLink.includes("?")) {
    sanitizedLink = sanitizedLink.slice(0, -1);
  }

  if (opts.replaceWWW) {
    // Only strip "www." where it leads the host, never from inside a host or path
    sanitizedLink = sanitizedLink.replace(/^((?:https?:)?\/\/)?www\./, "$1");
  }

  // If this is a linkedin profile or other CrawlableSite, use the cleaned version
  // otherwise we get country prefixes like uk.linkedin.com etc
  const parsedUrl = parseCrawlableUrl(sanitizedLink);
  if (parsedUrl?.cleanUrl) {
    sanitizedLink = parsedUrl.cleanUrl;
  }

  return sanitizedLink;
}

/**
 * Sanitize a URL coming from search results.
 *
 * The first `//twitter.com/` in the input is rewritten to `//x.com/`, the result is passed
 * through `sanitizeLink`, and links matching `searchDomainIgnorelist` are dropped.
 *
 * NOTE(review): the ignore list is applied as a substring test on the whole URL, so an
 * entry such as `craft.co` also matches any URL that merely contains that text — confirm
 * this is intended.
 *
 * @param rawUrl - URL as returned by the search provider.
 * @returns the sanitized URL, or `null` for empty input, links rejected by `sanitizeLink`,
 *   and links matching the ignore list.
 */
export function sanitizeLinkForSearch(rawUrl: string): string | null {
  if (!rawUrl) return null;

  const cleaned = sanitizeLink(rawUrl.replace("//twitter.com/", "//x.com/"));
  if (!cleaned) {
    return null;
  }

  const isIgnoredDomain = searchDomainIgnorelist.some((domain) => cleaned.includes(domain));
  return isIgnoredDomain ? null : cleaned;
}

/**
 * Search results whose URL contains any of these entries are always ignored.
 * Entries are matched as substrings of the sanitized URL, so an entry may include a path.
 */
export const searchDomainIgnorelist = [
  `youtube.com`,
  `craft.co`,
  `instagram.com`,
  `capterra.com`,
  `getapp.com`,
  `g2.com`,
  `beamstart.com`,
  `zoominfo.com`,
  `spokeo.com`,
  `rocketreach.co`,
  `peekyou.com`,
  `radaris.com`,
  `linkedin.com/pub/dir`,
  `theorg.com`,
  `contactout.com`,
];

/**
 * Sanitize a list of links and drop duplicates.
 *
 * Each link goes through `sanitizeLink`; rejected links are skipped. Two links that differ
 * only by the first `www.` are treated as the same link regardless of the order in which
 * they appear, and the first one encountered is kept (the same rule
 * `sanitizeLinksWithItems` applies).
 *
 * @param links - raw links.
 * @returns the sanitized, de-duplicated links in first-seen order.
 */
export function sanitizeLinks(links: string[]): string[] {
  // Keys are the sanitized links with "www." removed, so both variants collide
  const seenKeys = new Set<string>();
  const uniqueSanitizedLinks: string[] = [];

  for (const link of links) {
    const sanitizedLink = sanitizeLink(link);
    if (!sanitizedLink) {
      continue;
    }

    const dedupeKey = sanitizedLink.replace("www.", "");
    if (seenKeys.has(dedupeKey)) {
      continue;
    }

    seenKeys.add(dedupeKey);
    uniqueSanitizedLinks.push(sanitizedLink);
  }

  return uniqueSanitizedLinks;
}

/**
 * Sanitize the `url` of every item and drop items whose link was already seen.
 *
 * Items whose url is rejected by `sanitizeLink` are skipped. Links are compared with the
 * first `www.` removed, so the www and non-www variants count as the same link; the first
 * item seen wins.
 *
 * @param items - objects carrying a `url`.
 * @returns shallow copies of the kept items, in input order, with `url` replaced by its
 *   sanitized form.
 */
export function sanitizeLinksWithItems<T extends { url: string }>(items: T[]): T[] {
  const seenKeys = new Set<string>();

  return items.reduce<T[]>((kept, item) => {
    const cleanedUrl = sanitizeLink(item.url);
    if (!cleanedUrl) {
      return kept;
    }

    // Replacing is a no-op when the link has no "www.", so no separate check is needed
    const dedupeKey = cleanedUrl.replace("www.", "");
    if (seenKeys.has(dedupeKey)) {
      return kept;
    }

    seenKeys.add(dedupeKey);
    kept.push({ ...item, url: cleanedUrl });
    return kept;
  }, []);
}

/**
 * Reduce a URL to its origin (without a leading `www.`) plus path, dropping the query
 * string and fragment.
 *
 * Input without `://` is treated as an https URL.
 *
 * NOTE(review): when the URL cannot be parsed, the fallback returns the input with its
 * scheme removed, whereas the normal path keeps the scheme — confirm which form callers
 * expect.
 *
 * @param url - absolute URL, or a host/path without scheme.
 * @returns `origin + pathname` with `www.` stripped from the host, or the scheme-less
 *   input when it is not a valid URL.
 */
export function normalizeUrl(url: string): string {
  const withScheme = url.includes("://") ? url : "https://" + url;

  try {
    const parsed = new URL(withScheme);
    // `origin` begins with the scheme, so the pattern has to step over "scheme://"
    // before it can find the "www." label.
    const origin = parsed.origin.replace(/^([a-z][a-z\d+.-]*:\/\/)www\./, "$1");
    return origin + parsed.pathname;
  } catch (e) {
    return withScheme.replace(/.*?:\/\//g, "");
  }
}

/**
 * Resolve a possibly relative URL against the page it was found on.
 *
 * @param url - absolute or relative URL.
 * @param parent - URL of the containing page, used as the base for resolution.
 * @returns the resolved absolute URL. If the `URL` constructor rejects the input, falls
 *   back to joining `/...` and `./...` paths onto the parent's origin, and returns anything
 *   else unchanged.
 */
export function fixRelativeUrl(url: string, parent: URL) {
  try {
    const resolved = new URL(url, parent);
    return resolved.toString();
  } catch {
    if (url.startsWith("/")) {
      return parent.origin + url;
    }
    if (url.startsWith("./")) {
      // Drop the leading "." so the remaining "/..." attaches to the origin
      return parent.origin + url.slice(1);
    }
    return url;
  }
}
