import type { User } from "@prisma/client";
import type { JsonObject, JsonValue } from "@prisma/client/runtime/library";
import { CheerioAPI } from "cheerio";

import { UserAgentVariantType } from "@/crawler/headers";
import { ProxyManager } from "@/crawler/proxies";
import { DistillLogger } from "@/lib/logger";
import { truncate } from "@/lib/stringUtils";
import type { TraceContext } from "@/lib/trace";
import { Attribute, LinkedinCompanyProfile, LinkedinProfile } from "@/types/attributes";
import { CookieList, CookieMonster } from "@/types/cookies";
import { EntityType } from "@/types/enums";
import { StructuredScrapeData, StructuredScrapeType } from "@/types/scraping";

/**
 * Data captured for a single crawled URL. This is the value resolved by
 * `Crawler.crawlOne` and handed to the `onResult` callback of `Crawler.run`.
 *
 * Only `url`, `title`, `body`, `links` and `socialTags` are required; every
 * other field is optional.
 */
export interface CrawlResult {
  url: string;
  // NOTE(review): presumably the URL that was originally requested when it
  // differs from `url` (e.g. after a redirect) — confirm against producers.
  requestUrl?: string;
  title: string;
  description?: string;
  body: string;
  links: Link[];
  screenshotUrl?: string;
  favicon?: string;
  lastModified?: Date; // from last-modified headers
  publishDate?: string; // from meta og:publish_date
  socialTags: Record<string, string>;
  structuredData?: JsonObject[];
  rssFeeds?: Link[];
  // NOTE(review): presumably only populated when `CrawlerOptions.includeRawResult`
  // is set — confirm against the crawler implementations.
  raw?: string;
  linkedin?: LinkedinProfile;
  linkedinCompany?: LinkedinCompanyProfile;
}

/**
 * A `CrawlResult` that additionally carries the fields of
 * `StructuredScrapeData<T>` for the structured scrape type `T`.
 */
export interface ScrapeResult<T extends StructuredScrapeType>
  extends StructuredScrapeData<T>,
    CrawlResult {}

/**
 * Contract implemented by every crawler: some exposed state plus a batch entry
 * point (`run`) and a single-URL entry point (`crawlOne`).
 */
export interface Crawler {
  /** Options this crawler is configured with. */
  options: CrawlerOptions;
  // NOTE(review): presumably the URLs this instance has crawled so far — confirm.
  pagesVisited: string[];
  retries?: number;
  userEmail?: string;
  cookies: CookieList;

  /**
   * Crawls a list of URLs, reporting outcomes through callbacks rather than a
   * return value.
   *
   * @param urls URLs to crawl.
   * @param onResult Receives a `CrawlResult`; may be synchronous or return a promise.
   * @param onError Optional. Receives a URL and an error value of unknown type;
   *   may be synchronous or return a promise.
   * @returns A promise that resolves with no value.
   */
  run(
    urls: string[],
    onResult: (result: CrawlResult) => void | Promise<void>,
    onError?: (url: string, error: unknown) => void | Promise<void>,
  ): Promise<void>;

  /**
   * Crawls a single URL.
   *
   * @param url URL to crawl.
   * @returns A promise resolving to the `CrawlResult` for that URL.
   */
  crawlOne(url: string): Promise<CrawlResult>;
}

/**
 * Response payload a scraper works from: either a `CheerioAPI` instance or a
 * JSON value. Used as the value type of the `resultBuffer` record accepted by
 * `Scraper.createCrawlResult`.
 */
export type ScrapeResponseData = CheerioAPI | JsonValue;

/**
 * Strongly typed crawler that creates results for StructuredScrapeTypes.
 *
 * Narrows the `Crawler` contract so that `run` and `crawlOne` deal in
 * `ScrapeResult<T>` instead of the plain `CrawlResult`.
 */
export interface Scraper<T extends StructuredScrapeType> extends Crawler {
  /**
   * Same shape as `Crawler.run`, but `onResult` receives a `ScrapeResult<T>`.
   *
   * @param urls URLs to scrape.
   * @param onResult Receives a `ScrapeResult<T>`; may be synchronous or return a promise.
   * @param onError Optional. Receives a URL and an error value of unknown type;
   *   may be synchronous or return a promise.
   */
  run(
    urls: string[],
    onResult: (result: ScrapeResult<T>) => void | Promise<void>,
    onError?: (url: string, error: unknown) => void | Promise<void>,
  ): Promise<void>;

  /**
   * Scrapes a single URL.
   *
   * @param url URL to scrape.
   * @returns A promise resolving to the `ScrapeResult<T>` for that URL.
   */
  crawlOne(url: string): Promise<ScrapeResult<T>>;

  /**
   * Creates a crawl result from a record of url to ScrapeResponseData.
   *
   * Takes a single object argument with:
   * - `url`: the URL the result is being created for.
   * - `resultBuffer`: record mapping URL strings to their `ScrapeResponseData`.
   *
   * @returns A promise resolving to the assembled `ScrapeResult<T>`.
   */
  createCrawlResult({
    url,
    resultBuffer,
  }: {
    url: string;
    resultBuffer: Record<string, ScrapeResponseData>;
  }): Promise<ScrapeResult<T>>;
}

/**
 * String identifiers for the unstructured crawler variants. Declared `as const`
 * so each value keeps its literal type; spread into `CrawlerType` together with
 * `StructuredScrapeType`.
 */
export const UnstructuredCrawlerType = {
  Cheerio: "cheerio",
  CrunchbaseBatchData: "crunchbase-batch-data",
  CrunchbaseVerifier: "crunchbase-verifier",
  LambdaPlaywright: "playwright-lambda",
  LinkedinCompanyMutuals: "linkedin-company-mutuals",
  LinkedinMutuals: "linkedin-mutuals",
  LinkedinPost: "linkedin-post",
  Serp: "serp",
  Unblocker: "unblocker",
  Web: "web",
} as const;

/**
 * Every known crawler identifier: the entries of `StructuredScrapeType` followed
 * by those of `UnstructuredCrawlerType`. Because the unstructured entries are
 * spread last, they win if a key is ever defined in both objects.
 */
export const CrawlerType = {
  ...StructuredScrapeType,
  ...UnstructuredCrawlerType,
} as const;

/** Union of the values held by the `CrawlerType` object. */
export type CrawlerType = (typeof CrawlerType)[keyof typeof CrawlerType];

/**
 * Configuration for a crawler, exposed on instances as `Crawler.options`.
 * Every field is optional.
 */
export interface CrawlerOptions {
  cookies?: CookieList;
  cookieManagers?: CookieMonster[];
  /** Header name to header value. */
  headers?: Record<string, string>;
  // NOTE(review): presumably controls whether `CrawlResult.raw` is populated — confirm.
  includeRawResult?: boolean;
  preserveContext?: boolean;
  retries?: number;
  // NOTE(review): presumably controls whether `CrawlResult.screenshotUrl` is produced — confirm.
  screenshot?: boolean;
  skipCache?: boolean;
  skipFallbackCookies?: boolean;
  /** Timeout, in milliseconds per the field name. */
  timeoutMillis?: number;
  traceContext?: TraceContext;
  /** Either a full Prisma `User` record or a minimal object carrying only an email. */
  user?: { email: string } | User;
  userAgent?: string;
  userAgentVariant?: UserAgentVariantType;
  verbose?: boolean;
  proxyManager?: ProxyManager;
  logger?: DistillLogger;
}

/**
 * Type guard for `CrawlResult`.
 *
 * Only a subset of the interface is verified at runtime: `url` and `body` must
 * be strings and `links` must be an array whose entries each have a string
 * `url`. Other required fields (such as `title` and `socialTags`) are not checked.
 *
 * @param result Value of unknown shape to test.
 * @returns `true` when the checked fields are present with the expected types.
 */
export function isCrawlResult(result: unknown): result is CrawlResult {
  // null/undefined cannot be property-accessed; anything else can be probed safely.
  if (result === null || result === undefined) return false;

  const candidate = result as Partial<CrawlResult>;
  if (typeof candidate.url !== "string") return false;
  if (typeof candidate.body !== "string") return false;

  const linkList = candidate.links;
  if (!Array.isArray(linkList)) return false;

  // Valid only if no entry lacks a string `url`.
  return !linkList.some((entry) => typeof entry?.url !== "string");
}

/** A `CrawlResult` with the `body` field removed. */
export type CrawlResultSansBody = Omit<CrawlResult, "body">;

/**
 * Describes a request to crawl one URL. Only `url` is required; the optional
 * fields share their names with fields on `CrawlerOptions` and `Crawler`.
 *
 * NOTE(review): presumably the event payload handed to an out-of-process
 * crawler — confirm against callers.
 */
export type CrawlEvent = {
  url: string;
  cookies?: CookieList;
  screenshot?: boolean;
  userEmail?: string;
  includeRawResult?: boolean;
  userAgent?: string;
  traceContext?: TraceContext;
};

/** A hyperlink: a required `url` plus an optional `title`. */
export type Link = { url: string; title?: string };

/** A `Link` extended with an optional `description`. */
export type LinkWithDescription = Link & { description?: string };

/**
 * Details of an image together with the link it is associated with. The four
 * image fields are nullable; the two link fields are always present.
 *
 * NOTE(review): the image fields look like raw `<img>` attribute values (hence
 * string-typed `width`/`height`) — confirm against the extractor.
 */
export type ImageDetails = {
  src: string | null;
  alt: string | null;
  width: string | null;
  height: string | null;
  linkUrl: string;
  linkTitle: string;
};

/**
 * An entity extracted from crawled content. `name`, `type` and `url` are
 * required; the remaining fields are optional.
 */
export type ExtractedEntity = {
  name: string;
  type: EntityType;
  title?: string;
  description?: string;
  url: string;
  profileImage?: string;
  /** Optional attribute, reduced to the `attributeType` and `value` fields of `Attribute`. */
  attribute?: Pick<Attribute, "attributeType" | "value">;
};

/**
 * Shape of the organization data obtained from Crunchbase.
 *
 * NOTE(review): the `@type` keys suggest this mirrors JSON-LD structured data
 * embedded in the page — confirm the source. Every field except `funder[].logo`
 * is declared as required; verify that real payloads always include them.
 */
export type CrunchbaseData = {
  url: string;
  logo: string;
  name: string;
  "@type": string;
  image: string;
  /** Funders of the organization. */
  funder: {
    url: string;
    logo?: string;
    name: string;
    "@type": string;
    image: string;
  }[];
  sameAs: string[];
  address: {
    "@type": string;
    addressRegion: string;
    addressCountry: string;
    addressLocality: string;
  };
  /** Founders of the organization. */
  founder: {
    url: string;
    name: string;
    "@type": string;
    image: string;
  }[];
  /** Employees of the organization, each with a job title. */
  employee: {
    url: string;
    name: string;
    "@type": string;
    image: string;
    jobTitle: string;
  }[];
  legalName: string;
  description: string;
  foundingDate: string;
};

// Titles should be short; this is a safe upper bound on their length.
const MAX_TITLE_LENGTH = 200;

/**
 * Returns the title of a crawl result after passing it through `truncate` with
 * `MAX_TITLE_LENGTH`.
 *
 * @param crawlResult Object that may carry a `title`, or `undefined`.
 * @returns The truncated title, or `undefined` when there is no result or its
 *   title is missing or empty.
 */
export function getSanitizedTitle(crawlResult: { title?: string } | undefined): string | undefined {
  const rawTitle = crawlResult?.title;
  return rawTitle ? truncate(rawTitle, MAX_TITLE_LENGTH) : undefined;
}

// Descriptions should be short, and most social media platforms only show the
// first 200 characters.
const MAX_DESCRIPTION_LENGTH = 200;

/**
 * Returns the description of a crawl result after passing it through `truncate`
 * with `MAX_DESCRIPTION_LENGTH`.
 *
 * @param crawlResult Object that may carry a `description`, or `undefined`.
 * @returns The truncated description, or `undefined` when there is no result or
 *   its description is missing or empty.
 */
export function getSanitizedDescription(
  crawlResult: { description?: string } | undefined,
): string | undefined {
  const rawDescription = crawlResult?.description;
  if (!rawDescription) {
    return undefined;
  }
  return truncate(rawDescription, MAX_DESCRIPTION_LENGTH);
}
