import { EntityType } from "@/types/enums";

/**
 * Identifiers of the sites whose profile URLs this module can recognize.
 * Declared `as const` so the literal values can be turned into a union type.
 */
export const CrawlableSiteList = ["linkedin", "nfx", "crunchbase", "vcsheet", "pitchbook"] as const;
/**
 * One of the site identifiers listed in CrawlableSiteList.
 */
export type CrawlableSite = (typeof CrawlableSiteList)[number]; // union of values from CrawlableSiteList

/**
 * Result of successfully parsing a URL with parseCrawlableUrl.
 */
export type CrawlableUrl = {
  /**
   * The original URL that was provided to the parser (exactly as passed in, not trimmed)
   */
  inputUrl: string;
  /**
   * The normalized base profile URL for this entity, like
   * https://www.crunchbase.com/organization/name (no sub-path, query or fragment)
   */
  profileUrl: string;
  /**
   * The cleaned version of the input URL: profileUrl followed by the sub-path, if any.
   * Query string and fragment are not included.
   */
  cleanUrl: string;
  /**
   * The profile name for this entity (first path segment after the site's base path)
   */
  profile: string;
  /**
   * The site whose configuration matched the URL
   */
  site: CrawlableSite;
  /**
   * The entity type whose matcher matched the URL
   */
  entityType: EntityType;
  /**
   * Path segments after the profile name, joined with "/"; undefined when there are none
   */
  subPath?: string;
};

/**
 * Hand-written, per-site URL configuration.
 */
type BaseUrlConfig = {
  /**
   * Host + base path (no scheme, no trailing slash) of profile URLs for each
   * supported entity type, e.g. "linkedin.com/in". Entity types a site does not
   * support are simply omitted.
   */
  filters: Partial<Record<EntityType, string>>;
  /**
   * When true, normalized URLs for this site are built with a "www." prefix.
   */
  addWWWToBaseUrl?: boolean;
};

/**
 * BaseUrlConfig extended with the regular expressions derived from its filters.
 */
type SiteUrlConfig = BaseUrlConfig & {
  /**
   * Matcher per entity type, built from the corresponding `filters` entry.
   * NOTE: the type says every entity type has a matcher, but at runtime only
   * entity types that have a filter get one, so lookups should be guarded.
   */
  matchers: Record<EntityType, RegExp>;
};

/**
 * Source-of-truth URL configuration for every crawlable site. Each filter is
 * the host + base path under which that site's profiles of the given entity
 * type live; CrawlableSiteConfig derives the matchers from these values.
 */
const BaseConfig: Record<CrawlableSite, BaseUrlConfig> = {
  linkedin: {
    filters: {
      [EntityType.Person]: "linkedin.com/in",
      [EntityType.Company]: "linkedin.com/company",
    },
    addWWWToBaseUrl: true,
  },
  nfx: {
    filters: {
      [EntityType.Person]: "signal.nfx.com/investors",
      [EntityType.Company]: "signal.nfx.com/firms",
    },
  },
  crunchbase: {
    filters: {
      [EntityType.Person]: "crunchbase.com/person",
      [EntityType.Company]: "crunchbase.com/organization",
    },
    addWWWToBaseUrl: true,
  },
  // Only company (fund) profiles are configured for vcsheet
  vcsheet: {
    filters: {
      [EntityType.Company]: "vcsheet.com/fund",
    },
  },
  // Only company profiles are configured for pitchbook
  pitchbook: {
    filters: {
      [EntityType.Company]: "pitchbook.com/profiles/company",
    },
  },
};

/**
 * Builds the case-insensitive matcher for a site filter such as "linkedin.com/in".
 *
 * The resulting expression accepts `https://` URLs on the filter's host (with at
 * most one extra leading sub-domain label such as "www.") and captures, as
 * group 1, everything after the base path up to the query string or fragment.
 *
 * @param base - Host + base path without scheme or trailing slash.
 * @returns The matcher, or undefined when no base is given.
 */
function regexForBaseUrl(base?: string): RegExp | undefined {
  if (!base) {
    return undefined;
  }
  // The filter is literal text: escape regex metacharacters so that, for
  // example, the "." in "linkedin.com" cannot match an arbitrary character.
  const escapedBase = base.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return new RegExp(`^https://(?:[\\w-]+\\.)?${escapedBase}/([^?#]+)`, "i");
}

/**
 * Per-site configuration with matchers attached: every BaseConfig entry is
 * copied and given a `matchers` map holding one regular expression per
 * configured filter. The outer object is frozen (shallowly).
 */
export const CrawlableSiteConfig: Record<CrawlableSite, SiteUrlConfig> = (() => {
  const bySite: Record<string, SiteUrlConfig> = {};

  for (const [site, baseConfig] of Object.entries(BaseConfig)) {
    // One matcher per entity type that has a filter configured
    const matchers: Record<string, RegExp | undefined> = {};
    for (const [entityType, filter] of Object.entries(baseConfig.filters)) {
      matchers[entityType] = regexForBaseUrl(filter);
    }

    bySite[site] = {
      ...baseConfig,
      matchers: matchers as SiteUrlConfig["matchers"],
    };
  }

  return Object.freeze(bySite) as Record<CrawlableSite, SiteUrlConfig>;
})();

/**
 * Parses a URL and returns the crawlable URL if it matches a known site.
 *
 * Sites are tried in the order they appear in CrawlableSiteConfig and, within a
 * site, entity types are tried in the order Person, Company; the first match wins.
 *
 * @param url - The URL to parse. Surrounding whitespace is ignored for matching.
 * @param options - Optional restrictions on what may match.
 * @param options.entityType - Only match this entity type.
 * @param options.siteType - Only match this site.
 * @returns The crawlable URL if it matches a known site, otherwise null.
 */
export function parseCrawlableUrl(
  url: string,
  options?: {
    entityType?: EntityType;
    siteType?: CrawlableSite;
  },
): CrawlableUrl | null {
  if (!url) {
    return null;
  }

  const { entityType: entityTypeFilter, siteType: siteTypeFilter } = options ?? {};

  // With a site filter only that site is a candidate, so no further per-site
  // filtering is needed inside the loop.
  const candidateSites: CrawlableSite[] = siteTypeFilter
    ? [siteTypeFilter]
    : (Object.keys(CrawlableSiteConfig) as CrawlableSite[]);
  const candidateEntityTypes = entityTypeFilter
    ? [entityTypeFilter]
    : [EntityType.Person, EntityType.Company];

  const trimmedUrl = url.trim();
  for (const site of candidateSites) {
    // Typed as possibly undefined: an untyped caller may pass an unknown site
    const config: SiteUrlConfig | undefined = CrawlableSiteConfig[site];
    if (!config) {
      continue;
    }
    const match = findMatchingEntityType(trimmedUrl, config, candidateEntityTypes);
    if (match) {
      // inputUrl deliberately keeps the caller's original, untrimmed string
      return { inputUrl: url, site, ...match };
    }
  }
  return null;
}

/**
 * Tries each entity type in order against a site's matchers and returns the
 * normalized URL parts for the first one that matches.
 *
 * @param cleanedUrl - The (already trimmed) URL to test.
 * @param config - The site's configuration, including its matchers.
 * @param entityTypes - Entity types to try, in priority order.
 * @returns The normalized profile URL, clean URL, profile name, optional
 *   sub-path and matched entity type, or null when nothing matches.
 */
function findMatchingEntityType(
  cleanedUrl: string,
  config: SiteUrlConfig,
  entityTypes: EntityType[],
) {
  for (const entityType of entityTypes) {
    // A site may not support every entity type, in which case it has neither
    // a matcher nor a filter for it.
    const regex = config.matchers[entityType];
    const filter = config.filters[entityType];
    if (!regex || !filter) {
      continue;
    }

    const match = cleanedUrl.match(regex);
    if (!match) {
      continue;
    }

    // Group 1 is the path after the base; drop empty segments produced by
    // leading, trailing or doubled slashes.
    const [profile, ...subPathSegments] = (match[1] ?? "").split("/").filter(Boolean);
    if (!profile) {
      // Only slashes followed the base path (e.g. ".../in//"): there is no
      // profile name, so this is not a usable profile URL.
      continue;
    }

    const subPath = subPathSegments.length > 0 ? subPathSegments.join("/") : undefined;

    const base = `https://${config.addWWWToBaseUrl ? "www." : ""}${filter}`;
    const profileUrl = `${base}/${profile}`;
    const cleanUrl = subPath ? `${profileUrl}/${subPath}` : profileUrl;

    return {
      profileUrl,
      cleanUrl,
      profile,
      subPath,
      entityType,
    };
  }
  return null;
}
