import { DocumentDefinition } from "@/types";
import { htmlToPlainText } from "@/helpers/TextHelpers";

/** Result shape returned by every parsing strategy in this module: the list of definitions it extracted. */
type DefinitionCollection = {
  definitions: DocumentDefinition[];
};

/**
 * Replaces every line break inside `<p>…</p>` elements with a single space.
 *
 * CRLF, lone LF and lone CR each count as one line break. Rewritten
 * paragraphs are emitted with a bare `<p>` opening tag, so any attributes on
 * the original opening tag are dropped. Markup outside paragraphs is
 * returned untouched.
 *
 * @param html HTML source to process.
 * @returns The HTML with line breaks inside paragraphs flattened to spaces.
 */
function removeNewlinesAndCarriageReturnsInParagraphs(html: string): string {
  // `(?:\s[^>]*)?` keeps the pattern from matching other tags that merely
  // start with "p" (e.g. <pre>, <path>), which would otherwise be spliced
  // together with the next </p>.
  const paragraphPattern = /<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/g;

  return html.replace(
    paragraphPattern,
    (_match: string, content: string) =>
      `<p>${content.replace(/\r\n|\r|\n/g, " ")}</p>`
  );
}

/**
 * Unwraps `<p>…</p>` elements that appear inside `<table>…</table>` markup,
 * keeping only the paragraph content.
 *
 * Each rewritten table is emitted with a bare `<table>` opening tag, so any
 * attributes on the original opening tag are dropped. Markup outside tables
 * is returned untouched.
 *
 * @param html HTML source to process.
 * @returns The HTML with paragraph wrappers removed from table content.
 */
function removeParagraphsFromTableCells(html: string): string {
  const tablePattern = /<table[^>]*>([\s\S]*?)<\/table>/g;
  // `(?:\s[^>]*)?` keeps the pattern from matching other tags that merely
  // start with "p" (e.g. <pre>), which would otherwise be mangled.
  const paragraphPattern = /<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/g;

  return html.replace(
    tablePattern,
    (_match: string, content: string) =>
      `<table>${content.replace(paragraphPattern, "$1")}</table>`
  );
}

/**
 * Extracts term definitions from a document supplied as an HTML string.
 *
 * `parse()` prefers elements explicitly marked with a definition style class
 * (`.Definition`, `.Definitionnumbered`, `.Definitionunnumbered`). When the
 * document has none, four text heuristics are run and the one that finds the
 * most definitions wins.
 */
export default class DocumentParser {
  /** The HTML exactly as passed to the constructor. */
  documentHtml: string;
  /**
   * Trimmed text content of the `<body>` after line breaks inside paragraphs
   * were flattened and paragraphs inside tables were unwrapped.
   */
  documentText: string;
  /** `documentText` with every `\n` replaced by a space. */
  documentTextWithoutLineFeeds: string;
  /** Output of the `htmlToPlainText` helper applied to the original HTML. */
  plainText: string;

  /**
   * @param documentHtml HTML source of the document to analyse.
   */
  constructor(documentHtml: string) {
    this.documentHtml = documentHtml;

    const htmlForTextConversion = removeParagraphsFromTableCells(
      removeNewlinesAndCarriageReturnsInParagraphs(documentHtml)
    );
    const doc = new DOMParser().parseFromString(
      htmlForTextConversion,
      "text/html"
    );

    this.documentText = (doc.querySelector("body")?.textContent ?? "").trim();
    this.documentTextWithoutLineFeeds = this.documentText.replace(/\n/g, " ");
    this.plainText = htmlToPlainText(this.documentHtml);
  }

  /**
   * Extracts the document's definitions.
   *
   * @returns The definitions taken from definition-styled elements when any
   *   exist; otherwise the largest result among the colon, "means/is",
   *   quoted-term and definitions-table strategies. On a tie the strategy
   *   listed first in that order wins.
   */
  parse(): DefinitionCollection {
    if (this.hasDefinedDefinition()) {
      return this.definedDefinitions();
    }

    const candidates: DefinitionCollection[] = [
      this.colonParser(),
      this.meansOrIsParser(),
      this.quotedParser(),
      this.tableTermsParser()
    ];

    // Strict ">" keeps the earliest candidate on ties, matching a stable
    // descending sort followed by taking the first element.
    return candidates.reduce((best, candidate) =>
      candidate.definitions.length > best.definitions.length ? candidate : best
    );
  }

  /**
   * Removes items that duplicate an earlier item once whitespace and hyphens
   * are treated as the same separator and quote, comma and period characters
   * are ignored. The comparison is case-sensitive; the first occurrence is
   * kept in its original form.
   */
  private uniqueArrayIgnoringHyphens(inputArray: string[]): string[] {
    const seen = new Set<string>();

    return inputArray.filter((item) => {
      const normalizedItem = this.stripQuoteCharacters(
        item.replace(/[\s-]/g, " ")
      );
      if (seen.has(normalizedItem)) {
        return false;
      }
      seen.add(normalizedItem);
      return true;
    });
  }

  /**
   * Strategy: treats runs of text ending in a colon as candidate terms and
   * keeps those that look like titles.
   */
  private colonParser(): DefinitionCollection {
    // A run of characters containing no period, digit, bracket, colon or line
    // break, followed by a colon.
    /* eslint-disable-next-line no-useless-escape */
    const candidatePattern = /([^.\r\d\n\[\]:]+:)/gm;

    const candidates: string[] = [];
    for (const match of this.documentText.matchAll(candidatePattern)) {
      const foundText = match[1];
      // If there are more capital letters than spaces, it's probably a title.
      // Lower-case joining words ('of', 'and') would skew the count, so they
      // are collapsed away before counting.
      const textToTest = foundText
        .replace(/\sof\s/g, " ")
        .replace(/\sand\s/g, " ");
      const numberOfSpaces = textToTest.split(" ").length - 1;
      const numberOfCapitalLetters = textToTest.replace(/[^A-Z]/g, "").length;
      if (numberOfCapitalLetters > numberOfSpaces) {
        candidates.push(foundText.trim());
      }
    }

    const definitions: DocumentDefinition[] = [];
    const seenTitles = new Set<string>();
    for (const quotedTitle of this.uniqueArrayIgnoringHyphens(candidates)) {
      const plainTitle = quotedTitle.replace(/:/g, "");
      if (seenTitles.has(plainTitle)) {
        continue;
      }
      seenTitles.add(plainTitle);
      definitions.push(
        this.buildDefinition(
          plainTitle,
          quotedTitle,
          this.getDescription(quotedTitle)
        )
      );
    }

    return { definitions };
  }

  /**
   * Strategy: finds phrases of one to four words followed by "means" or "is"
   * and a clause ending in a semicolon, keeping those whose first character
   * is an upper-case ASCII letter. Works on `plainText`.
   */
  private meansOrIsParser(): DefinitionCollection {
    const clausePattern =
      /((?:[A-Za-z]+\s){0,3}[A-Za-z]+)\s(means|is)(\s[^;\n]+;)/gm;

    const definitions: DocumentDefinition[] = [];
    for (const match of this.plainText.matchAll(clausePattern)) {
      const title = match[1];
      // If the first character is upper case
      if (/^[A-Z]/.test(title)) {
        definitions.push(
          this.buildDefinition(title, title, match[2] + match[3])
        );
      }
    }

    return { definitions };
  }

  /**
   * Strategy: treats every quoted phrase preceded by whitespace or an opening
   * parenthesis as a defined term. Works on the text without line feeds.
   */
  private quotedParser(): DefinitionCollection {
    const quotedPattern = /[\s(](["'“][^"'“”]+["'”])/gm;

    const quotedTitles: string[] = [];
    for (const match of this.documentTextWithoutLineFeeds.matchAll(
      quotedPattern
    )) {
      quotedTitles.push(match[1]);
    }

    const definitions: DocumentDefinition[] = [];
    const seenTitles = new Set<string>();
    for (const quotedTitle of this.uniqueArrayIgnoringHyphens(quotedTitles)) {
      const plainTitle = this.stripQuoteCharacters(quotedTitle);
      if (seenTitles.has(plainTitle)) {
        continue;
      }
      seenTitles.add(plainTitle);
      definitions.push(
        this.buildDefinition(
          plainTitle,
          quotedTitle,
          this.getDescription(quotedTitle)
        )
      );
    }

    return { definitions };
  }

  /**
   * Builds definitions from elements carrying one of the definition style
   * classes. The element's text minus the quoted term (first occurrence),
   * with whitespace collapsed, becomes the description.
   */
  private definedDefinitions(): DefinitionCollection {
    const quotedPattern = /[\s(]?(["'“][^"'“”]+["'”])/gm;

    const definitions: DocumentDefinition[] = [];
    const seenTitles = new Set<string>();

    this.definitionElements().forEach((element) => {
      const text = element.textContent;
      if (!text) {
        return;
      }

      // The LAST quoted phrase in the element is used as the term; elements
      // without any quoted phrase yield an empty title.
      // NOTE(review): defined terms usually come first in a definition -
      // confirm that taking the last match is intended.
      let quotedTitle = "";
      for (const match of text.matchAll(quotedPattern)) {
        quotedTitle = match[1];
      }

      const plainTitle = this.stripQuoteCharacters(quotedTitle);
      if (seenTitles.has(plainTitle)) {
        return;
      }
      seenTitles.add(plainTitle);

      const description = text
        .replace(quotedTitle, "")
        .replace(/\s+/g, " ") // collapses line breaks and repeated spaces
        .trim();
      definitions.push(
        this.buildDefinition(plainTitle, quotedTitle, description)
      );
    });

    return { definitions };
  }

  /** Whether the HTML contains any element with a definition style class. */
  private hasDefinedDefinition(): boolean {
    return this.definitionElements().length > 0;
  }

  /**
   * Strategy: reads terms from the first table that follows an element whose
   * whitespace-normalised text equals "definitions" (ASCII case-insensitive).
   * Only rows with exactly two `td` cells are used: term, then description.
   */
  private tableTermsParser(): DefinitionCollection {
    const definitions: DocumentDefinition[] = [];
    const xpath =
      "//*[translate(normalize-space(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='definitions']/following::table";

    const htmlDoc = new DOMParser().parseFromString(
      this.documentHtml,
      "text/html"
    );

    // The XPath ends in an element name test, so a result is always an
    // Element; `singleNodeValue` is merely typed as the wider `Node`.
    const table = htmlDoc.evaluate(
      xpath,
      htmlDoc,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    ).singleNodeValue as Element | null;

    if (table) {
      for (const row of Array.from(table.getElementsByTagName("tr"))) {
        const cells = row.getElementsByTagName("td");
        if (cells.length === 2) {
          const title = this.tryTrim(cells[0].textContent);
          const description = this.tryTrim(cells[1].textContent);
          definitions.push(this.buildDefinition(title, title, description));
        }
      }
    }

    return { definitions };
  }

  /**
   * Counts case-insensitive occurrences of `title` in `documentText`, where
   * every whitespace or hyphen in the title may match either whitespace or a
   * hyphen. Returns 0 for an empty title.
   */
  private fuzzyCountForTitle(title: string): number {
    if (title === "") {
      return 0;
    }
    return [...this.documentText.matchAll(this.fuzzyRegexForTitle(title))]
      .length;
  }

  /**
   * Tallies the fuzzy matches of `title` (see `fuzzyCountForTitle`) whose
   * text differs from `title` itself. Line breaks inside a match are replaced
   * by spaces before it is used as a key.
   *
   * @returns Map from variant spelling to number of occurrences.
   */
  private fuzzyTitlesForTitle(title: string): { [key: string]: number } {
    const result: { [key: string]: number } = {};
    if (title === "") {
      return result;
    }

    for (const match of this.documentText.matchAll(
      this.fuzzyRegexForTitle(title)
    )) {
      const key = match[1].replace(/[\r\n]/g, " ").trim();
      if (key !== title) {
        // Own-property check so keys such as "constructor" do not pick up
        // values inherited from Object.prototype.
        const current = Object.prototype.hasOwnProperty.call(result, key)
          ? result[key]
          : 0;
        result[key] = current + 1;
      }
    }
    return result;
  }

  /**
   * Counts exact, case-sensitive occurrences of `title` in the text without
   * line feeds. Returns 0 for an empty title.
   */
  private exactCountForTitle(title: string): number {
    if (title === "") {
      return 0;
    }
    const regex = new RegExp(`(${this.escapeRegExp(title)})`, "gm");
    return [...this.documentTextWithoutLineFeeds.matchAll(regex)].length;
  }

  /**
   * Returns the first sentence-like span of `documentText` containing
   * `quotedTitle`: it starts after the preceding period (or carriage return,
   * or at the start of the text) and runs through the next period, or to the
   * end of the text when no period follows.
   *
   * @returns The trimmed span, or an empty string when the title is absent.
   */
  private getDescription(quotedTitle: string): string {
    const titleForRegex = this.escapeRegExp(quotedTitle);
    // The pattern is built from a string, so backslashes must be doubled;
    // a single "\." in a template literal collapses to the "." wildcard.
    const regex = new RegExp(
      `\\.?([^.\\r]*${titleForRegex}[^.]*(?:\\.|$))`
    );
    const match = regex.exec(this.documentText);
    return match ? match[1].trim() : "";
  }

  /** Escapes regular-expression metacharacters so `value` matches literally. */
  private escapeRegExp(value: string): string {
    return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
  }

  /**
   * Normalises cell text: replaces literal backslash-n sequences with a
   * space, collapses whitespace runs (including real line breaks) to a single
   * space and trims. Empty, `null` and `undefined` input yield "".
   */
  private tryTrim(value: string | null | undefined): string {
    if (value) {
      return value.replace(/\\n/g, " ").replace(/\s+/g, " ").trim();
    }
    return "";
  }

  /** Removes straight/curly quote, comma and period characters from `value`. */
  private stripQuoteCharacters(value: string): string {
    return value.replace(/['"“”,.]/g, "");
  }

  /** Builds the case-insensitive pattern matching `title` with interchangeable whitespace/hyphen separators. */
  private fuzzyRegexForTitle(title: string): RegExp {
    // The "g" flag matters: every separator in the title must become fuzzy,
    // not only the first one.
    const pattern = this.escapeRegExp(title).replace(/[\s-]/g, "[\\s-]");
    return new RegExp(`(${pattern})`, "gmi");
  }

  /** Assembles a definition record, computing the occurrence statistics for `title`. */
  private buildDefinition(
    title: string,
    quotedTitle: string,
    description: string
  ): DocumentDefinition {
    return {
      description,
      exactCount: this.exactCountForTitle(title),
      fuzzyCount: this.fuzzyCountForTitle(title),
      fuzzyTitles: this.fuzzyTitlesForTitle(title),
      quotedTitle,
      title
    };
  }

  /** Parses the original HTML and returns the elements carrying a definition style class. */
  private definitionElements(): NodeListOf<Element> {
    const htmlDoc = new DOMParser().parseFromString(
      this.documentHtml,
      "text/html"
    );
    return htmlDoc.querySelectorAll(
      ".Definitionunnumbered, .Definition, .Definitionnumbered"
    );
  }
}
