import { remove as removeDiacritics } from "diacritics";
import soundex from "talisman/phonetics/soundex";

/**
 * French stop words, taken from
 * https://github.com/fergiemcdowall/stopword/blob/master/lib/stopwords_fr.js
 *
 * Every entry is lowercase and keeps its diacritics (e.g. "être", "où", "à").
 * The list is handed to `elasticlunr.addStopWords` by `handleFrench`.
 *
 * NOTE(review): the single-letter entries ("c", "d", "j", "l", "m", "n", "s",
 * "t") are presumably the elided articles/pronouns left over once the
 * tokenizer splits on apostrophes (l'appareil -> "l" + "appareil") — confirm
 * before removing any of them.
 */
export const stopWordsFr = [
  "être",
  "avoir",
  "faire",
  "a",
  "au",
  "aux",
  "avec",
  "ce",
  "ces",
  "dans",
  "de",
  "des",
  "du",
  "elle",
  "en",
  "et",
  "eux",
  "il",
  "je",
  "la",
  "le",
  "leur",
  "lui",
  "ma",
  "mais",
  "me",
  "même",
  "mes",
  "moi",
  "mon",
  "ne",
  "nos",
  "notre",
  "nous",
  "on",
  "ou",
  "où",
  "par",
  "pas",
  "pour",
  "qu",
  "que",
  "qui",
  "sa",
  "se",
  "ses",
  "son",
  "sur",
  "ta",
  "te",
  "tes",
  "toi",
  "ton",
  "tu",
  "un",
  "une",
  "vos",
  "votre",
  "vous",
  "c",
  "d",
  "j",
  "l",
  "à",
  "m",
  "n",
  "s",
  "t",
  "y",
  "été",
  "étée",
  "étées",
  "étés",
  "étant",
  "suis",
  "es",
  "est",
  "sommes",
  "êtes",
  "sont",
  "serai",
  "seras",
  "sera",
  "serons",
  "serez",
  "seront",
  "serais",
  "serait",
  "serions",
  "seriez",
  "seraient",
  "étais",
  "était",
  "étions",
  "étiez",
  "étaient",
  "fus",
  "fut",
  "sois",
  "soit",
  "soyons",
  "soyez",
  "soient",
  "fût",
  "ayant",
  "eu",
  "eue",
  "eues",
  "eus",
  "ai",
  "as",
  "avons",
  "avez",
  "ont",
  "aurai",
  "auras",
  "aura",
  "aurons",
  "aurez",
  "auront",
  "aurais",
  "aurait",
  "aurions",
  "auriez",
  "auraient",
  "avais",
  "avait",
  "avions",
  "aviez",
  "avaient",
  "eut",
  "aie",
  "aies",
  "ait",
  "ayons",
  "ayez",
  "aient",
  "ceci",
  "cela",
  "cet",
  "cette",
  "ici",
  "ils",
  "les",
  "leurs",
  "quel",
  "quels",
  "quelle",
  "quelles",
  "sans",
  "soi"
];

/**
 * Computes the phonetic key of a token.
 *
 * The token is passed to `soundex.refined` (from the `talisman` phonetics
 * module imported at the top of this file) and the resulting code is
 * lowercased before being returned.
 *
 * @param token - The token to encode; forwarded unchanged to `soundex.refined`.
 * @returns The lowercased refined-Soundex code.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function phonem(token: any): string {
  const refinedCode = soundex.refined(token);
  return refinedCode.toLowerCase();
}

/**
 * Extracts the language code embedded in a package identifier.
 *
 * The identifier is expected to look like `<name>_<xx>...`, where `<name>` is
 * made of ASCII letters, digits and dashes, and `<xx>` is any two characters
 * directly after the first underscore (e.g. `"AS350_FR"` yields `"FR"`).
 * The code is returned exactly as written in the identifier — its case is not
 * normalised, whereas the fallback value is the lowercase `"en"`.
 *
 * Because the pattern carries the multiline flag, the `<name>_<xx>` prefix may
 * start at the beginning of any line of `packageId`.
 *
 * @param packageId - The package identifier to inspect.
 * @returns The two-character code, or `"en"` (after logging a warning) when
 *   the identifier does not match the expected shape.
 */
export function getLanguage(packageId: string): string {
  const match = /^[a-zA-Z0-9-]+[_](..)/gm.exec(packageId);
  const language = match ? match[1] : undefined;
  if (language) {
    return language;
  }
  console.warn("Couldn't get the package language. Setting to default (en)");
  return "en";
}

/**
 * Configures an elasticlunr instance for French content.
 *
 * version2.9 — this setup must stay consistent between the importer and the
 * viewer.
 *
 * Three things are changed on the given instance:
 *  1. The French stop words (`stopWordsFr`) are added.
 *  2. The tokenizer separator also splits on `'` and `’`, so that words such
 *     as "l'appareil" or "d'aluminium" are no longer indexed under the wrong
 *     letter ("l" and "d").
 *  3. The trimmer is replaced by one that does not treat common French
 *     diacritics as non-letter characters; the original trimmer was mangling
 *     words like "étanchiété" into "tanchiét".
 *
 * Example from the original notes: a lot of English content is still present
 * in the AS350_FR (WDM).
 *
 * @param elasticlunr - The elasticlunr module/instance to configure; it is
 *   mutated in place.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function handleFrench(elasticlunr: any) {
  // Strips leading and trailing characters that are neither ASCII
  // alphanumerics, underscore, nor one of ê é è à ù ô ç.
  // NOTE(review): only those seven lowercase accented letters are preserved;
  // other accented or uppercase accented letters at a token edge (e.g. â, î,
  // É) are still trimmed. Widening the set changes indexed tokens, so it must
  // be done on the importer and the viewer together.
  function customTrimmer(token: string) {
    if (token == null) {
      throw new Error("token should not be undefined");
    }
    const withoutLeading = token.replace(/^[^a-zA-Z0-9_êéèàùôç]+/, "");
    return withoutLeading.replace(/[^a-zA-Z0-9_êéèàùôç]+$/, "");
  }

  elasticlunr.addStopWords(stopWordsFr);

  // Split on whitespace, commas and both apostrophe variants (' and ’).
  elasticlunr.tokenizer.setSeperator(/[\s,'’]+/);

  // Install the trimmer and register it under the "trimmer" label.
  elasticlunr.trimmer = customTrimmer;
  elasticlunr.Pipeline.registerFunction(customTrimmer, "trimmer");
}

/**
 * Normalises a token for indexing/searching. Must stay consistent between the
 * importer and the viewer.
 *
 * - Tokens that contain a digit, or whose length is below 5, are not encoded
 *   phonetically: they are returned unchanged when `lang` is `"en"`, and with
 *   their diacritics removed for any other language.
 * - Every other token is replaced by its phonetic key (see `phonem`).
 *
 * @param token - The token to normalise.
 * @param lang - Language code; only the exact value `"en"` is treated as
 *   English.
 * @returns The normalised token.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function phonemFilter(token: any, lang: string) {
  // Tokens with digits or that are too short are kept literal.
  const keepLiteral = /\d/.test(token) || token.length < 5;
  if (!keepLiteral) {
    return phonem(token);
  }
  return lang !== "en" ? removeDiacritics(token) : token;
}
