(ns com.vadelabs.utils.text
  (:require
    [clj-fuzzy.metrics :as fm]
    [clj-fuzzy.phonetics :as ph]
    [com.vadelabs.utils.str :as u.str]))


(def stop-words
  {"a" true
   "about" true
   "above" true
   "across" true
   "after" true
   "again" true
   "against" true
   "all" true
   "almost" true
   "alone" true
   "along" true
   "already" true
   "also" true
   "although" true
   "always" true
   "am" true
   "among" true
   "an" true
   "and" true
   "another" true
   "any" true
   "anybody" true
   "anyone" true
   "anything" true
   "anywhere" true
   "are" true
   "area" true
   "areas" true
   "aren't" true
   "around" true
   "as" true
   "ask" true
   "asked" true
   "asking" true
   "asks" true
   "at" true
   "away" true
   "b" true
   "back" true
   "backed" true
   "backing" true
   "backs" true
   "be" true
   "became" true
   "because" true
   "become" true
   "becomes" true
   "been" true
   "before" true
   "began" true
   "behind" true
   "being" true
   "beings" true
   "below" true
   "best" true
   "better" true
   "between" true
   "big" true
   "both" true
   "but" true
   "by" true
   "c" true
   "came" true
   "can" true
   "can't" true
   "cannot" true
   "case" true
   "cases" true
   "certain" true
   "certainly" true
   "clear" true
   "clearly" true
   "come" true
   "could" true
   "couldn't" true
   "d" true
   "did" true
   "didn't" true
   "differ" true
   "different" true
   "differently" true
   "do" true
   "does" true
   "doesn't" true
   "doing" true
   "don" true
   "don't" true
   "done" true
   "down" true
   "downed" true
   "downing" true
   "downs" true
   "during" true
   "e" true
   "each" true
   "early" true
   "either" true
   "end" true
   "ended" true
   "ending" true
   "ends" true
   "enough" true
   "even" true
   "evenly" true
   "ever" true
   "every" true
   "everybody" true
   "everyone" true
   "everything" true
   "everywhere" true
   "f" true
   "face" true
   "faces" true
   "fact" true
   "facts" true
   "far" true
   "felt" true
   "few" true
   "find" true
   "finds" true
   "first" true
   "for" true
   "four" true
   "from" true
   "full" true
   "fully" true
   "further" true
   "furthered" true
   "furthering" true
   "furthers" true
   "g" true
   "gave" true
   "general" true
   "generally" true
   "get" true
   "gets" true
   "give" true
   "given" true
   "gives" true
   "go" true
   "going" true
   "good" true
   "goods" true
   "got" true
   "great" true
   "greater" true
   "greatest" true
   "group" true
   "grouped" true
   "grouping" true
   "groups" true
   "h" true
   "had" true
   "hadn't" true
   "has" true
   "hasn't" true
   "have" true
   "haven't" true
   "having" true
   "he" true
   "he'd" true
   "he'll" true
   "he's" true
   "her" true
   "here" true
   "here's" true
   "hers" true
   "herself" true
   "high" true
   "higher" true
   "highest" true
   "him" true
   "himself" true
   "his" true
   "how" true
   "how's" true
   "however" true
   "i" true
   "i'd" true
   "i'll" true
   "i'm" true
   "i've" true
   "if" true
   "important" true
   "in" true
   "interest" true
   "interested" true
   "interesting" true
   "interests" true
   "into" true
   "is" true
   "isn't" true
   "it" true
   "it's" true
   "its" true
   "itself" true
   "j" true
   "just" true
   "k" true
   "keep" true
   "keeps" true
   "kind" true
   "knew" true
   "know" true
   "known" true
   "knows" true
   "l" true
   "large" true
   "largely" true
   "last" true
   "later" true
   "latest" true
   "least" true
   "less" true
   "let" true
   "let's" true
   "lets" true
   "like" true
   "likely" true
   "long" true
   "longer" true
   "longest" true
   "m" true
   "made" true
   "make" true
   "making" true
   "man" true
   "many" true
   "may" true
   "me" true
   "member" true
   "members" true
   "men" true
   "might" true
   "more" true
   "most" true
   "mostly" true
   "mr" true
   "mrs" true
   "much" true
   "must" true
   "mustn't" true
   "my" true
   "myself" true
   "n" true
   "necessary" true
   "need" true
   "needed" true
   "needing" true
   "needs" true
   "never" true
   "new" true
   "newer" true
   "newest" true
   "next" true
   "no" true
   "nobody" true
   "non" true
   "noone" true
   "nor" true
   "not" true
   "nothing" true
   "now" true
   "nowhere" true
   "number" true
   "numbers" true
   "o" true
   "of" true
   "off" true
   "often" true
   "old" true
   "older" true
   "oldest" true
   "on" true
   "once" true
   "one" true
   "only" true
   "open" true
   "opened" true
   "opening" true
   "opens" true
   "or" true
   "order" true
   "ordered" true
   "ordering" true
   "orders" true
   "other" true
   "others" true
   "ought" true
   "our" true
   "ours" true
   "ourselves" true
   "out" true
   "over" true
   "own" true
   "p" true
   "part" true
   "parted" true
   "parting" true
   "parts" true
   "per" true
   "perhaps" true
   "place" true
   "places" true
   "point" true
   "pointed" true
   "pointing" true
   "points" true
   "possible" true
   "present" true
   "presented" true
   "presenting" true
   "presents" true
   "problem" true
   "problems" true
   "put" true
   "puts" true
   "q" true
   "quite" true
   "r" true
   "rather" true
   "really" true
   "right" true
   "room" true
   "rooms" true
   "s" true
   "said" true
   "same" true
   "saw" true
   "say" true
   "says" true
   "second" true
   "seconds" true
   "see" true
   "seem" true
   "seemed" true
   "seeming" true
   "seems" true
   "sees" true
   "several" true
   "shall" true
   "shan't" true
   "she" true
   "she'd" true
   "she'll" true
   "she's" true
   "should" true
   "shouldn't" true
   "show" true
   "showed" true
   "showing" true
   "shows" true
   "side" true
   "sides" true
   "since" true
   "small" true
   "smaller" true
   "smallest" true
   "so" true
   "some" true
   "somebody" true
   "someone" true
   "something" true
   "somewhere" true
   "state" true
   "states" true
   "still" true
   "such" true
   "sure" true
   "t" true
   "take" true
   "taken" true
   "than" true
   "that" true
   "that's" true
   "the" true
   "their" true
   "theirs" true
   "them" true
   "themselves" true
   "then" true
   "there" true
   "there's" true
   "therefore" true
   "these" true
   "they" true
   "they'd" true
   "they'll" true
   "they're" true
   "they've" true
   "thing" true
   "things" true
   "think" true
   "thinks" true
   "this" true
   "those" true
   "though" true
   "thought" true
   "thoughts" true
   "three" true
   "through" true
   "thus" true
   "to" true
   "today" true
   "together" true
   "too" true
   "took" true
   "toward" true
   "turn" true
   "turned" true
   "turning" true
   "turns" true
   "two" true
   "u" true
   "under" true
   "until" true
   "up" true
   "upon" true
   "us" true
   "use" true
   "used" true
   "uses" true
   "v" true
   "very" true
   "w" true
   "want" true
   "wanted" true
   "wanting" true
   "wants" true
   "was" true
   "wasn't" true
   "way" true
   "ways" true
   "we" true
   "we'd" true
   "we'll" true
   "we're" true
   "we've" true
   "well" true
   "wells" true
   "went" true
   "were" true
   "weren't" true
   "what" true
   "what's" true
   "when" true
   "when's" true
   "where" true
   "where's" true
   "whether" true
   "which" true
   "while" true
   "who" true
   "who's" true
   "whole" true
   "whom" true
   "whose" true
   "why" true
   "why's" true
   "will" true
   "with" true
   "within" true
   "without" true
   "won't" true
   "work" true
   "worked" true
   "working" true
   "works" true
   "would" true
   "wouldn't" true
   "x" true
   "y" true
   "year" true
   "years" true
   "yet" true
   "you" true
   "you'd" true
   "you'll" true
   "you're" true
   "you've" true
   "young" true
   "younger" true
   "youngest" true
   "your" true
   "yours" true
   "yourself" true
   "yourselves" true
   "z" true})


(defn prep-string
  [s]
  (-> s
    (u.str/replace "\n" " ")
    (u.str/replace #"[^A-Za-z0-9 ]" "")
    (u.str/lower)))


(defn valid-word?
  [s]
  (let [st (u.str/trim (u.str/replace s "\n" " "))]
    (and
      (not (stop-words (u.str/lower st)))
      (> (count st) 1))))


(declare prep-string-coll)


(defn string-vec-helper
  [s]
  (let [str-coll (u.str/split s #" ")]
    (if (> (count str-coll) 1)
      (prep-string-coll str-coll)
      s)))


(defn prep-string-coll
  "Creates a collection of prepped and valid words. Input is a collection of strings.
   Users may provide their own `valid-word-fn`."
  [str-coll & valid-word-fn]
  (flatten (remove nil? (map #(when (if (first valid-word-fn) ((first valid-word-fn) %) (valid-word? %))
                                (string-vec-helper (prep-string %)))
                          str-coll))))





(defn tokenize
  "Converts value `x` to tokens with the provided `tokenizers`. `tokenizers` is a
  seq of functions that take a single value and return a seq of tokens. The type
  of value `x` and the produced tokens are arbitrary and up to the user, but
  tokenizers must compose. Built-in tokenizers mostly only work with strings for
  `x` (some accept keywords) and all produce a sequence of strings."
  [x & [tokenizers]]
  (reduce
    (fn [tokens f] (mapcat f tokens))
    (remove nil? (if (coll? x) x [x]))
    (or tokenizers [vector])))


(def sep-re #"[/\.,_\-\?!\s\n\r\(\)\[\]:]+")


(defn tokenize-lower-case
  "Converts a string to a single lower case token"
  [s]
  [(u.str/lower (u.str/trim s))])


(defn remove-diacritics
  "Converts a string to a single token with all combining diacritis removed: é
  becomes e, å becomes a, etc."
  [s]
  [(-> (u.str/trim s)
     #?(:cljs (.normalize "NFD"))
     (u.str/replace #"[\u0300-\u036f]" "")
     u.str/lower)])


(defn duplicate-diacritics
  "Turns strings with combining diacritics into two separate tokens: one with
  diacritics removed, and the original string. Strings without combining
  diacritics are returned as a single normalized token."
  [s]
  (let [normalized (.normalize s "NFD")
        stripped (u.str/replace normalized #"[\u0300-\u036f]" "")]
    (if (not= stripped normalized)
      [stripped normalized]
      [normalized])))


(defn tokenize-words
  "Converts a string to a sequence of word tokens, removing punctuation."
  [s]
  (filter not-empty (u.str/split s sep-re)))


(defn tokenize-ngrams
  "Converts a string to ngram tokens. When only one number is passed, only that
  sized ngrams are produced, otherwise, every length ngram from `min-n` to
  `max-n` is produced.

  ```clj
  (tokenize-ngrams 1 2 \"Hello\") ;;=> (\"H\" \"e\" \"l\" \"l\" \"o\"
                                  ;;    \"He\" \"el\" \"ll\" \"lo\")
  ```"
  ([n word]
    (tokenize-ngrams n n word))
  ([min-n max-n word]
    (->> (for [n (range min-n (inc max-n))]
           (->> word
             (partition n 1)
             (map u.str/join)))
      (apply concat))))


(defn tokenize-edge-ngrams
  "Converts a string to ngram tokens from the beginning of the string.
  When only one number is passed, only that sized ngrams are produced,
  otherwise, every length ngram from `min-n` to `max-n` is produced.

  ```clj
  (tokenize-edge-ngrams 1 5 \"Hello\") ;;=> (\"H\" \"He\" \"Hel\" \"Hell\" \"Hello\")
  ```"
  ([n word]
    (tokenize-edge-ngrams n n word))
  ([min-n max-n word]
    (for [n (range min-n (inc (min max-n (count word))))]
      (u.str/join (take n word)))))


(defn tokenize-keyword
  "Converts a keyword to tokens with and without keyword punctuation. Passes
  strings through as a single token."
  [x]
  (if (keyword? x)
    (if-let [ns (namespace x)]
      (let [s (str ns "/" (name x))]
        [ns (name x) s (str x)])
      [(str x) (name x)])
    [x]))


(defn stringify-keyword
  "Converts keywords to a string token, punctuation intact. Passes strings through
  as a single token."
  [x]
  [(cond-> x
     (keyword? x) str)])



(def default-tokenizers
  [stringify-keyword
   remove-diacritics
   tokenize-words
   (partial tokenize-edge-ngrams 1 10)])


(defn index-keys-from-string
  "Creates a collection of prepped and valid words from a string.
   Users may provide their own `valid-word-fn`."
  ([s] (prep-string-coll (u.str/split s #" ")))
  ([s valid-word-fn] (prep-string-coll (u.str/split s #" ") valid-word-fn)))


#_(prep-string-coll (u.str/split "Hello world" #" "))
#_(tokenize "Hello world" default-tokenizers)


(defn index-map-from-doc
  "Builds an index map from a document. A document is a map with two keys - `:id` and `:content`.
   The `:id` is the unique identifier for the document that the users can use during search to get the actual document.
   The `:content` key is the string whose words will be indexed.
   Users may provide an opts-map with keys `:maintain-actual?` and `:valid-word-fn`.
    - When `:maintain-actual?` is `true`, the actual indexed words are saved along with the encoded form of the words.
    - The `:valid-word-fn` is a custom word validator that users may provide.
   Note that maintaining actual words will consume additional space.
   Sample input:
   ```
   (index-map-from-doc {:id 1 :content \"World War 1\"} {:maintain-actual? true})
   ```
   Sample output:
   ```
   {\"W643\" [{:id 1, :actuals #{\"world\"}, :frequency 1}]
    \"W600\" [{:id 1, :actuals #{\"war\"}, :frequency 1}]}
   ```
   The `:id` is the same as supplied by the user.
   The value of `:frequency` is the frequency of the word in the `:content` string."
  [{:keys [id content]} {:keys [valid-word-fn maintain-actual?]}]
  (let [keys-coll (tokenize content default-tokenizers)
        #_(if valid-word-fn
            (index-keys-from-string content valid-word-fn)
            (index-keys-from-string content))]
    (loop [ks keys-coll
           res {}]
      (if (first ks)
        (recur (rest ks)
          (let [soundex-code (ph/soundex (first ks))]
            (assoc res soundex-code
              [(if maintain-actual?
                 {:id id
                  :actuals (set (conj (:actuals (first (res soundex-code))) (first ks)))
                  :frequency (inc (:frequency (first (res soundex-code)) 0))}
                 {:id id
                  :frequency (inc (:frequency (first (res soundex-code)) 0))})])))
        res))))


(defn index
  "Builds the final index map from a collection of documents. A document is a map with two keys - `:id` and `:content`.
   The `:id` is the unique identifier for the document that the users can use during search to get the actual document.
   The `:content` key is the string whose words will be indexed.
   Users may provide an opts-map with keys `:maintain-actual?` and `:valid-word-fn`.
    - When `:maintain-actual?` is `true`, the actual indexed words are saved along with the encoded form of the words.
    - The value of `:valid-word-fn` is a custom word validator that users may provide.
   The value of `:valid-word-fn` is a single arity fn that takes one word (string) and returns boolean.
   Note that maintaining actual words will consume additional space.
   Sample input:
   ```
   (text-index [{:id 1 :content \"World war 1\"}
                {:id 2 :content \"Independence for the world\"}]
               {:maintain-actual? true})
   ```
   Sample output:
   ```
   {\"W643\" [{:id 1, :actuals #{\"world\"}, :frequency 1}
              {:id 2, :actuals #{\"world\"}, :frequency 1}]
    \"W600\" [{:id 1, :actuals #{\"war\"}, :frequency 1}]
    \"I531\" [{:id 2, :actuals #{\"independence\"}, :frequency 1}]}
   ```
   The `:id` is the same as supplied by the user.
   The value of `:frequency` is the frequency of the word in the `:content` string."
  ([doc-coll]
    (index doc-coll {}))
  ([doc-coll opts-map]
    (loop [docs doc-coll
           res {}]
      (if (first docs)
        (recur (rest docs) (let [i (if (seq opts-map)
                                     (index-map-from-doc (first docs) opts-map)
                                     (index-map-from-doc (first docs) {}))]
                             (merge-with into res i)))
        res))))



(defn fetch-docs-for-a-word
  "Fetches all the indexed docs for a word."
  [w index]
  (index (ph/soundex w)))


(defn highest-similarity
  "Returns the similarilty value (between 0 and 1, both inclusive) for a word
  against the best similar word from a set of words."
  [w str-set]
  (let [st (first (sort-by #(fm/jaro-winkler w %) > str-set))]
    (fm/jaro-winkler w st)))


(defn scored-doc-for-a-word
  "Returns a map with doc-id as key and a map (with `:score`) as value.
   The supplied doc is expected to have `:id` and `:frequency` of the supplied word."
  [w doc]
  (if (:actuals doc)
    {(:id doc) {:score (* (highest-similarity w (:actuals doc)) (:frequency doc))}}
    {(:id doc) {:score (:frequency doc)}}))


(defn scored-docs-for-word
  "Returns a map with doc-ids as keys and a maps (with `:score`) as values.
   The supplied docs are expected to have `:id` and `:frequency` of the supplied word."
  [w docs]
  (loop [ds docs
         res {}]
    (if (first ds)
      (recur (rest ds) (into res (scored-doc-for-a-word w (first ds))))
      res)))


(defn merge-scored-docs
  "Example inputs: `{1 {:score 10 :data {:age 20}}}` and `{1 {:score 12 :data {:age 20}} 2 {:score 5 :data {:age 30}}}`
   Return: `{1 {:score 22 :data {:age 20}} 2 {:score 5 :data {:age 30}}}`
   If there are duplicate keys (values are equal):
    - The values of `:score` gets added.
    - The values of `:data` are expected to be equal in both the inputs, and will be returned as is.
   If the keys are not duplicate, then simply append as a key val pair."
  [m1 m2]
  (let [skeys (keys m2)]
    (loop [ks skeys
           res m1]
      (if (first ks)
        (recur (rest ks) (if (res (first ks))
                           (assoc-in res [(first ks) :score] (+ (:score (res (first ks)))
                                                               (:score (m2 (first ks)))))
                           (assoc res (first ks) (m2 (first ks)))))
        res))))


(defn scored-docs-for-str-coll
  "Fetches the docs for all the valid words of the supplied collection.
   The score for each fetched doc is combined (added) for each word of the words collection,
   to compute the final socre for the doc."
  [s-coll index]
  (loop [strs s-coll
         res {}]
    (if (first strs)
      (recur (rest strs)
        (merge-scored-docs res
          (scored-docs-for-word
            (first strs)
            (fetch-docs-for-a-word (first strs) index))))
      res)))


(defn score-comparator
  "Sorting comparator based on the score, either in increasing or in decreasing order.
   Defaults to decreasing order."
  [m & increasing?]
  (fn [key1 key2]
    (let [v1 [(get-in m [key1 :score]) key1]
          v2 [(get-in m [key2 :score]) key2]]
      (if (first increasing?)
        (compare v1 v2)
        (compare v2 v1)))))


(defn sorted-scored-docs
  "Results sorted by score, either in increasing or in decreasing order.
   Defaults to decreasing order."
  [s-coll index & increasing?]
  (let [res (scored-docs-for-str-coll s-coll index)]
    (into (sorted-map-by (if (first increasing?) (score-comparator res true) (score-comparator res)))
      res)))


(defn scored-docs
  "Results from the index map based on the supplied words collection.
   Return value may or may not be sorted based on provided options.
   Defaults to non-sorted result."
  [s-coll index & opts-map]
  (let [opts (first opts-map)
        sorted? (:sorted? opts)
        increasing? (:increasing? opts)]
    (cond
      (and sorted? increasing?) (sorted-scored-docs s-coll index true)
      (and sorted? (not increasing?)) (sorted-scored-docs s-coll index)
      :else (scored-docs-for-str-coll s-coll index))))


(defn default-fetch-fn
  "The `db` is expected to be a map with doc-ids as keys and maps as values.
   Each value map may contain the data that needs to come out of the `db`."
  [db doc-ids]
  (into {} (map #(hash-map % (db %))) doc-ids))


(defn scored-docs-with-data
  "If a custom fetch fn is not provided in the opts-map, then the `db` is expected to be a map with doc-ids as keys and maps as values.
   The return value of fetch-fn is expected to be a map or a collection similar to:
   `{1 {:data {}} 2 {:data {}}}` or `[{1 {:data {}}} {2 {:data {}}}]`."
  [s-coll index db & opts-map]
  (let [docs (scored-docs s-coll index (first opts-map))
        ids (keys docs)
        custom-fetch-fn (:fetch-fn (first opts-map))
        docs-data (if custom-fetch-fn (custom-fetch-fn db ids)
                      (default-fetch-fn db ids))]
    (cond
      (map? docs-data) (merge-with into docs docs-data)
      (or (vector? docs-data)
        (list? docs-data)) (loop [data docs-data
                                  res docs]
                             (if (first data)
                               (recur (rest data) (merge-with into res (first data)))
                               res))
      :else docs)))


(defn search
  "Search the index with a string of one or more words.
   Various options can be provided:
    - `:db` - If provided, the return value will contain additional data from the db based on the doc-ids returned by the index.
    - `:fetch-fn` - A function with args signature `[db doc-ids]`. Exists only with `:db` key.
                    It is expected to return results in the form similar to `{1 {:data {}} 2 {:data {}}}` or `[{1 {:data {}}} {2 {:data {}}}]`.
                    The key `:data `and its value could be any key and value.
    - `:sorted?` - If `true`, the result should be sorted. Defaults to decreasing order of sorting.
    - `:increasing?` - Exists only with `:sorted?` key, a `true` value indicates the sorting to be in the increasing order.
    - `:valid-word-fn` - A single arity fn that takes one word (string) and returns boolean."
  [query-string index & opts-map]
  (let [opts (first opts-map)
        db (:db opts)
        str-coll (index-keys-from-string query-string (:valid-word-fn opts))]
    (if db
      (scored-docs-with-data str-coll index db opts)
      (scored-docs str-coll index opts))))
