(ns je.suis.un-petit-index
  (:require [clojure.set :refer [intersection]]))

(defn- normalize-nfd
  "Returns the string `s` converted to Unicode Normalization Form D
  (canonical decomposition) using java.text.Normalizer."
  [^String s]
  (let [form java.text.Normalizer$Form/NFD]
    (java.text.Normalizer/normalize s form)))

(defn- two-gram
  "Returns a vector holding every adjacent pair `[x y]` of the sequence
  `cs`, in order of appearance. An input with fewer than two elements
  yields an empty vector."
  [cs]
  ;; Zipping the sequence against its own tail pairs each element with
  ;; its successor; the shorter (tail) sequence ends the pairing.
  (mapv vector cs (rest cs)))

(defn- mkgrams
  "Builds the gram set for `s`.

  `s` is stringified with `str`, NFD-normalized and lower-cased, then split
  into a vector of characters (level 0). Each further level is the vector
  of adjacent pairs of the previous level (via `two-gram`), so level 1 is
  character pairs, level 2 is pairs of pairs, and so on. The result is one
  set containing the elements of levels 0 through `n` (default 6).

  Lower-casing uses Locale/ROOT so the produced grams do not depend on the
  JVM's default locale (under a Turkish locale, for example, \"I\" would
  otherwise lower-case to a dotless i and yield different grams)."
  ([s] (mkgrams s 6))
  ([s n]
   (let [cs (vec (.toLowerCase ^String (normalize-nfd (str s))
                               java.util.Locale/ROOT))]
     (->> (iterate two-gram cs)
          ;; level 0 plus `n` derived levels; a non-positive `n` still
          ;; contributes level 0, matching a zero-iteration build-up.
          (take (inc (max 0 n)))
          (reduce into #{})))))

(defn normalized-compare
  "Returns the fraction of the grams of `a` that also occur in the grams
  of `b`, as an exact number between 0 and 1 (a ratio or an integer).

  The measure is asymmetric: it is normalized by the gram count of `a`
  only. `n` is the gram depth handed to `mkgrams` (default 6).

  When `a` yields no grams at all (e.g. the empty string) the result is 0
  rather than a divide-by-zero ArithmeticException."
  ([^String a ^String b] (normalized-compare a b 6))
  ([^String a ^String b n]
   (let [sa (mkgrams a n)
         sca (count sa)]
     (if (zero? sca)
       0
       (/ (count (intersection sa (mkgrams b n))) sca)))))

(defn gramdb-to-index
  "Transposes a collection of `[key grams]` entries (such as a map of
  key -> set of grams) into a map of gram -> set of the keys whose gram
  collection contains it. Entries with no grams contribute nothing."
  [db]
  (let [add-key (fnil conj #{})]
    (reduce
     (fn [index [gram k]] (update index gram add-key k))
     {}
     ;; Flatten to one [gram key] pair per occurrence, in entry order.
     (for [[k grams] db
           gram grams]
       [gram k]))))

(defn map-to-gramdb
  "Returns a map with the same keys as `m` in which every value has been
  replaced by its gram set, as produced by `mkgrams` with depth `n`
  (default 6)."
  ([m] (map-to-gramdb m 6))
  ([m n]
   (into {}
         (for [[k v] m]
           [k (mkgrams v n)]))))

(defn normalized-gramdb-query
  "Scores every entry of the gram database `g` (key -> gram set) against
  the query `q`.

  Returns a map of key -> (number of query grams present in that entry's
  gram set) / (total number of query grams), as exact numbers. Entries
  sharing no gram with the query are omitted. `n` is the gram depth handed
  to `mkgrams` (default 6).

  A query that yields no grams matches nothing, so the result is {} and
  no division is attempted."
  ([g q] (normalized-gramdb-query g q 6))
  ([g q n]
   (let [qg (mkgrams q n)
         qgc (count qg)]
     (->> g
          ;; One parallel pass per entry: the set intersection is the
          ;; expensive part, so the cheap normalization is folded into the
          ;; same task instead of paying for a second pmap.
          (pmap (fn [[k grams]]
                  (let [hits (count (intersection qg grams))]
                    (when (pos? hits)
                      [k (/ hits qgc)]))))
          (remove nil?)
          (into {})))))

(defn normalized-index-query
  "Scores the query `q` against the inverted index `i` (gram -> set of
  keys).

  Looks up the key sets of all query grams present in the index, counts
  how many query grams each key appears under, and returns a map of
  key -> count / (total number of query grams), as exact numbers. Keys
  sharing no gram with the query do not appear. `n` is the gram depth
  handed to `mkgrams` (default 6).

  A query that yields no grams selects nothing, so the result is {} and
  no division is attempted."
  ([i q] (normalized-index-query i q 6))
  ([i q n]
   (let [qg (mkgrams q n)
         qgc (count qg)
         hits-by-key (->> (select-keys i qg)
                          vals
                          (apply concat)
                          frequencies)]
     ;; The per-key work is a single division, so a sequential transducer
     ;; is used; pmap's coordination overhead would dominate here.
     (into {}
           (map (fn [[k hits]] [k (/ hits qgc)]))
           hits-by-key))))

;; Load-time self-check. This top-level form runs whenever the namespace is
;; loaded: it builds a gram database and its inverted index from the sample
;; data below and asserts that both query paths return the same scores.
(let [database
      {
       ;; Korean sample text.
       :제26조1 "모든 사람은 교육을 받을 권리를 가진다 . 교육은 최소한 초등 및 기초단계에서는 무상이어야 한다. 초등교육은 의무적이어야 한다. 기술 및 직업교육은 일반적으로 접근이 가능하여야 하며, 고등교육은 모든 사람에게 실력에 근거하여 동등하게 접근 가능하여야 한다."
       :제26조2 "교육은 인격의 완전한 발전과 인권과 기본적 자유에 대한 존중의 강화를 목표로 한다. 교육은 모든 국가 , 인종 또는 종교 집단간에 이해, 관용 및 우의를 증진하며 , 평화의 유지를 위한 국제연합의 활동을 촉진하여야 한다."
       :제26조3 "부모는 자녀에게 제공되는 교육의 종류를 선택할 우선권을 가진다 ."
       :제27조1 "모든 사람은 공동체의 문화생활에 자유롭게 참여하며 예술을 향유하고 과학의 발전과 그 혜택을 공유할 권리를 가진다 ."
       :제27조2 "모든 사람은 자신이 창작한 과학적 , 문학적 또는 예술적 산물로부터 발생하는 정신적, 물질적 이익을 보호받을 권리를 가진다 ."
       ;; English sample text.
       :foo "this, that, and the other thing thing. A rose by any other name is as sweet."
       :bar "Another one bites the dust. To be, or not to be, that is the question; whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune; or to take arms against a sea of troubles and by opposing, end them."
       :baz "To die, to sleep. No more; and by a sleep to say we end the heartache and the thousand natural shocks that flesh is heir to. 'Tis a consummation devoutly to be wished. To die. To sleep."
       :qux "To sleep. Perchance to dream; ay, there's the rub: For in that sleep of death, what dreams may come when we have shuffled off this mortal coil, must give us pause."
       ;; Non-string values (numbers, keywords, a set): mkgrams passes its
       ;; input through `str` before building grams, so these are indexed by
       ;; their string rendering.
       :quux 69420
       :quuux 42069
       :fungus :balthazar
       :virus :bazaar
       :helicopter #{:fungus "trilogy" :balance}
       ;; More English sample text.
       :wiki-a "design kernels that allow machine learning algorithms such as support vector machines to learn from string data"
       :wiki-b "find likely candidates for the correct spelling of a misspelled word"
       :wiki-c "improve compression in compression algorithms where a small area of data requires n-grams of greater length"
       :wiki-d "assess the probability of a given word sequence appearing in text of a language of interest in pattern recognition systems, speech recognition, OCR (optical character recognition), Intelligent Character Recognition (ICR), machine translation and similar applications"
       :wiki-e "improve retrieval in information retrieval systems when it is hoped to find similar \"documents\" (a term for which the conventional meaning is sometimes stretched, depending on the data set) given a single query document and a database of reference documents"
       :wiki-f "improve retrieval performance in genetic sequence analysis as in the BLAST family of programs"
       :wiki-g "identify the language a text is in or the species a small sequence of DNA was taken from"
       :wiki-h "predict letters or words at random in order to create text, as in the dissociated press algorithm."
       :wiki-i "cryptanalysis"
       }
      ;; key -> gram set, using the default gram depth.
      gramdb (map-to-gramdb database)
      ;; gram -> set of keys (the transpose of gramdb).
      index (gramdb-to-index gramdb)]
  ;; Transposing the index gives back the gram database. gramdb-to-index
  ;; only emits keys for grams that occur, so an entry with an empty gram
  ;; set would be lost; this check relies on every sample value producing
  ;; at least one gram.
  (assert (= gramdb (gramdb-to-index index)))
  ;; Querying the gram database and querying the index must give identical
  ;; results for the same query: a string, a number, and Korean text.
  (assert (= (normalized-index-query index   "#{:balance :fungus}")
             (normalized-gramdb-query gramdb "#{:balance :fungus}")))
  (assert (= (normalized-index-query index   420)
             (normalized-gramdb-query gramdb 420)))
  (assert (= (normalized-index-query index   "한다.")
             (normalized-gramdb-query gramdb "한다."))))
