(ns babble.core
  (:import [net.sourceforge.pinyin4j PinyinHelper]
           [net.sourceforge.pinyin4j.format
               HanyuPinyinCaseType
               HanyuPinyinOutputFormat
               HanyuPinyinToneType
               HanyuPinyinVCharType])
  (:require [base.core :as base]
            [base.hash :as h]
            [clj-http.client :as client]
            [clojure.data.json :as json]
            [clojure.java.io :as io]
            [clojure.java.jdbc :as jdbc]
            [clojure.pprint :as pprint]
            [clojure.string :as string]
            [stardate :as sd])
  (:gen-class))

;; Endpoint that translate-via-google-api sends its GET requests to.
(def api-url "https://www.googleapis.com/language/translate/v2")

;; API key, looked up on a background thread via base/db-getenv-global.
;; Wrapped in a future, so callers must deref it (@api-key); the first deref
;; blocks until the lookup has finished.
(def api-key (future (base/db-getenv-global "GOOGLE_API_KEY")))

(defn nfc
  "Return s converted to Unicode Normalization Form C (canonical
  composition), using java.text.Normalizer."
  [s]
  (let [form java.text.Normalizer$Form/NFC]
    (java.text.Normalizer/normalize s form)))

(defn nfc?
  "Return true when s is already in Unicode Normalization Form C,
  false otherwise."
  [s]
  (let [form java.text.Normalizer$Form/NFC]
    (java.text.Normalizer/isNormalized s form)))

;; Color codes used by colorize, indexed by (tone - 1): positions 0-3 are for
;; syllables ending in the digits 1-4, position 4 is for every other syllable.
;; The values are handed to base/colorize as-is; they look like ANSI SGR
;; foreground codes -- TODO confirm against base/colorize.
(def color-scheme [ 91 33 32 94 39 ])

(defn comprehension
  "Estimated fraction of characters in running text that a reader recognizes
  when they know the n most common characters. Evaluates the fitted curve
  1 - exp(slope * n + intercept); the fit is based on data from
  http://www.zein.se/patrick/3000en.html."
  [n]
  (let [slope -0.0014212309563329077
        intercept -0.659214739836596
        exponent (+ (* slope n) intercept)]
    ;; -(e^x - 1) is 1 - e^x; expm1 keeps precision when x is close to zero.
    (- (Math/expm1 exponent))))

(defn colorize
  "Color each space-separated syllable of the tone-numbered pinyin string s.

  A syllable ending in a digit 1-4 gets the color-scheme entry for that tone;
  any other syllable gets the fifth entry. Each syllable is passed to
  base/colorize together with its color code and the results are re-joined
  with single spaces."
  [s]
  (letfn [(tone-index [syllable]
            ;; Zero-based position in color-scheme; 4 when there is no
            ;; trailing 1-4 digit.
            (if-let [digit (re-find #"[1-4]$" syllable)]
              (dec (Integer/parseInt digit))
              4))
          (paint [syllable]
            (base/colorize syllable (get color-scheme (tone-index syllable))))]
    (->> (string/split s #" ")
         (map paint)
         (string/join " "))))

(defn pinyin
  "Render the Chinese string s as pinyin using pinyin4j, with syllables
  separated by single spaces.

  opts are keywords applied left to right on top of the defaults (Unicode
  u-umlaut, tone marks); a later option overrides an earlier one:
    :unicode, :colon, :v    how u-umlaut is written (Unicode, u:, v)
    :tone-mark              tone marks; also forces the Unicode u-umlaut
    :tone-number            trailing tone digits
    :toneless               no tone information
    :color                  switches to tone digits and passes the final
                            result through colorize
    :uppercase, :lowercase  letter case

  An unrecognized option throws IllegalArgumentException, because the case
  form has no default clause."
  [s & opts]
  (let [o (HanyuPinyinOutputFormat.)
        ;; Whether the caller asked for colored output. Held in a local: the
        ;; previous (def colorized ...) inside the body created a
        ;; namespace-global var that every call overwrote, so concurrent
        ;; calls could observe each other's setting.
        colorized (boolean (some #{:color} opts))]
    ; Set default options
    (.setVCharType o HanyuPinyinVCharType/WITH_U_UNICODE)
    (.setToneType o HanyuPinyinToneType/WITH_TONE_MARK)
    ; Modify options according to caller's preference
    (doseq [x opts]
      (case x
        :unicode (.setVCharType o HanyuPinyinVCharType/WITH_U_UNICODE)
        :colon (.setVCharType o HanyuPinyinVCharType/WITH_U_AND_COLON)
        :v (.setVCharType o HanyuPinyinVCharType/WITH_V)
        :tone-mark (do ; tone marks imply Unicode
                     (.setVCharType o HanyuPinyinVCharType/WITH_U_UNICODE)
                     (.setToneType o HanyuPinyinToneType/WITH_TONE_MARK))
        :tone-number (.setToneType o HanyuPinyinToneType/WITH_TONE_NUMBER)
        :toneless (.setToneType o HanyuPinyinToneType/WITHOUT_TONE)
        ; color implies tone-number; the flag itself is computed above
        :color (.setToneType o HanyuPinyinToneType/WITH_TONE_NUMBER)
        :uppercase (.setCaseType o HanyuPinyinCaseType/UPPERCASE)
        :lowercase (.setCaseType o HanyuPinyinCaseType/LOWERCASE)))
    (let [out (PinyinHelper/toHanyuPinyinString s o " ")]
      (if colorized (colorize out) out))))

(defn radical
  "Look up the r column of babble.radical_variants for the row whose n and v
  columns equal the arguments; v defaults to 1. Returns the value from the
  first matching row, or nil when no row matches.
  (Presumably n is the radical number and v the variant index -- confirm
  against the table definition.)"
  ([n] (radical n 1))
  ([n v]
    (let [matches (jdbc/query base/pgdb [
          "SELECT r FROM babble.radical_variants
           WHERE n = ? and v = ?"
          n v])]
      (when (seq matches)
        (:r (first matches))))))

(defn written
  "Return s followed by its tone-marked pinyin in parentheses,
  i.e. the original text, a space, then the reading between ( and )."
  [s]
  (let [reading (pinyin s :tone-mark)]
    (str s " (" reading ")")))

(defn insert-into-db
  "Store one translation in babble.translations: source language lsrc,
  target language ltrg, source text src and translated text trg.
  Returns trg so the call can be used as the tail of a translation pipeline."
  [lsrc ltrg src trg]
  (let [statement [
    "INSERT INTO babble.translations
     (lsrc, ltrg, src, trg)
     VALUES (?, ?, ?, ?);"
    lsrc ltrg src trg]]
    (jdbc/execute! base/pgdb statement)
    trg))

(defn query-translation-db
  "Look up a stored translation in the database.

  Matches rows of babble.translations with the given source language lsrc
  and target language ltrg whose src equals the argument, compared
  case-insensitively via lower(). Returns the trg of the first matching row,
  or nil when there is none."
  [lsrc ltrg src]
  (let [hits (jdbc/query base/pgdb [
                "SELECT trg FROM babble.translations
                 WHERE lsrc = ? AND ltrg = ? and
                       lower(src) = lower(?);"
                lsrc ltrg src])]
    (when (seq hits)
      (:trg (first hits)))))

(defn translate-via-google-api
  "Translate s from language lsrc to language ltrg through the HTTP API at
  api-url, and cache the result.

  Sends a GET request with the API key, source, target, format=text and the
  text as query parameters, parses the JSON body and reads the list under
  data -> translations. A warning is logged when more than one translation
  comes back; only the first one is used.

  Returns the translated text after storing it with insert-into-db. When the
  response carries no translated text, nothing is stored, a warning is
  logged, and nil is returned."
  [lsrc ltrg s]
  (let [r (client/get api-url {:query-params
                              {:key @api-key
                               :source lsrc
                               :target ltrg
                               :format "text"
                               :prettyprint false
                               :q s }})
        tr (-> (:body r)
               (json/read-str :key-fn keyword)
               :data
               :translations)
        translated (:translatedText (first tr))]
    (when (> (count tr) 1)
      (base/log-local "babble" "warn" "More than 1 translation"
                {:translations tr :source lsrc :target ltrg :q s}))
    (if (nil? translated)
      ;; Do not cache a missing translation: a nil row can never satisfy the
      ;; lookup in translate, so every retry would insert another junk row.
      (do
        (base/log-local "babble" "warn" "No translation returned"
                  {:translations tr :source lsrc :target ltrg :q s})
        nil)
      (insert-into-db lsrc ltrg s translated))))

(defn translate
  "Translate s from language lsrc to language ltrg.
  The database cache is consulted first; only on a miss is the Google API
  called (which also stores the new result)."
  [lsrc ltrg s]
  (if-let [cached (query-translation-db lsrc ltrg s)]
    cached
    (translate-via-google-api lsrc ltrg s)))

(defn simplify
  "Convert x from Traditional Chinese (zh-TW) to Simplified Chinese (zh-CN)
  by running it through translate."
  [x]
  (->> x
       (translate "zh-TW" "zh-CN")))

(defn simplified?
  "True when x is already Simplified Chinese, i.e. when simplify
  leaves it unchanged."
  [x]
  (let [converted (simplify x)]
    (= x converted)))

(defn traditionalize
  "Convert x from Simplified Chinese (zh-CN) to Traditional Chinese (zh-TW)
  by running it through translate."
  [x]
  (->> x
       (translate "zh-CN" "zh-TW")))

(defn traditional?
  "True when x is already Traditional Chinese, i.e. when traditionalize
  leaves it unchanged."
  [x]
  (let [converted (traditionalize x)]
    (= x converted)))

(defn tr
  "Translate x from Simplified Chinese (zh-CN) to English (en)
  by running it through translate."
  [x]
  (->> x
       (translate "zh-CN" "en")))

;; Memoized digest of a string: s is NFC-normalized, encoded as UTF-8 and
;; the bytes are handed to h/multi. The accessors crc32, md5, sha1 and
;; sha2-256 read individual keys out of the returned value. memoize keeps one
;; cached result per distinct argument.
(def multidigest
  (memoize
    (fn [s]
      (-> (nfc s)
          (.getBytes "UTF-8")
          h/multi))))

(defn crc32
  "CRC32 (in hex string form) of the NFC-normalized UTF-8 encoding of s,
  read from the :crc32 entry of (multidigest s)."
  [s]
  (-> s multidigest :crc32))

(defn md5
  "MD5 hash of the NFC-normalized UTF-8 encoding of s,
  read from the :md5 entry of (multidigest s)."
  [s]
  (-> s multidigest :md5))

(defn sha1
  "SHA-1 hash of the NFC-normalized UTF-8 encoding of s,
  read from the :sha1 entry of (multidigest s)."
  [s]
  (-> s multidigest :sha1))

(defn sha2-256
  "SHA-2 (256-bit) hash of the NFC-normalized UTF-8 encoding of s,
  read from the :sha2_256 entry of (multidigest s)."
  [s]
  (-> s multidigest :sha2_256))

(defn
  parse-cc-cedict-line
  "Parse one CC-CEDICT entry line into a map.

  The line is expected to look like
    TRADITIONAL SIMPLIFIED [pin1 yin1] /gloss one/gloss two/
  and yields the keys :traditional, :simplified, :pinyin (the text between
  the square brackets), :colorized-pinyin (that text run through colorize)
  and :English (a vector of the slash-separated glosses)."
  [s]
  (let [entry-pattern #"^(\S+)\s+(\S+)\s*\[([^\]]+)\]\s*\/(.+)/$"
        [_ trad simp reading glosses] (re-matches entry-pattern s)
        english (string/split glosses #"/")]
    {:traditional trad
     :simplified simp
     :pinyin reading
     :colorized-pinyin (colorize reading)
     :English english}))

;; The CC-CEDICT dictionary, parsed on a background thread.
;; Reads the dictionary text file from a fixed location under $HOME, drops
;; the lines that start with #, and parses every remaining line with
;; parse-cc-cedict-line. Deref (@cc-cedict) to get the fully realized
;; sequence of entry maps; the first deref blocks until loading is done.
(def
  cc-cedict
  (future
    (let [dictionary-file (io/file (System/getenv "HOME")
                                   "Documents"
                                   "Clojure"
                                   "babble"
                                   "resources"
                                   "cedict_1_0_ts_utf-8_mdbg.txt")
          entries (->> (slurp dictionary-file :encoding "UTF-8")
                       string/split-lines
                       (remove #(re-find #"^#" %))
                       (map parse-cc-cedict-line))]
      (doall entries))))

(defn display-cc-cedict
  "Reduce a parsed CC-CEDICT entry x to the fields shown by l.

  :traditional is true when the traditional form equals the simplified one,
  otherwise the traditional form itself; :pinyin is the entry's pinyin run
  through colorize; :English is passed through unchanged."
  [x]
  (let [trad (:traditional x)
        same-as-simplified? (= (:simplified x) trad)]
    {:traditional (if same-as-simplified? true trad)
     :pinyin (colorize (:pinyin x))
     :English (:English x)}))

(defn l
  "Look up the Simplified Chinese string s in CC-CEDICT and pretty-print
  the result: the string itself, its CRC32, and every dictionary entry whose
  simplified form equals s (formatted with display-cc-cedict)."
  [s]
  (let [matches? (fn [entry] (= (:simplified entry) s))]
    (clojure.pprint/pprint
      {:simplified s
       :crc32 (crc32 s)
       :cc-cedict (->> @cc-cedict
                       (filter matches?)
                       (mapv display-cc-cedict))})))

(defn lt
  "Return the (lazy) sequence of CC-CEDICT entries whose traditional form
  equals the string s."
  [s]
  (let [matches? (fn [entry] (= (:traditional entry) s))]
    (filter matches? @cc-cedict)))

(defn display
  "Build the display form for a (presumably) Simplified Chinese string x.

  Returns an ordered map with :source, :crc32 and :length of x, the
  :simplified form, colorized :pinyin of the simplified form, :traditional
  (true when x is unchanged by traditionalize, otherwise the traditional
  form) and the :English translation.

  As a side effect, a richer record is written with base/log-local: the same
  map with uncolored tone-number pinyin plus the md5, sha1, sha2-256,
  sha2-512 and sha3-256 digests of the simplified form."
  [x]
  (let [s (simplify x)
        m (multidigest s)
        ;; Converted once and reused: the previous code called traditionalize
        ;; twice (once through traditional?, once directly), costing a second
        ;; database/API round trip for the same input.
        t (traditionalize x)
        d (array-map :source x
               :crc32 (crc32 x)
               :length (count x)
               :simplified s
               :pinyin (pinyin s :v :color)
               :traditional (if (= x t) true t)
               :English (tr x))
        ;; Named log-entry rather than l so it no longer shadows the
        ;; function l defined in this namespace.
        log-entry (assoc d :pinyin (pinyin s :v :tone-number)
                           :md5 (:md5 m)
                           :sha1 (:sha1 m)
                           :sha2-256 (:sha2_256 m)
                           :sha2-512 (:sha2_512 m)
                           :sha3-256 (:sha3_256 m))]
    (base/log-local "babble" "info" (str "display " s) log-entry)
    d))

(defn p
  "Pretty-print the display form of the Simplified Chinese string s
  (after trimming surrounding whitespace) and return s unchanged."
  [s]
  (-> s
      string/trim
      display
      clojure.pprint/pprint)
  s)

;; Memoized lookup in babble.radical_variants by the r column.
;; s is converted with str before it is used as the query parameter.
;; Returns {:n ... :v ...} taken from the first matching row, or nil when no
;; row matches (nil results are cached by memoize as well).
(def lookup-radical
  (memoize
    (fn [s]
      (when-let [hit (first (jdbc/query base/pgdb [
              "SELECT n, v FROM babble.radical_variants
              WHERE r = ?;" (str s)]))]
        {:n (:n hit)
         :v (:v hit)}))))

;; Memoized recursive decomposition of a single character s.
;; Looks s up in babble.cjkdecomp and returns:
;;   nil                     when the table has no row for s
;;   (str s)                 when the stored components are [""] or s is a
;;                           known radical, i.e. it is not decomposed further
;;   {(str s) [parts ...]}   otherwise, with every component decomposed in turn
;; The components column is parsed as a JSON array.
(def ^:private decompose1
  (memoize
    (fn [s]
      (let [label (str s)
            hit (first (jdbc/query base/pgdb [
              "SELECT k_crc32, cmd, components FROM babble.cjkdecomp
               WHERE k = ?;" s]))]
        (when hit
          (let [parts (json/read-str (str (:components hit)))
                atomic? (or (= parts [""]) (lookup-radical s))]
            (if atomic?
              label ; no further decomposition is possible
              {label (mapv decompose1 parts)})))))))

(defn decompose
  "Recursively decompose a string into character components.

  s is trimmed, then every character is decomposed with decompose1; the
  results are returned as a vector in input order (empty for a blank string).

  The string is walked by Unicode code point, not by UTF-16 code unit, so a
  character outside the Basic Multilingual Plane (for example CJK
  Extension B) is looked up as one character instead of as two meaningless
  surrogate halves. Each character is passed to decompose1 as a
  one-character string."
  [s]
  (let [^String text (string/trim s)
        end (.length text)]
    (loop [i 0
           parts []]
      (if (< i end)
        (let [cp (.codePointAt text (int i))
              ch (String. (Character/toChars cp))]
          ;; charCount is 2 for a surrogate pair, 1 otherwise.
          (recur (+ i (Character/charCount cp))
                 (conj parts (decompose1 ch))))
        parts))))
