(ns leafgrabber.free-text.compare
  (:use [cascalog.api :only (lfs-textline hfs-textline defmapop defbufferop deffilterop defmapcatop <- ?<-)]
        [clojure.contrib.string :only (as-str)]
        [clojure.contrib.str-utils :only (re-split)]
        [clojure.data.json :only (json-str read-json)]
        )
  (:require [leafgrabber.free-text
             [attribute :as att]
             [extractor :as ext]
             [utils :as ftu]
             [query :as qry]
             [author :as aut]]
            [leafgrabber.core :as lgc]
            [clojure.contrib.string :as cst]
            [cascalog.ops :as cop]))

(defn get-lg-att-val
  "Turn one leafgrabber attribute into a free-text style json string.

   lg-json - json text of a leafgrabber record
   lg-att  - name of the attribute to look up in that record

   The attribute value is split on '|' and its second piece is mapped:
   'yes' becomes 'true', 'no' becomes 'false', and anything else
   (including a missing or empty value) becomes 'no-evidence'.
   Returns a json object whose single key is <lg-att>-lg."
  [lg-json lg-att]
  (let [record  (read-json lg-json)
        raw-val ((keyword lg-att) record)
        ;; a missing/empty value is replaced by a triple whose middle is no-evidence
        triple  (if (empty? raw-val) "|no-evidence|" raw-val)
        verdict (second (re-split #"\|" triple))]
    (json-str {(str lg-att "-lg")
               (case verdict
                 "yes" "true"
                 "no"  "false"
                 "no-evidence")})))

(defn convert-lg-att-q
  "The query that converts one leafgrabber attribute to free-text format.

   sink-dir - directory the converted lines are written to
   lg-dir   - directory holding the leafgrabber text lines
   lg-att   - name of the leafgrabber attribute to convert

   Prints a short progress message, calls ftu/hadoop-delete on sink-dir,
   then runs a query that emits [uuid converted-json] for every input line."
  [sink-dir lg-dir lg-att]
  (println)
  (println "converting leafgrabber " lg-att " to free-text format")
  (println "putting results in " sink-dir)
  (println)
  ;; presumably clears any previous output so the sink can be rewritten -- confirm in ftu
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?lg-conv]
       ((hfs-textline lg-dir) ?line)
       ;; field 0 is used as the UUID and field 2 as the leafgrabber json
       ;; (assumes ftu/get-field indexes delimited fields from 0 -- TODO confirm)
       (ftu/get-field ?line 0 :> ?uuid)
       (ftu/get-field ?line 2 :> ?lg-json)
       (get-lg-att-val ?lg-json lg-att :> ?lg-conv))
  )

(defn convert-lg-att
  "Convert the leafgrabber attribute lg-att into free-text format.

   ft-att - the free-text attribute whose results directory receives the output
   lg-att - the leafgrabber attribute to convert

   Reads from qry/*lg-merge-dir* and writes to
   <qry/*results-dir*>/<ft-att>/lg.<lg-att>."
  [ft-att lg-att]
  (convert-lg-att-q (str qry/*results-dir* "/" (as-str ft-att) "/lg." lg-att)
                    qry/*lg-merge-dir*
                    lg-att))

(defbufferop make-enum-trained-json
  "Build a single json string from all the values seen for one UUID.
   The buffer receives every tuple for the UUID; each tuple is
   <raw-ext-val, att-name>, and att-name is expected to be identical
   across the tuples (only the first one is consulted).
   The raw values are combined with ext/sum-enum-ext-aggregator and the
   winning value is picked with att/mode-key; the result is one json
   object mapping att-name to that value."
  [tuples]
  (let [att-name (-> tuples first second)
        raw-vals (map first tuples)
        summed   (ext/sum-enum-ext-aggregator raw-vals)
        winner   (att/mode-key (read-json summed) '("no-evidence") 0)]
    (list (json-str (hash-map att-name winner)))))

(defn convert-enum-train-set-q
  "A query that turns a training.set file into a json.att.val-style file.
  The first field of the training set should be the UUID; the third field
  should be the extractor; the fourth field should be the raw extractor value.

  sink-dir  - directory the json lines are written to
  train-dir - directory holding the training set
  att-name  - attribute name used as the key of each generated json object

  Prints a progress message and calls ftu/hadoop-delete on sink-dir before
  running the query."
  [sink-dir train-dir att-name]
  (println)
  (println "Converting " train-dir " to " sink-dir)
  (println)
  ;; presumably removes earlier output so the sink can be rewritten -- confirm in ftu
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?json]
       ((hfs-textline train-dir) ?line-1)
       ;; index 0 -> UUID, index 3 -> raw extractor value (the fourth field)
       (ftu/get-field ?line-1 0 :> ?uuid)
       (ftu/get-field ?line-1 3 :> ?val)
       ;; buffer op: receives the <value, att-name> tuples grouped under ?uuid
       (make-enum-trained-json ?val att-name :> ?json)
      ))

(defn convert-any-train-set
  "Convert a training.set file into a json.att.val-style file.

   base-dir - the directory that contains both the input and the output
   in       - name of the training set file, relative to base-dir
   out      - name of the json file to produce, relative to base-dir
   new-att  - the name given to the generated attribute"
  [base-dir in out new-att]
  (letfn [(under-base [leaf] (str base-dir "/" leaf))]
    (convert-enum-train-set-q (under-base out)
                              (under-base in)
                              (as-str new-att))))

(defn join-json-gold
  "Produce a new json value that extends json-in with gold information
   for the given attribute.

   json-in - json text of the original value, or nil when the UUID only
             appears in the gold file
   gold-in - json text of the gold (training) value, or nil when the UUID
             has no gold entry
   att-str - name of the attribute to read from the gold json

   The result carries every key of json-in plus:
     gold      - the gold value of the attribute, or 'no-evidence'
     gold-size - 'true' when a gold value exists, otherwise 'no-evidence'
     universe  - always 'true'"
  [json-in gold-in att-str]
  (let [att-key   (keyword att-str)
        ;; the calling query binds both inputs to outer-join variables,
        ;; so either side may be nil and must fall back to an empty map
        orig-json (if json-in (read-json json-in) {})
        gold-json (if gold-in (read-json gold-in) {})
        gold-hit  (att-key gold-json)]
    (json-str (assoc orig-json
                "gold"      (or gold-hit "no-evidence")
                "gold-size" (if gold-hit "true" "no-evidence")
                "universe"  "true"))))

(defn join-json-train-set-q
  "The query that does the work for join-json-train-set.

   sink-dir - directory the merged json lines are written to
   json-dir - directory holding the ordinary json value lines
   gold-dir - directory holding the json lines made from the training set
   att      - the attribute whose gold value is merged in

   Prints a progress message, builds one sub-query per input (field 0 as
   the UUID, field 1 as the json text), calls ftu/hadoop-delete on sink-dir
   and then joins the two sub-queries on the UUID."
  [sink-dir json-dir gold-dir att]
  (println)
  (println "Adding training information from " gold-dir)
  (println "Into json file " json-dir)
  (println)
  (let [json-tap (<- [?uuid ?json-in]
                     ((hfs-textline json-dir) ?json-line)
                     (ftu/get-field ?json-line 0 :> ?uuid)
                     (ftu/get-field ?json-line 1 :> ?json-in))
        gold-tap (<- [?uuid ?gold-in]
                     ((hfs-textline gold-dir) ?gold-line)
                     (ftu/get-field ?gold-line 0 :> ?uuid)
                     (ftu/get-field ?gold-line 1 :> ?gold-in))]
    (ftu/hadoop-delete sink-dir)
    ;; NOTE(review): both sides are bound to !! variables, which in Cascalog
    ;; should make this an outer join on ?uuid, so either json may arrive
    ;; as nil -- confirm that join-json-gold tolerates a nil on both sides
    (?<- (hfs-textline sink-dir) [?uuid ?json-join]
         (json-tap ?uuid !!json-in)
         (gold-tap ?uuid !!gold-in)
         (join-json-gold !!json-in !!gold-in (as-str att) :> ?json-join)
         )
   ))

(defn join-json-train-set
  "Merge a normal json-valued file with one generated from an annotated
   training set. The output is another json-valued file whose json also
   carries the gold and gold-size attributes taken from the training set.

   All three file names are resolved under <qry/*results-dir*>/<att>.

   att      - the attribute
   json-in  - the json value file
   train-in - the json training file
   json-out - the resulting json value file"
  [att json-in train-in json-out]
  (let [base-dir (str qry/*results-dir* "/" (as-str att))
        in-base  (fn [leaf] (str base-dir "/" leaf))]
    (join-json-train-set-q (in-base json-out)
                           (in-base json-in)
                           (in-base train-in)
                           att)))

(defn project-json
  "Reduce a json object to the single attribute att-name.

   json-in  - json text, or nil (treated as an empty object)
   att-name - name of the attribute to keep

   Returns a json object with att-name as its only key; the value is
   'no-evidence' when the attribute is absent."
  [json-in att-name]
  (let [parsed (if json-in (read-json json-in) {})]
    (json-str {att-name (or ((keyword att-name) parsed) "no-evidence")})))

(defn project-json-atts-q
  "Project an attribute from a json file.

   sink-dir - directory the projected json lines are written to
   att-dir  - directory holding the json value lines to read
   att-name - name of the attribute to keep

   Prints a progress message, calls ftu/hadoop-delete on sink-dir, then
   emits [uuid projected-json] for every input line."
  [sink-dir att-dir att-name]
  (println)
  (println "Projecting " att-name " from " att-dir)
  (println "into " sink-dir)
  (println)
  ;; presumably removes earlier output so the sink can be rewritten -- confirm in ftu
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?proj-json]
       ((hfs-textline att-dir) ?line)
       ;; field 0 is used as the UUID, field 1 as the json text
       (ftu/get-field ?line 0 :> ?uuid)
       (ftu/get-field ?line 1 :> ?json)
       (project-json ?json att-name :> ?proj-json)))

(defn join-json-universe
  "Build a json object holding two attributes plus 'universe'.

   json1-in  - json text containing the first attribute, or nil
   att1-name - name of the first attribute
   json2-in  - json text containing the second attribute, or nil
   att2-name - name of the second attribute

   A nil json or a missing attribute yields 'no-evidence' for that
   attribute; 'universe' is always 'true'."
  [json1-in att1-name json2-in att2-name]
  (letfn [(att-val [json-text att]
            (let [parsed (if json-text (read-json json-text) {})]
              (or ((keyword att) parsed) "no-evidence")))]
    (json-str (hash-map att1-name (att-val json1-in att1-name)
                        att2-name (att-val json2-in att2-name)
                        "universe" "true"))))

(defn join-json-atts-q
  "The query that does the work for join-json-atts.

   sink-dir  - directory the merged json lines are written to
   att1-dir  - directory holding the json lines of the first attribute
   att1-name - name of the first attribute
   att2-dir  - directory holding the json lines of the second attribute
   att2-name - name of the second attribute

   Every UUID found in att2-dir produces one output line. When the UUID
   has no line in att1-dir the first json is nil and join-json-universe
   fills in 'no-evidence' for the first attribute."
  [sink-dir att1-dir att1-name att2-dir att2-name]
  (println)
  (println "Merging " att1-dir " " att1-name)
  (println "and " att2-dir " " att2-name)
  (println "into " sink-dir)
  (println)
  (let [att1-tap (<- [?uuid ?json1-in]
                     ((hfs-textline att1-dir) ?att1-line)
                     (ftu/get-field ?att1-line 0 :> ?uuid)
                     (ftu/get-field ?att1-line 1 :> ?json1-in))
        att2-tap (<- [?uuid ?json2-in]
                     ((hfs-textline att2-dir) ?att2-line)
                     (ftu/get-field ?att2-line 0 :> ?uuid)
                     (ftu/get-field ?att2-line 1 :> ?json2-in))]
    (ftu/hadoop-delete sink-dir)
    ;; The first attribute stays bound to the ungrounding variable
    ;; !!json1-in all the way into join-json-universe. Re-binding the
    ;; outer-joined column to a non-nullable ?variable would make Cascalog
    ;; drop every row whose first json is nil, undoing the outer join.
    (?<- (hfs-textline sink-dir) [?uuid ?json-join]
         (att1-tap ?uuid !!json1-in)
         (att2-tap ?uuid ?json2-in)
         (join-json-universe !!json1-in att1-name ?json2-in att2-name :> ?json-join))))

(defn join-json-atts
  "Build a new json value file out of attributes taken from two other files.
   All arguments are echoed to standard output, one per line, before the
   query is started.

   base-dir  - the larger directory it is all stored in
   att1-in   - the json value file containing the first attribute
   att1-name - the name of the first attribute
   att2-in   - the json value file containing the second attribute
   att2-name - the name of the second attribute
   json-out  - the json value file to produce"
  [base-dir att1-in att1-name att2-in att2-name json-out]
  (println)
  (doseq [arg [base-dir att1-in att1-name att2-in att2-name json-out]]
    (println arg))
  (println)
  (letfn [(under-base [leaf] (str base-dir "/" leaf))]
    (join-json-atts-q (under-base json-out)
                      (under-base att1-in) att1-name
                      (under-base att2-in) att2-name)))

(defn json-att-count-q
  "Count the UUIDs in a json.att.val directory by the json value.

   sink-dir     - directory the [json count] lines are written to
   json-att-dir - the json.att.val directory to read

   Prints a progress message and calls ftu/hadoop-delete on sink-dir
   before running the query."
  [sink-dir json-att-dir]
  (println)
  (println "Counting UUIDs by value vector")
  (println)
  ;; presumably removes earlier output so the sink can be rewritten -- confirm in ftu
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?json ?count]
       ((hfs-textline json-att-dir) ?line)
       ;; field 1 is the json text; the UUID in field 0 is not extracted
       (ftu/get-field ?line 1 :> ?json)
       ;; ?json is the only other output variable, so lines are counted per json value
       (cop/count ?count)
      ))

(defn count-json
  "Create a local count file by counting the UUIDs of a json.att.val file
   per json value.

   base-dir   - the larger directory the raw data is stored in
   to-count   - the json.att.val file to count, relative to base-dir
   local-file - where to write the count file

   The counts are first written to <base-dir>/<ftu/date-str>/json.att.count
   and then copied out with 'hadoop fs -cat .../p* > local-file'."
  [base-dir to-count local-file]
  (let [scratch-dir (str base-dir "/" ftu/date-str "/json.att.count")]
    (json-att-count-q scratch-dir (str base-dir "/" to-count))
    (ftu/exec-command (str "hadoop fs -cat " scratch-dir "/p* > " local-file))))

(defn count-tuples
  "Read a count file and return its contents as a sequence of tuples.

   count-file - local file whose lines are <json>TAB<count>

   Each returned tuple is a list (parsed-json count), where parsed-json
   is the map produced by read-json and count is an integer.
   Blank lines are skipped, so an empty file yields an empty sequence."
  [count-file]
  (for [line (re-split #"\n" (slurp count-file))
        ;; an empty file or a stray blank line would otherwise reach the
        ;; parsers below and blow up
        :when (re-find #"\S" line)]
    (let [[json-text count-text] (re-split #"\t" line)]
      ;; the count is parsed as a plain integer rather than handed to the
      ;; Clojure reader, which would accept arbitrary forms from the file
      (list (read-json json-text)
            (Long/parseLong (.trim ^String count-text))))))

(defn informative-count
  "Add up the counts of every tuple whose value for att is present and
   is not 'no-evidence'.

   tuples - the tuples that store the counts, each (value-map count)
   att    - the attribute, used as a function on the value map
   count  - the starting value of the sum"
  [tuples att count]
  (reduce (fn [total tuple]
            (let [value (att (first tuple))]
              (if (or (not value) (= value "no-evidence"))
                total
                (+ total (second tuple)))))
          count
          tuples))

(defn agree-count
  "Add up the counts of every tuple in which both attributes are present,
   carry the same value, and that value is not 'no-evidence'.

   tuples - the tuples that store the counts, each (value-map count)
   att1   - the first attribute, used as a function on the value map
   att2   - the second attribute, used as a function on the value map
   count  - the starting value of the sum"
  [tuples att1 att2 count]
  (reduce (fn [total tuple]
            (let [values (first tuple)
                  v1     (att1 values)
                  v2     (att2 values)]
              (if (and v1 v2 (= v1 v2) (not= v1 "no-evidence"))
                (+ total (second tuple))
                total)))
          count
          tuples))

(defn compare-atts
  "Compute one cell of the confusion matrix for a pair of attributes.
   For two different attributes this is the agree-count; for an attribute
   paired with itself it is the informative-count.

   tuples - the tuples that store the counts
   att1   - the first attribute
   att2   - the second attribute"
  [tuples att1 att2]
  (if (not= att1 att2)
    (agree-count tuples att1 att2 0)
    (informative-count tuples att1 0)))

(defn make-confusion-matrix
  "Write a tab-separated confusion matrix computed from a count file.
   The attributes are the keys of the first tuple's value map; they label
   both the header row and the first column.

  count-file - the local count file to use
  conf-file  - the local file to put the confusion matrix into"
  [count-file conf-file]
  (let [tuples   (count-tuples count-file)
        atts     (map first (ffirst tuples))
        tab-join (fn [cells] (apply str (interpose "\t" cells)))
        header   (tab-join (cons " " atts))
        rows     (map (fn [row-att]
                        (tab-join
                         (cons row-att
                               (map #(compare-atts tuples row-att %) atts))))
                      atts)]
    (spit conf-file (apply str (interpose "\n" (cons header rows))))))

(defn confusion-from-training
  "Build a confusion matrix for an attribute against its training set.

   att      - the attribute to compare
   train-in - the name of the training set
   date     - the date of the run containing the json.att.val to compare

   The training set is converted to json.train, merged with
   <date>/json.att.val into json.merge, and counted.
   A local file <att>.count is created with the raw counts.
   The local file <att>.conf contains the confusion matrix."
  [att train-in date]
  (let [att-name   (as-str att)
        base-dir   (str qry/*results-dir* "/" att-name)
        count-file (str att-name ".count")
        conf-file  (str att-name ".conf")]
    (convert-any-train-set base-dir train-in "json.train" att)
    (join-json-train-set att (str date "/json.att.val") "json.train" "json.merge")
    (count-json base-dir "json.merge" count-file)
    (make-confusion-matrix count-file conf-file)))

(defn confusion-from-atts
  "Build a confusion matrix for two attributes.

   base-att-name - the attribute or directory name the data files are in
   att1-dir      - the json file for the first attribute
   att1-name     - the name of the first attribute
   att2-dir      - the json file for the second attribute
   att2-name     - the name of the second attribute

   The two files are merged into json.merge and counted.
   A local file <base-att-name>.count is created with the raw counts.
   The local file <base-att-name>.conf contains the confusion matrix."
  [base-att-name att1-dir att1-name att2-dir att2-name]
  (let [base-name  (as-str base-att-name)
        base-dir   (str qry/*results-dir* "/" base-name)
        count-file (str base-name ".count")
        conf-file  (str base-name ".conf")]
    (join-json-atts base-dir att1-dir att1-name att2-dir att2-name "json.merge")
    (count-json base-dir "json.merge" count-file)
    (make-confusion-matrix count-file conf-file)))

(defn prec-rec-cov-helper
  "Accumulate precision/recall/coverage counts over the tuples.

   tuples    - the tuples that store the counts, each (value-map count)
   gold-att  - the gold-standard attribute, used as a function on the map
   other-att - the attribute under test, used as a function on the map
   results   - the starting totals, indexed 0 to 3

   For every tuple its count is added to
     slot 0 when both values are equal and not 'no-evidence',
     slot 1 when the tested value is not 'no-evidence',
     slot 2 when the gold value is not 'no-evidence',
     slot 3 always."
  [tuples gold-att other-att results]
  (letfn [(add-when [acc hit? slot amount]
            (if hit?
              (assoc acc slot (+ amount (get acc slot)))
              acc))
          (tally [acc tuple]
            (let [values (first tuple)
                  n      (second tuple)
                  gold   (gold-att values)
                  other  (other-att values)]
              (-> acc
                  (add-when (and (= gold other) (not= gold "no-evidence")) 0 n)
                  (add-when (not= other "no-evidence") 1 n)
                  (add-when (not= gold "no-evidence") 2 n)
                  (add-when true 3 n))))]
    (reduce tally results tuples)))

(defn prec-rec-cov
  "Read a local count file and compare a gold-standard attribute with
   another attribute. Both attribute names are turned into keywords.
   Returns the vector [correct-found all-found all-correct universe]."
  [local-count gold-att other-att]
  (prec-rec-cov-helper (count-tuples local-count)
                       (keyword gold-att)
                       (keyword other-att)
                       [0 0 0 0]))

(defn coverage-only-helper
  "Accumulate coverage counts for a single attribute.

   tuples  - the tuples that store the counts, each (value-map count)
   att     - the attribute, used as a function on the value map
   results - the starting totals, indexed 0 to 3

   For every tuple its count is added to slot 1 when the value is not
   'no-evidence', and always to slot 3. Slots 0 and 2 are left untouched."
  [tuples att results]
  (letfn [(bump [acc slot amount]
            (assoc acc slot (+ amount (get acc slot))))
          (tally [acc tuple]
            (let [n       (second tuple)
                  covered (if (= (att (first tuple)) "no-evidence")
                            acc
                            (bump acc 1 n))]
              (bump covered 3 n)))]
    (reduce tally results tuples)))

(defn coverage-only
  "Read a local count file and compute coverage for one attribute.
   The attribute name is turned into a keyword. Returns a four-slot vector
   in which only slot 1 (informative count) and slot 3 (total count) are
   filled in; slots 0 and 2 stay 0."
  [local-count att]
  (coverage-only-helper (count-tuples local-count)
                        (keyword att)
                        [0 0 0 0]))

(defn single-att-prec-rec
  "Compare an attribute in a given multi-directory with its annotated value.

   att           - the attribute to compare
   test-json-dir - the name of the multi-att directory

   If ftu/hadoop-ls does not return 0 for the current json.att.val of
   test-json-dir, the free-text run is started first. If ftu/hadoop-ls
   returns 0 for <att>/json.train.val, the test values are merged with the
   gold values and the result of prec-rec-cov is returned; otherwise only
   the attribute is projected and the result of coverage-only is returned."
  [att test-json-dir]
  (let [att-name    (as-str att)
        gold-name   (str att-name "-gold")
        test-dir    (str qry/*results-dir* "/" test-json-dir "/" ftu/date-str "/json.att.val")
        train-dir   (str qry/*results-dir* "/" att-name)
        gold-dir    (str train-dir "/json.train.val")
        merge-dir   (str train-dir "/json.merge")
        local-count (str att-name ".count")
        ;; presumably a 0 result means the listing succeeded -- confirm in ftu
        ls-zero?    (fn [dir] (= (ftu/hadoop-ls dir) 0))]
    (when-not (ls-zero? test-dir)
      (qry/run-ft {:atts aut/*atts-to-run* :dir test-json-dir}))
    (let [gold? (ls-zero? gold-dir)]
      (if gold?
        (join-json-atts-q merge-dir test-dir att-name gold-dir gold-name)
        (project-json-atts-q merge-dir test-dir att-name))
      (count-json train-dir "json.merge" local-count)
      (if gold?
        (prec-rec-cov local-count gold-name att-name)
        (coverage-only local-count att-name)))))
