(ns leafgrabber.free-text.sample
  (:use [cascalog.api :only (lfs-textline hfs-textline deffilterop defmapcatop defbufferop <- ?<-)]
        [clojure.contrib.string :only (as-str)]
        [clojure.contrib.duck-streams :only (read-lines)]
        [clojure.contrib.shell-out :only (sh)]
        [clojure.contrib.str-utils :only (re-split)]
        [clojure.data.json :only (json-str read-json)]
        [clojure.set :only (union intersection difference)]
        )
  (:require [leafgrabber.core :as lgc]
            [leafgrabber.free-text.query :as qry]
            [leafgrabber.free-text.utils :as ftu]
            [leafgrabber.free-text.extractor :as ext]
            [leafgrabber.free-text.tables :as tbl]
            [leafgrabber.free-text.author :as aut]
       ))

(defn random-entries
  "create a random sample of the lines in a value file"
  [source-dir dest-dir count]
  (let [dest-file (str dest-dir "/part-00000")
        command1 (str "hadoop fs -cat " source-dir "/part-* | shuf -n " count " > temp.sample")
        command3 (str "hadoop fs -copyFromLocal temp.sample " dest-file)
        command4 (str "rm temp.sample")]
    (ftu/exec-command command1)
    (ftu/hadoop-delete dest-dir)
    (ftu/exec-command command3)
    (ftu/exec-command command4)
  ))

(defn validator-value
  "Get the set of validator values, prefixes, cores and suffixes for a
   given string and extractor, but only if there is a core match
   (contrast with validator-extractor-value below)"
  [page ext-str]
  (let [extractor ((keyword ext-str) tbl/*extractor-table*)
        validator (:validator extractor)
        val-value (apply validator (list page))]
    (apply concat (map #(if (ftu/get-json-item % 1)
                          (list (list ext-str
                                      (ftu/get-json-item % 0)
                                      (ftu/get-json-item % 1)
                                      (ftu/get-json-item % 2)
                                      (ftu/get-json-item % 3)))
                          ())
                       val-value))
    ))

(defmapcatop add-validator-values
  "Look up a URL and return the validator value, prefix, core
   and suffix for each core match it contains."
  [url ext-str]
  (let [extractor (tbl/*extractor-table* (keyword ext-str))
        filter (:filter extractor)
        content-only (:content-only extractor)
        xpath (apply str (rest (apply str (map #(str "|.//" %) filter))))
        page (ext/url-to-filtered-html url xpath content-only)]
    (if (empty? page)
      ()
      (validator-value page ext-str)))
  )

(defn core-match-q
  "Get all the records from a UUID-URL file that match the core for
   any of a given set of extractors, and add information useful for
   authoring"
  [sink-dir src-dir ext-str]
  (println)
  (println "getting all records that match the core for " ext-str)
  (println "putting them into " sink-dir)
  (println)
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?url ?ext ?ext-val ?pref ?core ?suff]
       ((hfs-textline src-dir) ?line)
       (ftu/get-field ?line 0 :> ?uuid)
       (ftu/get-field ?line 1 :> ?url)
       (add-validator-values ?url ext-str :> ?ext ?ext-val ?pref ?core ?suff))
  )

(defn run-core-match
  "Using UUID/URL pairs from src.urls, run an extractor and return information
   useful for authoring:

   UUID
   URL
   EXT     - extractor id
   EXT-VAL - the value returned by the extractor
   PREFIX  - the prefix characters returned by the extractor's validator
   CORE    - the the string that matched the core
   SUFFIX  - the suffix characters returned by the extractor's validator

   This returns only rows that match the core of the extractor"
  [ext out-dir]
  (let [base-dir (str qry/results-dir "/" (as-str ext) "/" ftu/date-str)
        in-dir (str base-dir "/src.urls")]
    (core-match-q out-dir in-dir (as-str ext))
  ))

(defn validator-extractor-value
  "Get the set of extractor values (as returned by the validator) for a
   given string and extractor. (Contrast with validator-value above)"
  [text ext-str]
  (let [extractor ((keyword ext-str) tbl/*extractor-table*)
        validator (:validator extractor)
        val-value (apply validator (list text))]
    (mapcat #(list (list ext-str (ftu/get-json-item % 0))) val-value)
   ))

(defmapcatop add-validator-extractor-values
  "operator for auth-against-training-q"
  [pref core suff ext-str]
  (let [text (str pref core suff)]
    (validator-extractor-value text ext-str)
    )
  )

(defn auth-against-training-q
  "Run the query that does the work for run-auth-against-ann-training"
  [sink src ext-str]
  (?<- sink [?uuid ?ext-out ?ext-val ?exp-val ?pref ?core ?suff ?url]
       (src ?line)
       (ftu/get-field ?line 0 :> ?uuid)
       (ftu/get-field ?line 1 :> ?url)
       (ftu/get-field ?line 2 :> ?ext)
       (ftu/get-field ?line 3 :> ?exp-val)
       (ftu/get-field ?line 4 :> ?pref)
       (ftu/get-field ?line 5 :> ?core)
       (ftu/get-field ?line 6 :> ?suff)
       (add-validator-extractor-values ?pref ?core ?suff ?ext :> ?ext-out ?ext-val)
       )
  )

(defn run-auth-against-ann-training
  [ext]
  "create the raw.train.auth directory which contains the results
   of running the training set through the attribute

   UUID
   EXT     - Extractor ID
   EXT-VAL - The value returned by the extractor
   EXP-VAL - The expected value, as found in the training set
   PREFIX  - The prefix, as returned by the validator
   CORE    - The core that matched, as returned by the validator
   SUFFIX  - The suffix, as returned by the validator"
  (let [base-dir (str qry/results-dir "/" (as-str ext))
        in-dir (str base-dir "/training.set")
        out-dir (str base-dir "/" ftu/date-str "/raw.train.auth")
        out-tap (hfs-textline out-dir)
        in-tap (hfs-textline in-dir)]
    (println)
    (println "Running " ext " against training.set")
    (println "Building " out-dir)
    (println)
    (ftu/hadoop-delete out-dir)
    (auth-against-training-q out-tap in-tap (as-str ext))
    ))

(defbufferop make-trained-json
  [tuples]
  (map #(json-str (apply hash-map %))
       tuples)
  )

(defn convert-train-set-q
  [sink-dir train-dir ext-name]
  (println)
  (println "Building " sink-dir)
  (println "from training set " train-dir)
  (println "on extractor " ext-name)
  (println)
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?url ?json]
       ((hfs-textline train-dir) ?line)
       (ftu/get-field ?line 0 :> ?uuid)
       (ftu/get-field ?line 1 :> ?url)
       (ftu/get-field ?line 3 :> ?val)
       (make-trained-json ext-name ?val :> ?json))
  )

(defn make-training-set
  [ext count]
  (let [base-dir (str qry/results-dir "/" (as-str ext))
        date-dir (str base-dir "/" ftu/date-str)
        core-match-dir (str date-dir "/core.match")
        train-set-dir (str base-dir "/training.set")
        src-urls-dir (str date-dir "/src.urls")]
    (ftu/hadoop-delete src-urls-dir)
    (qry/copy-uuid-urls (hfs-textline (str date-dir "/src.urls"))
                        (hfs-textline qry/sample-uuid-url-dir))
    (run-core-match ext core-match-dir)
    (random-entries core-match-dir train-set-dir count)
  ))

(defn convert-any-train-set
  [att in out new-att]
  (let [base-dir (str qry/results-dir "/" (as-str att))
        in-dir (str base-dir "/" in)
        out-dir (str base-dir "/" out)]
    (convert-train-set-q out-dir in-dir (as-str new-att))
    )
  )

(defn convert-train-set
  [train-ext]
  (let [json-train-dir (str qry/results-dir "/" (as-str train-ext) "/json.train.val")
        raw-train-dir (str qry/results-dir "/" (as-str train-ext) "/training.set")]
    (convert-train-set-q json-train-dir raw-train-dir (str (as-str train-ext) "-gold"))
  ))

(defn bool-ext-val
  "Get the extractor value for a string"
  [ext-key string]
  (let [extractor (ext-key tbl/*extractor-table*)
        classifier (:classifier extractor)
        value (apply classifier (list string))]
    (cond (= value "[1,0,0]") true
          (= value "[0,1,0]") false
          (= value "[0,0,1]") "no-evidence"
          true value)
    ))

(defn bool-match-context
  "Get the validator value for a string"
  [ext-key string]
  (let [extractor (ext-key tbl/*extractor-table*)
        validator (:validator extractor)]
    (apply validator (list string))
   ))

(defn ext-val
  [ext-key string]
  (let [extractor (ext-key tbl/*extractor-table*)
        classifier (:classifier extractor)]
    (apply classifier (list string))
   ))

(defn match-context
  [ext-key string]
  (let [extractor (ext-key tbl/*extractor-table*)
        validator (:validator extractor)]
    (apply validator (list string))
   ))

(defn re-val-match
  "Return true if a raw extractor value matches another given value.
   If the extractor value is a json map, return true"
  [ext-str re-value good-val]
  (let [extractor (tbl/*extractor-table* (keyword ext-str))
        comparer (:comparer extractor)]
    (apply comparer (list re-value good-val))
    )
  )

(defn seed-uuids-q
  "Get the rows from the src.urls directory where the UUID's value in the raw.ext.val
   directory is a particular value"
  [sink-dir raw-ext-dir src-url-dir good-val]
  (println)
  (println "Building seed directory")
  (println)
  (ftu/hadoop-delete sink-dir)
  (?<- (hfs-textline sink-dir) [?uuid ?url]
       ((hfs-textline raw-ext-dir) ?re-line)
       (ftu/get-field ?re-line 0 :> ?uuid)
       (ftu/get-field ?re-line 2 :> ?ext)
       (ftu/get-field ?re-line 3 :> ?re-value)
       (re-val-match ?ext ?re-value good-val)
       ((hfs-textline src-url-dir) ?su-line)
       (ftu/get-field ?su-line 0 :> ?uuid)
       (ftu/get-field ?su-line 1 :> ?url))
  )

(defn seed-contexts
  "1. Make a seed directory, which is all the UUID-URL pairs where the UUID
      has the given value for the given attribute

   2. Put authoring info (value, prefix, core, suffix) for the seed UUID-URLs
      into <att>.good.match
   3. Put authoring info for all UUID-URLs into <att>.all.match"
  [ext-key value]
  (let [use-date ftu/date-str
        ext-str (as-str ext-key)
        base-dir (str qry/results-dir "/" ext-str)
        date-dir (str base-dir "/" use-date)
        src-url-dir (str date-dir "/src.urls")
        raw-ext-dir (str date-dir "/raw.ext.val")
        seed-dir (str base-dir "/auth.seed")
        loc-all-match (str ext-str "." value ".all.match")
        core-dir (str base-dir "/all.match")
        loc-good-match (str ext-str "." value ".good.match")
        sugg-dir (str base-dir "/good.match")]
    (if (not (= (ftu/hadoop-ls raw-ext-dir) 0))
      (qry/run-ft {:ext ext-key}))
    (seed-uuids-q seed-dir raw-ext-dir src-url-dir value)
    (core-match-q sugg-dir seed-dir ext-str)
    (ftu/local-delete loc-good-match)
    (ftu/hadoop-copy-results sugg-dir loc-good-match)
    (core-match-q core-dir src-url-dir ext-str)
    (ftu/local-delete loc-all-match)
    (ftu/hadoop-copy-results core-dir loc-all-match)
    ))

(defn incr-map
  [map key]
  (assoc map key (+ 1 (or (map key) 0)))
  )

(defn tokenize-affix
  [affix]
  (let [tokens (re-split #"\b" affix)
        first-token (first tokens)
        last-token (last tokens)
        tokens2 (if (or (= first-token "")
                        (= first-token " "))
                  (rest tokens)
                  tokens)
        tokens3 (if (or (= last-token " ")
                        (= last-token ""))
                  (butlast tokens2)
                  tokens2)]
    tokens3
  ))

(defn collapse-first-tokens
  [token-list]
  (let [first-token (first token-list)
        second-token (second token-list)]
    (if (= second-token " ")
      (cons (str first-token second-token (nth token-list 2)) (rest (rest (rest token-list))))
      (cons (str first-token second-token) (rest (rest token-list))))
  ))

(defn parse-affix-line
  [tokens-left aff-maps]
  (let [first-token (first tokens-left)
        freq (:counts aff-maps)
        new-freq (incr-map freq first-token)
        parents (:parents aff-maps)]
    (if (= (count tokens-left) 1)
      {:counts new-freq
       :parents parents}
      (let [new-tokens-left (collapse-first-tokens tokens-left)]
        (recur new-tokens-left
               {:counts new-freq
                :parents (assoc parents (first new-tokens-left) first-token)})))
  ))

(defn parse-suffix-line
  [line suff-maps]
  (let [suff-tokens (if (= (count line) 7)
                      (tokenize-affix (nth line 6)))]
    (parse-affix-line suff-tokens suff-maps)
 ))

(defn parse-prefix-line
  [line pref-maps]
  (let [pref-tokens (reverse (tokenize-affix (nth line 4)))]
    (parse-affix-line pref-tokens pref-maps)
  ))

(defn parse-ctxt-file
  [ctxt-file]
  (let [raw-string (slurp ctxt-file)
        raw-lines (re-split #"\n" raw-string)
        split-lines (map #(re-split #"\t" %) raw-lines)]
    (loop [lines split-lines
           pref-maps {:counts {} :parents {}}
           suff-maps {:counts {} :parents {}}]
      (if (empty? lines)
        (vector pref-maps suff-maps)
        (recur (rest lines)
               (parse-prefix-line (first lines) pref-maps)
               (parse-suffix-line (first lines) suff-maps))
        )
      )
    )
  )

(defn discrim-parent?
  [affix good-counts all-counts parents]
  (let [parent (parents affix)
        good-count (good-counts parent)
        all-count (all-counts parent)]
    (and parent (= good-count all-count)))
  )

(defn prune-affixes
  "return a copy of good-counts that contains only those
   whose parent is not categorically discriminative"
  [good-counts all-counts parents]
  (loop [pruned-counts {}
         rest-good good-counts]
    (if (empty? rest-good)
      pruned-counts
      (let [next-good (first rest-good)
            affix (first next-good)
            good-count (second next-good)
            all-count (all-counts affix)]
        (if (discrim-parent? affix good-counts all-counts parents)
          (recur pruned-counts (rest rest-good))
          (recur (assoc pruned-counts affix {:good-count good-count
                                             :all-count all-count})
                 (rest rest-good))
          )
        ))
    )
  )

(defn immediate-descendent
  "Find the immediate descendent of ancestor which is
   an ancestor of descendent"
  [ancestor descendent parents]
  (let [parent (parents descendent)]
    (cond (= parent ancestor) descendent
          (= parent nil) nil
          true (recur ancestor parent parents))
  ))

(defn is-ancestor?
  [ancestor descendent parents]
  (cond (= ancestor descendent) true
        (nil? descendent) false
        true (recur ancestor (parents descendent) parents))
  )

(defn discriminators
  [affix1 affix2 parents]
  (let [parent1 (parents affix1)
        parent2 (parents affix2)]
    (cond (= parent1 parent2) [affix1 affix2]
          (nil? parent1) (recur affix1 parent2 parents)
          (nil? parent2) (recur parent1 affix2 parents)
          (is-ancestor? parent1 parent2 parents) [affix1 (immediate-descendent parent1 parent2 parents)]
          (is-ancestor? parent2 parent1 parents) [(immediate-descendent parent2 parent1 parents) affix2]
          true (recur parent1 parent2 parents))
  ))

(defn compare-affixes
  [affix1 affix2 suggs parents]
  (cond (is-ancestor? affix1 affix2 parents) -1
        (is-ancestor? affix2 affix1 parents) 1
        true (let [discriminators (discriminators affix1 affix2 parents)]
               (if (nil? discriminators)
                 (let [good-count1 (:good-count (suggs affix1))
                       good-count2 (:good-count (suggs affix2))]
                   (if (= good-count1 good-count2)
                     (compare affix1 affix2)
                     (compare good-count2 good-count1)))
                 (let [disc1 (first discriminators)
                       disc2 (second discriminators)
                       disc1-map (suggs disc1)
                       disc2-map (suggs disc2)
                       aff1-ratio (/ (:good-count disc1-map) (:all-count disc1-map))
                       aff2-ratio (/ (:good-count disc2-map) (:all-count disc2-map))]
                   (if (= aff1-ratio aff2-ratio)
                     (compare (:good-count disc2-map) (:good-count disc1-map))
                     (compare aff2-ratio aff1-ratio))
                   ))
               ))  
  )

(defn sorted-affixes
  [suggs parents]
  (sort #(compare-affixes (first %1) (first %2) suggs parents) suggs)
  )

(defn render-prefix
  [prefix-entry]
  (let [prefix (first prefix-entry)
        good-count (:good-count (second prefix-entry))
        all-count (:all-count (second prefix-entry))]
    (if prefix
      (str good-count " / " all-count "\t" (apply str (reverse (re-split #"\b" prefix))) "\n"))
    ))

(defn render-suffix
  [suffix-entry]
  (let [suffix (first suffix-entry)
        good-count (:good-count (second suffix-entry))
        all-count (:all-count (second suffix-entry))]
    (if suffix
      (str good-count " / " all-count "\t" suffix "\n"))
    ))

(defn sugg-from-files
  [good-ctxt-file all-ctxt-file]
  (let [good-analysis (parse-ctxt-file good-ctxt-file)
        all-analysis (parse-ctxt-file all-ctxt-file)
        pref-good-maps (first good-analysis)
        pref-good-counts (:counts pref-good-maps)
        pref-good-parents (:parents pref-good-maps)
        suff-good-maps (second good-analysis)
        suff-good-counts (:counts suff-good-maps)
        suff-good-parents (:parents suff-good-maps)
        pref-all-maps (first all-analysis)
        pref-all-counts (:counts pref-all-maps)
        pref-all-parents (:parents pref-all-maps)
        suff-all-maps (second all-analysis)
        suff-all-counts (:counts suff-all-maps)
        suff-all-parents (:parents suff-all-maps)
        pref-parents (merge pref-good-parents pref-all-parents)
        suff-parents (merge suff-good-parents suff-all-parents)
        pruned-prefixes (prune-affixes pref-good-counts pref-all-counts pref-parents)
        pruned-suffixes (prune-affixes suff-good-counts suff-all-counts suff-parents)
        sorted-prefixes (sorted-affixes pruned-prefixes pref-parents)
        sorted-suffixes (sorted-affixes pruned-suffixes suff-parents)]
    {:pruned-prefixes pruned-prefixes
     :pref-parents pref-parents
     :sorted-prefixes sorted-prefixes
     :pruned-suffixes pruned-suffixes
     :suff-parents suff-parents
     :sorted-suffixes sorted-suffixes}
  ))

(defn sugg-from-seed
  [ext value]
  (let [good-ctxt-file (str ext "." value ".good.match")
        all-ctxt-file (str ext "." value ".all.match")
        sugg-maps (sugg-from-files good-ctxt-file all-ctxt-file)
        sorted-prefixes (:sorted-prefixes sugg-maps)
        sorted-suffixes (:sorted-suffixes sugg-maps)
        pref-sugg-file (str ext "." value ".sugg.by.pref")
        suff-sugg-file (str ext "." value ".sugg.by.suff")]
    (spit pref-sugg-file
          (apply str (map render-prefix sorted-prefixes)))
    (spit suff-sugg-file
          (apply str (map render-suffix sorted-suffixes)))
  ))

(defn context-suggestions
  [ext value]
  (seed-contexts ext value)
  (sugg-from-seed (as-str ext) value)
  )