(ns morri.affy-annot
  (:require [morri.meth450k.common.command-line :as cli]
            [morri.meth450k.common.utils :as utils :refer [tprn]]
            [morri.meth450k.common.db-utils :as db-utils]
            [clojure.data.csv :as csv]
            [clojure.java.io :as io]
            [clojure.string :as str]
            [clojure.pprint :refer [pprint]]
            [clojure.java.jdbc :as jdbc]
            [java-jdbc.ddl :as ddl]
            [clojure.walk :as walk]
            [clojure.edn :as edn]
            [monger.core :as mg]
            [monger.collection :as mc]
            [monger.multi.collection :as mmc]
            [monger.operators :refer :all]
            [clojure.repl :refer [doc]]
            [clojure.set :as set])
  (:gen-class))

(def transcript-column-headers
  "Keys assigned, in order, to the columns of each data row read from
  the transcript CSV (they are zipped onto every row in `affy-annot`)."
  [:tc-id :ps-id
   :seqname :strand :start :stop
   :total-probes
   :gene-assignment :mrna-assignment
   :swiss-prot :unigene
   :go-biological-process :go-cellular-component :go-molecular-function
   :pathway :protein-domains
   :category])

;; Sub-field keys shared by the three GO annotation columns.
(def go-multipart [:accession :go-id :go-term :go-evidence])

;; For each multipart column, the keys given (in order) to the
;; "//"-separated sub-fields of every "///"-separated entry; see
;; multipart-merge-fn.  Columns not listed here are kept as plain values.
;;
;; NOTE: :gene-assignment used to start with the misspelled key
;; :accesion.  It is now :accession, like every other multipart field.
;; Collections imported before this fix still carry the old key and
;; need to be re-imported.
(def transcript-multipart-schema
  {:gene-assignment [:accession :gene-symbol :gene-title :cytoband :entrez-gene-id]
   :mrna-assignment [:accession :source-name :description :assignment-seqname
                     :assignment-score :assignment-coverage :direct-probes
                     :possible-probes :assignment-xhyb]
   :swiss-prot [:accession :swiss-prot-accession]
   :unigene [:accession :unigene-id :unigene-expr]
   :go-biological-process go-multipart
   :go-cellular-component go-multipart
   :go-molecular-function go-multipart
   :pathway [:accession :source :pathway-name]
   :protein-domains [:accession :source :pfam-accession :domain-description]})

(def transcript-integer-fields
  "Keys whose values `affy-annot` parses with `edn/read-string`, wherever
  they occur in a row or in its nested multipart entries."
  [;; top-level columns
   :tc-id :ps-id :start :stop :total-probes
   ;; inside :gene-assignment entries
   :entrez-gene-id
   ;; inside :mrna-assignment entries
   :assignment-score :assignment-coverage
   :direct-probes :possible-probes
   :assignment-xhyb])

;; Command-line option specs handed to cli/parse-command-line in -main.
;; Each spec is [short-flag long-flag description & settings].
(def options-config
  [["-h" "--help"
    "Parse Affy transcript.csv and probeset.csv annotation files"
    :default false
    :flag true]
   ["-i" "--input-file" "Affymetrix transcript.csv"]
   ["-d" "--mongo-db-name" "MongoDB Database"]
   ["-c" "--mongo-coll" "MongoDB Collection"]])

(defn multipart-merge-fn
  "Expand a multipart field string into a vector of maps.

  `field-str` is split into entries on \"///\" and each entry into
  sub-fields on \"//\" (surrounding whitespace is swallowed by both
  separators).  The sub-fields of every entry are zipped onto the keys
  in `schema`.  Returns nil when `field-str` is nil."
  [schema field-str]
  (when field-str
    (let [entries (str/split field-str #"\s*///\s*")
          entry->map (fn [entry]
                       (zipmap schema (str/split entry #"\s*//\s*")))]
      (mapv entry->map entries))))

(defn postwalk-val-fn
  "Walk the tree `x`; in every map that holds a truthy value under
  `lookup-key`, replace that value with `(f value)`.

  Values that are nil or false are left untouched, so `f` never has to
  cope with missing data.  `lookup-key` may be any kind of map key
  (keyword, string, number, ...).  Returns the updated tree."
  [f x lookup-key]
  (let [update-fn (fn [node]
                    ;; `get` rather than calling the key as a function,
                    ;; so non-keyword keys work too
                    (if-let [v (and (map? node) (get node lookup-key))]
                      (assoc node lookup-key (f v))
                      node))]
    (walk/postwalk update-fn x)))

(defn update-in-tree-at-keys
  "Return the nested structure `m` with `f` applied to the value stored
  under each key of `key-coll`, at any depth.  The tree is walked once
  per key, in the order the keys appear in `key-coll`."
  [key-coll f m]
  (reduce (fn [tree k]
            (postwalk-val-fn f tree k))
          m
          key-coll))

(defn num->bool
  "True when `n` equals 1, false for anything else."
  [n]
  (= 1 n))

(defn split-unigene-expr
  "Split the string `s` on \"|\" (plus any whitespace following it) and
  return the pieces as a vector of strings."
  [s]
  (let [separator #"\|\s*"]
    (str/split s separator)))

(defn drop-affy-header-csv
  "Given parsed CSV rows `a`, skip the leading rows whose first cell
  starts with \"#\" and then skip one more row.  Returns the remaining
  rows as a lazy sequence."
  [a]
  (let [comment-row? (fn [row] (re-find #"^#" (first row)))]
    (->> a
         (drop-while comment-row?)
         (drop 1))))


;; see ExonArray_NetAffx-CSV-Files.README.txt

;; This whole thing runs really slowly.  So far I haven't found a
;; single hot spot, but all the tree walking is obviously not helping.
;; Expect about an hour or so to read in transcript.csv.  If I alter
;; it to work with probeset.csv it might take overnight.

(defn affy-annot
  "Load the Affymetrix transcript annotation CSV at `input-file` into the
  MongoDB collection `mongo-coll` of the database `mongo-db-name`.

  Any existing collection of that name is dropped first.  Each data row
  is keyed by `transcript-column-headers`, its multipart columns are
  expanded according to `transcript-multipart-schema`, every \"---\"
  value becomes nil, and the fields in `transcript-integer-fields` are
  parsed with `edn/read-string` before the row is inserted.  Afterwards
  one stored record and the collection's record count are printed.
  Returns nil.

  The MongoDB connection opened here is closed before returning, even
  if the import fails part-way."
  [{:keys [input-file mongo-db-name mongo-coll]}]
  (with-open [rdr (io/reader input-file)]
    (let [as-array (utils/show-progress 100 (drop-affy-header-csv
                                             (csv/read-csv rdr)))
          ;; every step below is a lazy `map`, so rows are transformed
          ;; one at a time as the insert loop consumes them
          tc-data (->> as-array
                       (map (partial zipmap transcript-column-headers))
                       (map #(merge-with multipart-merge-fn
                                         transcript-multipart-schema
                                         %))
                       (map #(walk/postwalk-replace {"---" nil} %))
                       (map #(update-in-tree-at-keys
                              transcript-integer-fields
                              edn/read-string
                              %))
                       ;; runs after the integer parse, so the value is
                       ;; already a number here
                       (map #(update-in-tree-at-keys
                              [:assignment-xhyb]
                              num->bool
                              %))
                       (map #(update-in-tree-at-keys
                              [:unigene-expr]
                              split-unigene-expr
                              %)))
          conn (mg/connect)]
      (try
        (let [mg-db (mg/get-db conn mongo-db-name)]
          (mmc/drop mg-db mongo-coll)
          (doseq [row tc-data]
            (mmc/insert mg-db mongo-coll row))
          (pprint (mmc/find-one-as-map mg-db mongo-coll {}))
          (println "Inserted" (mmc/count mg-db mongo-coll)
                   "records into" mongo-db-name "/" mongo-coll))
        (finally
          ;; close the driver's client directly so its resources are
          ;; released instead of leaking one connection per call
          (.close conn))))))

(defn -main
  "Entry point: parse `args` against `options-config` and pass the
  resulting options to `affy-annot`."
  [& args]
  (-> args
      (cli/parse-command-line options-config)
      affy-annot))


;;; Below here is a related function to output a unique mapping
;;; between refseq id and Affymetrix transcript cluster.


;; so what we want is a tc => refseq map
(defn drop-affy-header
  "Given a sequence of text lines `a`, skip the leading lines that start
  with \"#\" and then skip one more line.  Returns the remaining lines
  as a lazy sequence."
  [a]
  (let [comment-line? (fn [line] (re-find #"^#" line))]
    (->> a
         (drop-while comment-line?)
         (drop 1))))

(defn read-mps
  "Read `mps-file` and return a set holding the first tab-separated
  field of every line after the header (see `drop-affy-header`), each
  parsed with `edn/read-string`."
  [mps-file]
  (let [first-field (fn [line] (first (str/split line #"\t")))]
    (->> (slurp mps-file)
         str/split-lines
         drop-affy-header
         (map first-field)
         (map edn/read-string)
         set)))

(defn- mps-resource
  "File object for `filename` inside resources/HuEx-1_0-st-v2."
  [filename]
  (io/file "resources" "HuEx-1_0-st-v2" filename))

;; The four .mps files, one per annotation level.
(def core-mps-file (mps-resource "HuEx-1_0-st-v2.r2.dt1.hg18.core.mps"))
(def comp-mps-file (mps-resource "HuEx-1_0-st-v2.r2.dt1.hg18.comprehensive.mps"))
(def ext-mps-file (mps-resource "HuEx-1_0-st-v2.r2.dt1.hg18.extended.mps"))
(def full-mps-file (mps-resource "HuEx-1_0-st-v2.r2.dt1.hg18.full.mps"))

;; I only need core and comp. Because I did the rna analysis with
;; comp.  Let's check that core is a subset of comp

;; (let [core-mps (read-mps core-mps-file)
;;       comp-mps (read-mps comp-mps-file)
;;       added-in-comp (set/difference comp-mps core-mps)]
;;   (println "There are" (count core-mps) "probes in core")
;;   (println "There are" (count comp-mps) "probes in comp")
;;   (println "There are" (count (set/intersection core-mps comp-mps)) "probes in both")
;;   (println "It is" (= (set/intersection core-mps comp-mps) core-mps)
;;            "that all the probes in core are in comp")
;;   (println "There are" (count added-in-comp) "extra probes in the comp"))

(defn tc-priority
  "Pick one element of `a` by priority.

  `mps-sets-in-order` is a sequence of sets, highest priority first.
  Returns the first element of `a` contained in the earliest set that
  matches anything; among several matches in the same set the one
  appearing first in `a` wins.  Returns nil when nothing matches."
  [mps-sets-in-order a]
  (let [members-of (fn [mps]
                     (filter (fn [x] (contains? mps x)) a))]
    (->> mps-sets-in-order
         (map members-of)
         flatten
         first)))

;; Right now (in ruby) I'm taking tc -> refseq, when the refseq is the
;; first entry in mrna-assignment and xhyb is false.  Then I'm
;; essentially inverting the hash while giving priority to the core
;; and then the comprehensive probeset when a refseq points to more
;; than one tc.  At least this way I know that only one refseq points
;; to a tc.  I could also invert the other entries in mrna-assignment
;; to get more nm->tc options, but then I would have multiple nm
;; potentially pointing to a single tc.  Currently, I'm interested in
;; unique tss so this approach seems appropriate

;; Redo this, first making a tc->[nm] with all the good refseq
;; assignments for each tc.  What is "good"?  :assignment-xhyb false
;; and :assignment-score 100.  Then for each nm we want to order the
;; tcs by coverage.  Does this mean looking up the NM in each tc and
;; then ordering the tcs by coverage?  Perhaps we can do this somewhat
;; quickly, since we already have all the results in memory.

(defn good-assignment?
  "True when the assignment map `m` has :source-name \"RefSeq\",
  :assignment-xhyb false and :assignment-score 100; false otherwise."
  [m]
  (let [wanted ["RefSeq" false 100]
        actual ((juxt :source-name :assignment-xhyb :assignment-score) m)]
    (= wanted actual)))

(defn lookup-coverage
  "Return the :assignment-coverage of the first assignment stored under
  `tc-id` in `tc->assignment` whose :accession equals `refseq-id`, or
  nil when there is no such assignment."
  [tc->assignment refseq-id tc-id]
  (->> (get tc->assignment tc-id)
       (filter (fn [assignment] (= refseq-id (:accession assignment))))
       (map :assignment-coverage)
       first))

(defn sort-by-coverage
  "Given a `[refseq-id tc-coll]` pair, return `[refseq-id sorted-tcs]`
  where the tcs are ordered by their coverage of `refseq-id` (as found
  by `lookup-coverage`), using `sort-by`'s default ordering."
  [tc->assignment [refseq-id tc-coll]]
  (let [coverage-of (fn [tc-id]
                      (lookup-coverage tc->assignment refseq-id tc-id))]
    [refseq-id (sort-by coverage-of tc-coll)]))

(defn require-max-coverage
  "Given a `[refseq-id tc-coll]` pair, return `[refseq-id best-tcs]`,
  where `best-tcs` are the tcs whose coverage of `refseq-id` (as found
  by `lookup-coverage`) equals the highest coverage among them.

  Tcs with no known coverage (nil) are ignored when finding the
  maximum.  If no tc has a known coverage they are all kept, since none
  can be preferred.  An empty `tc-coll` yields an empty result rather
  than an error."
  [tc->assignment [refseq-id tc-coll]]
  (let [coverage-of (partial lookup-coverage tc->assignment refseq-id)
        ;; look each coverage up once and keep it next to its tc
        tc+coverage (map (juxt identity coverage-of) tc-coll)
        known-coverages (remove nil? (map second tc+coverage))
        ;; `max` needs at least one argument and cannot compare nil
        max-coverage (when (seq known-coverages)
                       (apply max known-coverages))]
    [refseq-id
     (for [[tc coverage] tc+coverage
           :when (= max-coverage coverage)]
       tc)]))

(defn good-tx-as-maps
  "For the transcript record `m`, return one single-entry map
  `{accession [tc-id]}` per entry of its :mrna-assignment that passes
  `good-assignment?`."
  [m]
  (let [tc-id (:tc-id m)
        as-entry (fn [assignment] {(:accession assignment) [tc-id]})]
    (->> (:mrna-assignment m)
         (filter good-assignment?)
         (map as-entry))))

;; MongoDB database and collection names; run-me passes the same literals.
(def mongo-db-name
  "affy")

(def mongo-coll
  "transcripts")

;; Problem: NM_001630 (ANXA8L2) should match to 3245172, but it is
;; missing from nm_to_tc.  Diagnosis: it seems to be missing from
;; comp-set.  I'm not sure why, but in the UCSC genome browser this tc
;; is listed as core, whereas for me it is only available from
;; extended.mps, so, so far, we are doing OK.

;; next looking at NM_001012507, it is only available from the
;; extended probesets.

(defn nm->tc
  "Build the SQLite table nm_to_tc (columns refseq, tc_id) in the
  database file `sqlite-output`, mapping each RefSeq accession to a
  single transcript cluster (tc).

  Transcript records are read from the collection `mongo-coll`
  (\"transcripts\" when not given) of the MongoDB database
  `mongo-db-name`.  Only tcs listed in the comprehensive .mps file are
  considered.  For each accession with a good assignment (see
  `good-assignment?`) the tcs with the best coverage are kept, and among
  those a tc from the core .mps file is preferred.

  The table is dropped and recreated on every call.  The resulting
  pairs are pretty-printed and the table's row count, as reported by
  `db-utils/count-rows`, is returned.  The MongoDB connection opened
  here is closed before returning, even on failure."
  [{:keys [mongo-db-name mongo-coll sqlite-output]}]
  (let [comp-set (read-mps comp-mps-file)
        conn (mg/connect)]
    (try
      (let [mg-db (mg/get-db conn mongo-db-name)
            db-results (utils/show-progress
                        1000
                        (mmc/find-maps
                         mg-db
                         ;; honour the caller's collection; this used to
                         ;; be hard-coded, so keep the old name as fallback
                         (or mongo-coll "transcripts")
                         {} [:tc-id :mrna-assignment]))
            comp-only (filter #(comp-set (:tc-id %)) db-results)
            good-tx-maps (flatten (map good-tx-as-maps comp-only))
            ;; accession -> distinct tcs having a good assignment to it
            nm->tx (apply merge-with (comp distinct into) good-tx-maps)
            tc->assignment (into {} (for [row db-results]
                                      [(:tc-id row) (:mrna-assignment row)]))
            with-max-coverage (map (partial require-max-coverage tc->assignment)
                                   nm->tx)
            ;; now give priority to core
            core-set (read-mps core-mps-file)
            added-in-comp (set/difference comp-set core-set)
            priority-filtered (map
                               (fn [[k v]]
                                 [k (tc-priority [core-set added-in-comp] v)])
                               with-max-coverage)
            sqlite-db (db-utils/sqlite-db sqlite-output)
            sqlite-table :nm_to_tc]
        (db-utils/try-drop-table sqlite-db sqlite-table)
        (jdbc/db-do-commands sqlite-db (ddl/create-table
                                        sqlite-table
                                        [:refseq "text"]
                                        [:tc_id "integer"]))
        (apply jdbc/insert! sqlite-db sqlite-table [:refseq :tc_id] priority-filtered)
        (pprint priority-filtered)
        (db-utils/count-rows sqlite-db sqlite-table))
      (finally
        ;; every lazy sequence backed by the MongoDB cursor has been
        ;; consumed by the insert above, so the client can be closed
        (.close conn)))))

(defn run-me
  "Run `nm->tc` against the \"affy\" database's \"transcripts\"
  collection, writing to results/nm_to_tc.sqlite."
  []
  (let [options {:mongo-db-name "affy"
                 :mongo-coll "transcripts"
                 :sqlite-output "results/nm_to_tc.sqlite"}]
    (nm->tc options)))

;; make a new ens->tc that will work with enst and ensg



;; acc->tc-id (map (fn [[k v]]))
;; acc->tc-id (map (fn [m] {(get-in m [:mrna-assignment 0 :accession]) [(:tc-id m)]})
;;                 db-result)
;; nm->tc-hash (apply merge-with into acc->tc-id)
;; core-mps (read-mps core-mps-file)
;; comp-mps (read-mps comp-mps-file)
;; added-in-comp (set/difference comp-mps core-mps)
;; priority-filtered (map (fn [[k v]] [k (tc-priority [core-mps comp-mps] v)])
;;                        nm->tc-hash)
;; nil-filtered (filter (comp not nil? second) priority-filtered)
    ;;     sqlite-db (db-utils/sqlite-db sqlite-output)
;;     sqlite-table :nm_to_tc
;; (println (count nm->tc-hash) "refseqs")
;; (println (count nil-filtered) "after filtering for presence in core or comp")
;; (db-utils/try-drop-table sqlite-db sqlite-table)
;; (jdbc/db-do-commands sqlite-db (ddl/create-table
;;                                 sqlite-table
;;                                 [:refseq "text"]
;;                                 [:tc_id "integer"]))
;; (apply jdbc/insert! sqlite-db sqlite-table [:refseq :tc_id] nil-filtered)
;; (db-utils/count-rows sqlite-db sqlite-table)
