(ns morri.one-tx-per-gene
  (:require [morri.meth450k.common.command-line :as cli]
            [morri.meth450k.common.ucsc-db :as ucsc-db]
            [morri.meth450k.common.utils :as utils :refer [tprn]]
            [clojure.string :as str]
            [honeysql.core :as sql]
            [honeysql.helpers :refer :all])
  (:gen-class))

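;; Read the transcript (tx) list, look up the gene symbol (gs) for each
;; transcript, and count how often each gene symbol occurs.  Drop every
;; transcript whose gene symbol does not occur exactly once.
;; For the knownGene model the gene symbol is read from kgXref.geneSymbol.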

(def gene-model-options
  "Set of gene-model keywords accepted by the --gene-model option."
  #{:wgEncodeGencodeBasicV17 :knownGene})

(def options-config
  "Command-line option specifications that `-main` passes to
  `cli/parse-command-line`.  Each entry is a vector of short flag, long
  flag, description and optional keyword settings."
  (let [gene-model-doc (str "Database for gene model, choose from "
                            gene-model-options)]
    [;; Boolean flag, off unless given.
     ["-h" "--help" "Keep only transcripts from genes with a single tx in the list?"
      :default false :flag true]
     ["-i" "--input-file" "Transcript input File"]
     ["-o" "--output-file" "Output File"]
     ;; The parse function is built by cli/validate from the allowed set.
     ["-g" "--gene-model" gene-model-doc
      :default :knownGene
      :parse-fn (cli/validate gene-model-options)]]))

(defn lookup-gene-symbol-ucsc
  "Query the kgXref table for the geneSymbol of the row whose kgID equals
  `tx` and return the pair `[tx gene-symbol]`.  Only the first row of the
  query result is consulted; the second element is nil when there is no
  such row."
  [tx]
  (let [formatted (sql/format (where (from (select :geneSymbol) :kgXref)
                                     [:= :kgID tx]))
        row (first (ucsc-db/ucsc-query formatted))]
    [tx (:geneSymbol row)]))

;; Gencode identifies the gene of a transcript by a geneId
(defn lookup-gene-id-gencode
  "Query the wgEncodeGencodeAttrsV17 table for the geneId of the row whose
  transcriptId equals `tx` and return the pair `[tx gene-id]`.  Only the
  first row of the query result is consulted; the second element is nil
  when there is no such row."
  [tx]
  (let [formatted (sql/format (where (from (select :geneId)
                                           :wgEncodeGencodeAttrsV17)
                                     [:= :transcriptId tx]))
        row (first (ucsc-db/ucsc-query formatted))]
    [tx (:geneId row)]))

;; Gencode also stores a geneName (the gene symbol) for each transcript
(defn lookup-gene-symbol-gencode
  "Query the wgEncodeGencodeAttrsV17 table for the geneName of the row
  whose transcriptId equals `tx` and return the pair `[tx gene-name]`.
  Only the first row of the query result is consulted; the second element
  is nil when there is no such row."
  [tx]
  (let [formatted (sql/format (where (from (select :geneName)
                                           :wgEncodeGencodeAttrsV17)
                                     [:= :transcriptId tx]))
        row (first (ucsc-db/ucsc-query formatted))]
    [tx (:geneName row)]))

(defn one-tx-per-gene
  "Read transcript ids (one per line) from `input-file`, look up the gene
  of each one with the lookup function chosen by `gene-model`, and write
  to `output-file` (one id per CSV row) only those transcripts whose gene
  is associated with exactly one transcript in the input.

  Takes a single options map with the keys :input-file, :output-file and
  :gene-model.  :gene-model must be :wgEncodeGencodeBasicV17 or
  :knownGene; any other value makes `case` throw, since there is no
  default clause.

  Rows are written in the order the transcripts first appear in the input
  file, and each distinct transcript is looked up only once."
  [{:keys [input-file output-file gene-model]}]
  (let [;; De-duplicate up front: repeated ids would collapse to one map
        ;; key anyway, so querying them again is wasted work.  `distinct`
        ;; keeps first-occurrence order, which the output then follows.
        input-txs (distinct (str/split-lines (slurp input-file)))
        lookup-gene-symbol (case gene-model
                             :wgEncodeGencodeBasicV17
                             lookup-gene-id-gencode
                             :knownGene
                             lookup-gene-symbol-ucsc)
        ;; Each lookup returns a [tx gene] pair, so the pairs pour
        ;; straight into a tx -> gene map.
        tx->gs (into {} (map lookup-gene-symbol input-txs))
        ;; NOTE(review): transcripts whose lookup yields a nil gene all
        ;; share the nil key here, so a lone unmatched transcript is
        ;; treated as the only transcript of its gene -- confirm intended.
        gs-freqs (frequencies (vals tx->gs))
        ;; Filter the ordered input list rather than (keys tx->gs), whose
        ;; order is that of the hash map and not of the input file.
        one-tx-txs (filter #(= 1 (gs-freqs (tx->gs %))) input-txs)]
    (utils/csv-write output-file (map vector one-tx-txs))))

(defn -main
  "Entry point: parse the command-line `args` against `options-config`
  and hand the resulting options to `one-tx-per-gene`."
  [& args]
  (-> args
      (cli/parse-command-line options-config)
      one-tx-per-gene))
