(ns integral.feed
  "Read the Integral feed"
  (:require [kixipipe.protocols         :as kixi]
            [kixipipe.misc              :as misc]
            [kixipipe.digest            :as digest]
            [kixipipe.transport.sftp    :as sftp]
            [kixipipe.ioplus            :as ioplus]
            [clojure.set                :as set]
            [clojure.java.io            :as io]
            [clj-time.format            :as tf]
            [clojure.string             :as str]
            [com.stuartsierra.component :as component]
            [schema.core                :as s]
            [clojure.tools.logging      :as log]
            [potemkin]))

;; Re-export the sftp `with-session` macro from this namespace.
(potemkin/import-macro kixipipe.transport.sftp/with-session)

;; Formatter for dates written as yyyyMMdd (e.g. 20140131); used by the
;; date-parsing helpers in this namespace.
(def ^:private integral-date-formatter (tf/formatter "yyyyMMdd"))

;; Value stored under :src-name on every item built by this namespace.
(def ^:private SRC_NAME "integral")
;; Maps a feed name to the identifier "5972".
;; NOTE(review): presumably "5972" is the prefix used in remote file names - confirm.
(def ^:private FEED_NAME_OVERRIDES {"integral" "5972"})
;; Inverse of FEED_NAME_OVERRIDES: identifier -> feed name.
(def ^:private REV_FEED_NAME_OVERRIDES (set/map-invert FEED_NAME_OVERRIDES))

(defn scrub-null
  "Returns a lazy seq of the columns of `row` with every occurrence of the
  placeholder texts \"null\" and \"N/A\" removed (in that order). Each column
  must be a string."
  [row]
  (let [placeholders ["null" "N/A"]
        strip-placeholders (fn [col]
                             (reduce (fn [text placeholder]
                                       (str/replace text placeholder ""))
                                     col
                                     placeholders))]
    (map strip-placeholders row)))

(defn discard-bad-ids
  "Cleans the id held in column 6 of `row`.

  The column is replaced by the first run of digits found in it (as a
  string), or by the integer 0 when it contains no digits at all. All other
  columns are left untouched."
  [row]
  (let [id-column 6
        digits    (re-find #"\d+" (nth row id-column))]
    ;; re-find yields nil when there is no digit run, so fall back to 0.
    (assoc row id-column (or digits 0))))

(defn scrub-urls
  "Shortens an over-long url in column 4 of `row`.

  When the value in column 4 is longer than 512 characters it is cut down to
  its first 450 characters; otherwise the row is returned unchanged."
  [row]
  (let [url (nth row 4)]
    (if (< 512 (count url))
      (assoc row 4 (subs url 0 450))
      row)))

(defn- apply-name-overrides
  "Returns `item` with its :feed-name replaced by the matching entry of
  REV_FEED_NAME_OVERRIDES; the name is kept as-is when no override exists."
  [item]
  (update-in item [:feed-name] #(get REV_FEED_NAME_OVERRIDES % %)))

(declare download-integral-item!)
;; One file of the Integral feed.
;;
;; Implements kixi/FeedItem so the item can be downloaded through a session,
;; and clojure.java.io/Coercions so the item can be handed to io functions as
;; the file `dir`/`filename`.
(defrecord IntegralFeedItem [src-name feed-name dir filename checksum metadata]
  kixi/FeedItem
  (download! [this session]
    (download-integral-item! session this))
  io/Coercions
  (as-file [item] (io/file (:dir item) (:filename item)))
  ;; The Coercions contract is to return a java.net.URL (not a URI), so
  ;; delegate to the File implementation, which performs the conversion.
  (as-url [item] (io/as-url (io/as-file item))))

(declare integral-feed-details)
;; Session for the Integral feed.
;;
;; As a component it has nothing to set up or tear down: start and stop only
;; print a message and return the session unchanged. Feed listing is
;; delegated to integral-feed-details.
(defrecord IntegralSession [config]
  component/Lifecycle
  (start [session]
    (println "Starting Integral Session.")
    session)
  (stop [session]
    (println "Stopping Integral Session.")
    session)
  kixi/FeedSession
  (feed-details [session feed-name options]
    (integral-feed-details session feed-name options)))

(defn- valid?
  "Parses the string `date` with integral-date-formatter (yyyyMMdd).

  Returns the parsed date, or nil when parsing throws an
  IllegalArgumentException. Despite the name, the result is the parsed value
  rather than a boolean."
  [date]
  (try
    (tf/parse integral-date-formatter date)
    (catch IllegalArgumentException _
      nil)))

(defn- md5-value
  "Fetches `dir`/`filename`.md5 over sftp as a string and returns its first
  whitespace-separated token."
  [session dir filename]
  (let [md5-path (str dir "/" filename ".md5")
        contents (sftp/sftp-get-as-string session md5-path)]
    (-> contents
        (str/split #"\s+")
        first)))

;; Schema for the configuration map accepted by mk-session.
;;   :download-dir - must satisfy ioplus/exists-as-dir?
;;   :host, :user  - strings
;;   :ssh-key      - must satisfy ioplus/exists-as-file?
;;   :dir          - optional string
;; Any additional keyword key is allowed as long as its value is a string.
(def ^:private Config {:download-dir (s/pred ioplus/exists-as-dir? "<a directory>")
                       :host    String
                       :user    String
                       :ssh-key (s/pred ioplus/exists-as-file?)
                       (s/optional-key :dir) String
                       s/Keyword String})


(defn mk-session
  "Builds an IntegralSession from `config`.

  `config` is first validated against the Config schema (s/validate throws
  when it does not conform); its entries then become the entries of the
  returned session record."
  [{:keys [host user ssh-key dir download-dir] :as config}]
  (->> config
       (s/validate Config)
       map->IntegralSession))

(defn- filename-regex
  "Matches a whole file name of the form <name>_<d>_<d>_<yyyy>.dat.csv.

  Returns [full-match name date-part] on success, nil otherwise. Both dots of
  the extension are matched literally."
  [f]
  (re-matches #"(\w+)_(\d{1,2}_\d{1,2}_\d{4})\.dat\.csv" f))

(defn- filename->item
  "Builds an item map (:src-name, :filename, :feed-name, :date) from a file
  name accepted by filename-regex; returns nil for any other name. The feed
  name is passed through REV_FEED_NAME_OVERRIDES."
  [f]
  (when-let [[_ raw-name date-str] (filename-regex f)]
    ;; NOTE(review): date-str has the underscore-separated shape captured by
    ;; filename-regex (e.g. 1_2_2014) but is parsed with the yyyyMMdd
    ;; formatter - this looks like it cannot parse; confirm the intended
    ;; date format before relying on this function.
    (let [feed-name (get REV_FEED_NAME_OVERRIDES raw-name raw-name)
          date      (tf/parse integral-date-formatter date-str)]
      {:src-name SRC_NAME
       :filename f
       :feed-name feed-name
       :date date})))

(defn- reverse-sort-by-timestamp
  "Sorts feed items newest first, keyed on (-> item :metadata :timestamp).

  Uses clojure.core/compare with its arguments swapped instead of a
  reflective .compareTo call, so items without a timestamp sort last rather
  than throwing, and timestamps of mixed integer types compare correctly.
  The sort is stable: items with equal timestamps keep their input order."
  [xs]
  (sort-by (comp :timestamp :metadata)
           (fn [x y] (compare y x))
           xs))

(defn- feed-item-regex
  "Matches a whole file name of the form <4 digits>_<8 digits>_<2 digits>.dat.gz.

  Returns [full-match first-group second-group third-group] on success, nil
  otherwise. Both dots of the extension are matched literally, so names that
  merely have some other character in those positions are rejected."
  [filename]
  (re-matches #"(\d{4})_(\d{8})_(\d{2})\.dat\.gz" filename))

(defn- ->integral-feed-item
  "Returns a function that converts one listing entry (a map with :dir,
  :filename and :attrs) into an IntegralFeedItem.

  The returned function yields nil when the file name is not accepted by
  feed-item-regex or when the date part of the name does not parse. For
  accepted entries the checksum is fetched through md5-value using `session`,
  and the :mtime of :attrs is stored as the item's timestamp."
  [session]
  (fn [{:keys [dir filename attrs]}]
    (when-let [[matched-name feed-id date-str] (feed-item-regex filename)]
      (when-let [parsed-date (valid? date-str)]
        (map->IntegralFeedItem
         {:src-name SRC_NAME
          :encoding :gzip
          :feed-name feed-id
          :date parsed-date
          :dir dir
          :checksum (md5-value session dir matched-name)
          :filename matched-name
          :delimiter \u0001
          :metadata {:timestamp (:mtime attrs)}})))))

(defn- integral-feed-details
  "Lists the feed items found under the session's :dir, newest first.

  `feed-name` is only written to the log; it is not used to filter the
  result. When `options` contains a :date, only items whose :date equals it
  are returned. The first element of the sftp file listing is skipped."
  [session feed-name options]
  (let [{wanted-date :date} options
        remote-dir (:dir session)]
    (log/info "Retrieving details for feed " feed-name " with options " options)
    (let [entries (rest (sftp/sftp-file-seq session remote-dir))
          items   (reverse-sort-by-timestamp
                   (keep (->integral-feed-item session) entries))]
      (if wanted-date
        (filter #(= wanted-date (:date %)) items)
        items))))

(defn- download-integral-item!
  "Downloads the feed item `item` over `session`.

  The download is logged, the item's feed name is passed through
  apply-name-overrides, and the result of sftp/sftp-get-to-file is returned.
  Can be mapped over the results of feed-details:

  ```
  (doall (map (partial download-integral-item! session) (feed-details ...)))
  ```"
  [session item]
  (log/info "Downloading " item)
  (->> item
       apply-name-overrides
       (sftp/sftp-get-to-file session)))
