(ns integral.feed
  "Read the Integral feed"
  (:require [kixipipe.protocols         :as kixi]
            [kixipipe.misc              :as misc]
            [kixipipe.digest            :as digest]
            [kixipipe.transport.sftp    :as sftp]
            [kixipipe.ioplus            :as ioplus]
            [kixipipe.data.scrub        :as scrub]
            [clojure.set                :as set]
            [clojure.java.io            :as io]
            [clj-time.format            :as tf]
            [clojure.string             :as str]
            [com.stuartsierra.component :as component]
            [schema.core                :as s]
            [clojure.tools.logging      :as log]
            [potemkin]))

;; Make kixipipe's `with-session` macro available from this namespace as well.
(potemkin/import-macro kixipipe.transport.sftp/with-session)

;; Formatter for the 8-digit dates (yyyyMMdd) parsed by this namespace.
(def ^:private integral-date-formatter (tf/formatter "yyyyMMdd"))

;; Value stored under :src-name on every item built in this namespace.
(def ^:private SRC_NAME "integral")
;; Logical feed name -> name used remotely.
;; NOTE(review): "5972" is presumably the 4-digit prefix of the remote file
;; names (compare the first group of feed-item-regex) - confirm.
(def ^:private FEED_NAME_OVERRIDES {"integral" "5972"})
;; Inverse of FEED_NAME_OVERRIDES: remote name -> logical feed name.
(def ^:private REV_FEED_NAME_OVERRIDES (set/map-invert FEED_NAME_OVERRIDES))

;; Public set of the feed names this namespace knows about.
(def KNOWN_FEEDS #{"integral"})

(defn scrub-null
  "Returns `row` as a vector in which every standalone occurrence of
  `null` or `N/A` (case-insensitive, delimited by word boundaries) inside
  a cell has been replaced with the empty string."
  [row]
  (let [null-marker #"(?i)\b(null|N/A)\b"
        blank-out   (fn [cell] (str/replace cell null-marker ""))]
    (into [] (map blank-out) row)))

(defn discard-bad-ids
  "Blanks out the integer id columns of `row` when they are not purely
  numeric.

  Columns 6 (ext_adnetwork_id) and 7 (ext_campaign_id) are kept only when
  the whole cell consists of digits; any other value, including nil, is
  replaced with the empty string. Columns that do not exist in a short row
  are left alone, so truncated or blank rows pass through unchanged instead
  of throwing.

  Always returns a vector."
  [row]
  (let [ext_adnetwork_id-field 6
        ext_campaign_id-field  7
        digits-or-blank (fn [cell]
                          ;; re-matches returns the cell itself when it is
                          ;; all digits; some->> guards against nil cells.
                          (or (some->> cell (re-matches #"\d+")) ""))
        number-or-nil   (fn [r n]
                          ;; contains? on a vector is an index-range check.
                          (if (contains? r n)
                            (update r n digits-or-blank)
                            r))]
    (-> row
        (vec)
        (number-or-nil ext_adnetwork_id-field)
        (number-or-nil ext_campaign_id-field))))

(defn scrub-bad-substitutions
  "Returns `row` as a vector with every `${...}` placeholder removed from
  each cell. The match is non-greedy, so each placeholder is removed
  individually and the text between two placeholders is kept."
  [row]
  (let [placeholder #"\$\{.*?\}"]
    (into []
          (map (fn [cell] (str/replace cell placeholder "")))
          row)))

(defn scrub-urls
  "Truncates the url in column 4 of `row` to at most 450 characters.

  Rows too short to have a column 4 are returned unchanged (as a vector)
  rather than raising an out-of-bounds error or growing an extra cell.
  A nil url cell is normalised to the empty string."
  [row]
  (let [url-field 4
        cells     (vec row)]
    ;; contains? on a vector is an index-range check; it is what makes the
    ;; function skip rows with little or no data.
    (if (contains? cells url-field)
      (update cells url-field (fnil #(str/replace % #"(.{450}).*" "$1") ""))
      cells)))

(defn scrub
  "Scrubs problematic data from an integral feed file, reading `in-item`
  and writing `out-item` through kixipipe's csv scrubber.

  Each row goes through, in order: discard-bad-ids,
  scrub-bad-substitutions, scrub-urls and finally scrub-null."
  [in-item out-item]
  (let [clean-row (fn [row]
                    (-> row
                        discard-bad-ids
                        scrub-bad-substitutions
                        scrub-urls
                        scrub-null))]
    (scrub/scrub-csv in-item out-item (scrub/each-row clean-row))))
(defn- apply-name-overrides
  "Returns `item` with its :feed-name looked up in REV_FEED_NAME_OVERRIDES.
  A name that has no override is kept as it is."
  [item]
  (update item :feed-name #(get REV_FEED_NAME_OVERRIDES % %)))

(declare download-integral-item!)
;; A single downloadable file of the integral feed.
;;
;; Besides the declared fields, instances built in this namespace carry
;; extra keys (for example :date, :encoding and :delimiter) that are
;; stored as ordinary map entries on the record.
(defrecord IntegralFeedItem [src-name feed-name dir filename checksum metadata submission-time]
  kixi/FeedItem
  (download! [this session]
    ;; Note the argument order flips: the helper takes the session first.
    (download-integral-item! session this))
  io/Coercions
  ;; The item's file is `filename` resolved against `dir`.
  (as-file [item] (io/file (:dir item) (:filename item)))
  ;; The Coercions contract is to return a java.net.URL; delegate to the
  ;; java.io.File coercion instead of handing back a java.net.URI.
  (as-url [item] (io/as-url (io/as-file item))))

(declare integral-feed-details)
;; Session component for the integral feed.
;;
;; start and stop only print a message and return the session unchanged.
;; feed-details delegates to integral-feed-details.
(defrecord IntegralSession [config]
  component/Lifecycle
  (start [session]
    (println "Starting Integral Session.")
    session)
  (stop [session]
    (println "Stopping Integral Session.")
    session)
  kixi/FeedSession
  (feed-details [session feed-name options]
    (integral-feed-details session feed-name options)))

(defn- valid?
  "Parses `date` with integral-date-formatter.

  Returns the parsed value (truthy) on success, or nil when parsing raises
  an IllegalArgumentException. Callers use the return value as the parsed
  date, not only as a flag."
  [date]
  (try
    (tf/parse integral-date-formatter date)
    (catch IllegalArgumentException _
      nil)))

(defn- md5-value
  "Fetches `<dir>/<filename>.md5` over `session` as a string and returns
  its first whitespace-delimited token."
  [session dir filename]
  (let [md5-path (str dir "/" filename ".md5")
        contents (sftp/sftp-get-as-string session md5-path)]
    (-> contents
        (str/split #"\s+")
        first)))

;; Schema for the configuration map accepted by mk-session.
;;   :download-dir - must satisfy ioplus/exists-as-dir?
;;   :host, :user  - strings
;;   :ssh-key      - must satisfy ioplus/exists-as-file?
;;   :dir          - optional string
;; Any other keyword key is allowed as long as its value is a string.
(def ^:private Config {:download-dir (s/pred ioplus/exists-as-dir? "<a directory>")
                       :host    String
                       :user    String
                       :ssh-key (s/pred ioplus/exists-as-file?)
                       (s/optional-key :dir) String
                       s/Keyword String})

(defn mk-session
  "Builds an IntegralSession from `config`.

  The config is validated against the Config schema first, so an invalid
  map makes schema's validate throw. The config entries become entries of
  the session record itself."
  [{:keys [host user ssh-key dir download-dir] :as config}]
  (->> config
       (s/validate Config)
       (map->IntegralSession)))

(defn- filename-regex
  "Matches a whole file name of the form `<name>_<d>_<m>_<yyyy>.dat.csv`,
  where the two leading date fields have one or two digits each.

  Returns `[full-match name date-part]` on success, nil otherwise.
  Both dots of the extension are matched literally."
  [f]
  (re-matches #"(\w+)_(\d{1,2}_\d{1,2}_\d{4})\.dat\.csv" f))

(defn- filename->item
  "Turns the file name `f` into a map with :src-name, :filename,
  :feed-name (after reverse name overrides) and :date. Returns nil when
  `f` does not match filename-regex.

  NOTE(review): the date captured by filename-regex has the shape
  `d_m_yyyy`, which does not look parseable by the yyyyMMdd formatter
  used here - confirm the intended format before relying on this."
  [f]
  (when-let [[_ raw-name date-part] (filename-regex f)]
    (let [feed (get REV_FEED_NAME_OVERRIDES raw-name raw-name)
          date (tf/parse integral-date-formatter date-part)]
      {:src-name SRC_NAME
       :filename f
       :feed-name feed
       :date date})))

(defn- reverse-sort-by-timestamp
  "Sorts `xs` by `(-> x :metadata :timestamp)`, newest (largest) first.

  Uses `compare` with swapped arguments rather than a reflective
  `.compareTo` call, so items without a timestamp no longer raise a
  NullPointerException (they sort last) and mixed numeric types compare
  correctly."
  [xs]
  (sort-by (comp :timestamp :metadata)
           (fn [x y] (compare y x))
           xs))

(defn- feed-item-regex
  "Matches a whole file name of the form `<4 digits>_<8 digits>_<2 digits>.dat.gz`.

  Returns `[full-match four-digits eight-digits two-digits]` on success,
  nil otherwise. Both dots of the extension are matched literally, so a
  name such as `..._01xdatxgz` is rejected."
  [filename]
  (re-matches #"(\d{4})_(\d{8})_(\d{2})\.dat\.gz" filename))

(defn- ->integral-feed-item
  "Returns a function that converts one remote file entry (a map with
  :dir, :filename and :attrs) into an IntegralFeedItem.

  The returned function yields nil when the file name does not match
  feed-item-regex or when its 8-digit date part fails to parse. For a
  match, the checksum is read through md5-value and the computed fields
  are merged over `options`, so they win over any option of the same key."
  [session options]
  (fn [{:keys [dir filename attrs]}]
    (when-let [[matched-name feed-id date-str] (feed-item-regex filename)]
      (when-let [parsed-date (valid? date-str)]
        (let [computed {:src-name SRC_NAME
                        :encoding :gzip
                        :feed-name feed-id
                        :date parsed-date
                        :dir dir
                        :checksum (md5-value session dir matched-name)
                        :filename matched-name
                        :delimiter \u0001
                        :metadata {:timestamp (:mtime attrs)}}]
          (map->IntegralFeedItem (merge options computed)))))))

(defn- integral-feed-details
  "Lists the feed items found under the session's :dir, newest first.

  Every entry of the sftp file seq except the first is offered to
  ->integral-feed-item; entries it rejects are dropped. When `options`
  carries a :date, only items whose :date equals it are returned.
  `feed-name` is only logged here; it is not used to filter the result."
  [session feed-name options]
  (let [wanted-date (:date options)
        remote-dir  (:dir session)]
    (log/info "Retrieving details for feed " feed-name " with options " options)
    (let [items (->> (sftp/sftp-file-seq session remote-dir)
                     rest
                     (keep (->integral-feed-item session options))
                     reverse-sort-by-timestamp)]
      (if wanted-date
        (filter #(= wanted-date (:date %)) items)
        items))))

(defn- download-integral-item!
  "Downloads the file described by `item` over `session`.

  The item is logged, its :feed-name is passed through
  apply-name-overrides, and the result is handed to sftp-get-to-file,
  whose return value is returned.

  Can be mapped over the results of feed-details:

  ```
  (doall (map (partial download-integral-item! session) (feed-details ...)))
  ```"
  [session item]
  (log/info "Downloading " item)
  (let [renamed-item (apply-name-overrides item)]
    (sftp/sftp-get-to-file session renamed-item)))
