(ns kixipipe.data.scrub
  "Various scrubbing routines"
  (:require [clojure.data.csv     :as csv]
            [clojure.java.io      :as io]
            [clojure.string       :as str]
            [kixipipe.digest      :as digest]
            [kixipipe.ioplus      :as ioplus]
            [kixipipe.data.mutate :as mutate]))

(defn each-cell
  "Lifts a single-cell function f into a row function: the returned
   function takes a row (a sequence of cells) and yields the lazy
   sequence obtained by applying f to every cell. Intended to be handed
   to scrub-csv as its scrub-row-fn, e.g.
   (scrub-csv in-item out-item (each-cell (comp scrub-null scrub-zero-dot)))"
  [f]
  (fn [row]
    (map f row)))

(defn each-row
  "Wraps f as a one-argument row function: the returned function calls
   f with the whole row and returns whatever f returns."
  [f]
  (fn scrub-row [cells]
    (f cells)))

(defn scrub-null
  "Returns the empty string when s consists solely of the word 'null'
   (any letter case, optionally surrounded by whitespace); any other
   string is returned unchanged."
  [s]
  (let [null-only #"(?i)^\s*null\s*$"]
    (str/replace s null-only "")))

(defn scrub-zero-dot
  "Normalises a cell that holds just '0.' (optionally surrounded by
   whitespace) to '0'. Every other string is returned unchanged."
  [s]
  (let [bare-zero-dot #"^\s*0\.\s*$"]
    (str/replace s bare-zero-dot "0")))

;; TODO possible loss of precision here: "%f" keeps only six decimal places.
(defn scrub-engineering
  "Replaces engineering notation (8.0E-4) with the double equivalent.

   s is rewritten only when the whole string (ignoring surrounding
   whitespace) looks like <digits>.<digits>E<digits>, with an optional
   sign on the mantissa and on the exponent, e.g. '8.0E-4', '-8.0E-4'
   or '1.5e+5'. Anything else is returned unchanged.

   The result is rendered with the \"%f\" format, i.e. with six digits
   after the decimal point, so values smaller than that are rounded."
  [s]
  (if (re-matches #"\s*[-+]?\d+\.\d+[eE][-+]?\d+\s*" s)
    ;; Format with Locale/ROOT rather than the JVM default locale: the
    ;; default may use ',' as the decimal separator, which would corrupt
    ;; numeric cells in the scrubbed output.
    (String/format java.util.Locale/ROOT
                   "%f"
                   (object-array [(Double/valueOf s)]))
    s))

(defn- do-scrub
  "Parses delimited rows from the reader `in` using `sep` as separator,
   passes each row through scrub-row-fn, discards rows for which it
   returns nil, and writes the remaining rows to the writer `out` with
   the same separator."
  [scrub-row-fn in out sep]
  (let [rows     (csv/read-csv in :separator sep)
        scrubbed (keep scrub-row-fn rows)]
    (csv/write-csv out scrubbed :separator sep)))

(defn scrub-csv
  "Scrubs the delimited file described by in-item into the file
   described by out-item.

   in-item must carry :feed-name and :filename (asserted); the input is
   read from its :dir/:filename and wrapped in a gzip input stream when
   mutate/gzip-item? is truthy for it. Cells are split on its :delimiter,
   which defaults to a tab when the key is absent.

   scrub-row-fn is invoked with each row (the cells of that row); when
   it returns nil the row does not appear in the output.

   The output goes to :dir/:filename of out-item (the directory is
   created first), through the stream from digest/md5-output-stream and,
   when mutate/gzip-item? is truthy for out-item, a gzip output stream.

   Returns the result of mutate/update-checksums applied to out-item,
   the :checksum of in-item and the digest stream."
  [in-item out-item scrub-row-fn]
  (let [{:keys [feed-name dir filename checksum delimiter]
         :or   {delimiter \tab}} in-item
        out-dir (:dir out-item)
        dest    (io/file out-dir (:filename out-item))]

    (assert feed-name "Must supply a feed-name")
    (assert filename "Must supply a filename")

    (ioplus/mkdirs out-dir)

    (with-open [in      (io/reader
                         (cond-> (io/file dir filename)
                           (mutate/gzip-item? in-item) (ioplus/gzip-input-stream)))
                out-md5 (digest/md5-output-stream dest)]
      ;; The writer lives in its own with-open so that it is closed
      ;; before update-checksums is called on the digest stream.
      (with-open [out (io/writer
                       (cond-> out-md5
                         (mutate/gzip-item? out-item) (ioplus/gzip-output-stream)))]
        (do-scrub scrub-row-fn in out delimiter))
      (mutate/update-checksums out-item checksum out-md5))))
