(ns csv-export-bolt.storm
  (:require [backtype.storm
             [clojure :refer :all]
             [log :refer :all]]
            [archive-bolt.backends.core :refer [store]]
            [clojure.string :as s]
            [clojure.java.io :as io]
            [csv-export-bolt.fields :refer [csv-export-output-fields]])
  (:import [backtype.storm Constants]))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defn dissoc-in
  "Removes the entry found by following the key path [k & ks] through the
   nested associative structure m and returns the updated structure.

   Only the last key of the path is dissoc'ed; the containers along the
   path are kept (even when they end up empty).

   If a key along the path is missing, m is returned unchanged instead
   of throwing or inserting a nil entry for the missing key."
  [m [k & ks]]
  (cond
    ;; Last key of the path: remove it from this level.
    (not ks) (dissoc m k)
    ;; Only descend when the key is really there, so no spurious
    ;; `k -> nil` entry is created and nil is never called as a function.
    (contains? m k) (assoc m k (dissoc-in (get m k) ks))
    :else m))

(defn hive-escape
  "Returns the string form of raw with special characters backslash-escaped.

   raw is converted with `str` first, so nil becomes the empty string and
   non-string values are escaped by their string representation.

   Escaped characters: comma, semicolon, double quote and single quote are
   prefixed with a backslash; newline, carriage return and tab are replaced
   by the two-character sequences \\n, \\r and \\t."
  [raw]
  ;; NOTE(review): a literal backslash in the input is passed through
  ;; unescaped - confirm this matches the escaping expected downstream.
  (s/escape (str raw) {\, "\\,"
                       \newline "\\n"
                       \return "\\r"
                       \tab "\\t"
                       \; "\\;"
                       \" "\\\""
                       \' "\\'"}))

(defn v->hive-vec
  "Builds one csv line from v: every item is passed through hive-escape,
   the results are joined with commas and a trailing newline is appended
   (the line is meant to be written to a file)."
  [v]
  (let [line (->> v
                  (map hive-escape)
                  (s/join ","))]
    (str line \newline)))

(defn coll->csv
  "Converts coll to a csv string.

   Args:
   - coll: when b is false, a vector representing a single line; when b is
     true, a vector of vectors representing multiple lines.
   - b: batch flag. It is coerced with `boolean`, so a boxed
     java.lang.Boolean is judged by its value rather than by identity.

   Returns the concatenation of the lines produced by v->hive-vec."
  [coll b]
  ;; A `(Boolean. false)` instance (e.g. one produced by deserialization)
  ;; is truthy in a plain `if`; `boolean` unboxes it to a real false.
  (let [batch? (boolean b)
        rows (if batch? coll [coll])] ;; wrap a single line so it maps like a batch
    (s/join (map v->hive-vec rows))))

(defn gen-tmp-file
  "Creates a new, empty temporary file and returns its absolute path as a
   string.

   Args:
   - prefix: file name prefix; should have a trailing underscore.
     java.io.File/createTempFile requires at least three characters.
   - suffix: file name suffix, e.g. \".csv\".
   - root-path: optional directory to create the file in. When omitted or
     nil, the JVM default temp directory is used. This is useful for
     storing temp files on another device or in a secure location.

   Examples:
   (gen-tmp-file \"retweets_\" \".csv\")
   (gen-tmp-file \"retweets_\" \".csv\" \"/tmp/files/here\")"
  ([prefix suffix]
   (gen-tmp-file prefix suffix nil))
  ([prefix suffix root-path]
   (let [;; A nil directory tells createTempFile to use the default temp dir.
         directory (when root-path (java.io.File. root-path))
         tmp-file (java.io.File/createTempFile prefix suffix directory)]
     (.getAbsolutePath tmp-file))))

(defn get-or-create-file
  "Looks up partition-key in tmp-file-path, a map of partition key to
   temp file path, and returns a vector [file-path created?].

   When a path is already present it is returned with created? false.
   Otherwise a new temp file is generated under the directory configured
   in conf as \"CSV_EXPORT_TEMP_FILE_PATH\" and returned with created? true.
   This function does not record the new path anywhere; the caller is
   responsible for storing it."
  [tmp-file-path partition-key conf]
  (let [known-path (get tmp-file-path partition-key)]
    (if known-path
      [known-path false]
      [(gen-tmp-file "batches_" ".csv" (get conf "CSV_EXPORT_TEMP_FILE_PATH"))
       true])))

(defn tick-tuple?
  "Returns true when the source stream id of tuple equals
   Constants/SYSTEM_TICK_STREAM_ID."
  [tuple]
  (let [stream-id (.getSourceStreamId tuple)]
    (= Constants/SYSTEM_TICK_STREAM_ID stream-id)))

(defn generate-unique-file-path
  "Builds a file path string of the form <partition-key>/<random-uuid>.csv."
  [partition-key]
  (let [uuid (java.util.UUID/randomUUID)]
    (format "%s/%s.csv" partition-key (str uuid))))

(defn flush-tmp-files
  "Flushes every temp file listed in files-hm, a map of partition key to
   file path.

   For each entry the file is read in full. Non-empty content is either
   stored through storage-backend (when one is given) under a freshly
   generated unique path, or emitted as [partition-key csv-content]
   anchored to tuple. The temp file is deleted afterwards, whether or not
   it had content."
  [conf collector tuple files-hm storage-backend]
  (doseq [[partition-key file-path] files-hm]
    (log-message "Flushing temp file: " partition-key)
    (let [csv-content (slurp file-path)]
      (when (seq csv-content)
        (if-not storage-backend
          ;; No backend configured: hand the content downstream.
          (emit-bolt! collector [partition-key csv-content] :anchor tuple)
          ;; Backend configured: persist the content directly.
          (store storage-backend
                 conf
                 (generate-unique-file-path partition-key)
                 csv-content))))
    ;; The temp file has been consumed (or was empty); remove it.
    (log-message "Deleting temp file: " partition-key)
    (io/delete-file file-path)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Bolt
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Take [partition-key coll]
;; serializes to csv in Hive format.
;; Accumulates results in temporary files based on partition-key
;; Emits the csv-content upon receiving a tick from a time interval spout

;; Args:
;; - batch: a boxed Java boolean (must be converted!) of whether this
;;   is a single or batch write

;; Notes:
;; This bolt can take a tick tuple or a 'content' tuple
;; We can assume that the IO is safe as each bolt is a
;; separate process and will not run more than 1 tuple at a
;; time with the given state

(defn csv-export-body
  "Builds the bolt implementation. Each content tuple has its csv
   serialization appended to the temp file of its partition key; a tick
   tuple flushes all accumulated temp files and clears the bookkeeping.
   Every tuple is acked after it has been handled."
  [conf context collector batch storage-backend]
  ;; Map of partition key -> temp file path for files not yet flushed.
  (let [paths-by-partition (atom {})]
    (bolt
     (execute
      [tuple]
      (if-not (tick-tuple? tuple)
        ;; Content tuple: accumulate its rows in the partition's temp file.
        (let [{:keys [partition-key coll]} tuple
              [file-path created?] (get-or-create-file @paths-by-partition
                                                       partition-key
                                                       conf)
              csv-content (coll->csv coll batch)]
          (when created?
            ;; Remember the new file so later tuples reuse it.
            (swap! paths-by-partition assoc partition-key file-path))
          (log-message "Appending to file " file-path)
          (spit file-path csv-content :append true))
        ;; Tick tuple: flush everything accumulated so far and start over.
        (do
          (flush-tmp-files conf collector tuple @paths-by-partition storage-backend)
          (reset! paths-by-partition {})))
      (ack! collector tuple)))))

(defmacro defcsvexport
  "Defines a csv-export bolt named var-name whose tick tuple frequency
   (in seconds) is set through the bolt's conf.

   Options (keyword arguments):
   - batch: Boolean, true when incoming data is a collection of rows and
     false when it is a single row. Defaults to true.
   - frequency: Seconds between flushes of the temp files. Defaults to 300.
   - storage-backend: Name of a storage backend supported by
     com.shareablee/archive-bolt. When given, accumulated data is stored
     directly instead of being emitted. This is the preferred way of
     accumulating large amounts of data to avoid OOM errors."
  [var-name & {:keys [batch frequency storage-backend]
               :or {batch true
                    frequency 300
                    storage-backend nil}}]
  ;; Build the conf map form up front; it is spliced into the expansion
  ;; as a map literal carrying the (unevaluated) frequency form.
  (let [bolt-conf {"topology.tick.tuple.freq.secs" frequency}]
    `(defbolt ~var-name csv-export-output-fields
       {:prepare true
        :conf ~bolt-conf}
       [conf# context# collector#]
       (csv-export-body conf# context# collector# ~batch ~storage-backend))))

;; To maintain backwards compatibility, define a default bolt named
;; csv-export: batch mode, 300 second tick tuple frequency, and no
;; storage backend given.
(defcsvexport csv-export :batch true :frequency 300)

