; Copyright (c) Sławek Gwizdowski
;
; Permission is hereby granted, free of charge, to any person obtaining
; a copy of this software and associated documentation files (the "Software"),
; to deal in the Software without restriction, including without limitation
; the rights to use, copy, modify, merge, publish, distribute, sublicense,
; and/or sell copies of the Software, and to permit persons to whom the
; Software is furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included
; in all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
; OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
; IN THE SOFTWARE.
;
(ns ^{:author "Sławek Gwizdowski"
      :doc "Essbase columns export.

* Space separated.
* Quoted member names, non-quoted values.
* Max 2GB size.
* COLUMNS are specified in first line of the file:
  - List of quoted member names of single dense dimension, last field is empty.
  - No members from this dimension ever appear in the file again.
  - N members are specified, up to these many figures can appear in data lines.
* DATA line consists of both full POV updates and figures:
  - Quoted members of non-columns dimensions followed by figures.
  - Dimension order in the data lines should always be the same.
  - Figures are non-quoted #Mi and numeric values, up to N occurrences per line.
  - Last field is always empty string, so last figure is followed by space.
  - Missing values from the left are marked as #Mi, from the right - skipped.

To parse the export file you need to know two basic things:

* Number of data storing dimensions in the cube.
* Complete mapping of member name to dimension name in those dimensions.
"}
 szew.essbase.cols
  (:gen-class)
  (:require [szew.io :as io]
            [clojure.java.io :as clj.io :refer [reader]]
            [clojure-csv.core :as csv])
  (:import [java.io BufferedReader]))

;; Input spec that feeds every member name found in a columns export to
;; `processor`.
;;
;; Fields:
;;   processor - function called once with a seq of member names: the header
;;               (columns) members first, then the leading members of each
;;               data line. It runs while the reader is still open, so it
;;               should consume the seq before returning (parse-csv is
;;               presumably lazy -- confirm against clojure-csv).
;;   dim-count - number of data storing dimensions; must be set, otherwise
;;               in! throws ex-info.
;;   encoding  - encoding passed to the reader.
;;
;; An input with no rows hands an empty seq to `processor` instead of failing
;; on the missing header line.
(defrecord Members [processor dim-count encoding]
  io/Input
  (io/in! [spec source]
    (when (nil? dim-count)
      (throw (ex-info "Set dim-count first!" {:members spec})))
    (letfn [(rip [line]
              ;; Only the first (dec dim-count) fields of a data line are
              ;; kept; whatever follows them (the figures) is discarded.
              (vec (take (dec dim-count) line)))
            (splice [rows]
              ;; The header's last field is dropped with pop. When there is
              ;; no header at all (no rows), fall back to no columns rather
              ;; than popping an empty vector, which would throw.
              (let [columns (if-let [header (first rows)]
                              (pop (vec header))
                              [])
                    lines   (rest rows)]
                (concat columns (mapcat rip lines))))]
      ;; io! makes this throw if it is ever run inside a transaction.
      (io! "Reading files here!"
           (with-open [^BufferedReader r (reader source :encoding encoding)]
             (processor (splice (csv/parse-csv r :delimiter \space))))))))

(defn members
  "Builds a Members input spec.

  The processor receives one member name per occurrence in the file; the
  default one collects them into a hash-set.

  Figures are dropped as long as dim-count is correct, so only members reach
  the processor.

  Without arguments returns the defaults (no dim-count, UTF-8 encoding);
  with spec, its entries are laid over those defaults.
  "
  ([]
   (->Members (fn [names] (into (hash-set) names)) nil "UTF-8"))
  ([spec]
   (reduce conj (members) spec)))

;; Input spec that turns every data line of a columns export into a record
;; [{dimension member} {column value}] and hands the seq of records to
;; `processor`.
;;
;; Fields:
;;   processor - function called once with the seq of records. It runs while
;;               the reader is still open, so it should consume the seq
;;               before returning (parse-csv is presumably lazy -- confirm
;;               against clojure-csv).
;;   m->d      - member name -> dimension name lookup, called as a function.
;;   dim-count - number of data storing dimensions; must be set, otherwise
;;               in! throws ex-info.
;;   encoding  - encoding passed to the reader.
;;
;; Each record carries metadata: :data-line (zero-based row index, the header
;; being row 0), :line ([index fields]) and :source.
;;
;; Throws ex-info when a leading member of a data line maps to a nil
;; dimension. An input with no rows yields an empty seq of records instead of
;; failing on the missing header line.
(defrecord Records [processor m->d dim-count encoding]
  io/Input
  (io/in! [spec source]
    (when (nil? dim-count)
      (throw (ex-info "Set dim-count first!" {:members spec})))
    (letfn [(ok? [record]
              ;; A nil key in the dimension map means m->d did not know
              ;; one of the members on this line.
              (if (contains? (first record) nil)
                (-> (format "Unknown dimension of: %s" (get (first record) nil))
                    (ex-info {:record    record
                              :data-line (-> record meta :data-line)})
                    (throw))
                record))
            (rip [columns line]
              (let [line-no (first line)
                    data    (second line)
                    ;; The first (dec dim-count) fields are members, the
                    ;; remainder are figures.
                    nip     (dec dim-count)
                    nipr    (comp vec (partial take nip))
                    tuckr   (comp vec (partial drop nip))]
                (with-meta [(->> (nipr data)
                                 (mapv (juxt m->d identity))
                                 (into (hash-map)))
                            ;; Figures are paired with columns from the left;
                            ;; pairing stops at the shorter of the two.
                            (->> (tuckr data)
                                 (mapv vector columns)
                                 (into (hash-map)))]
                           {:data-line line-no
                            :line      line
                            :source    source})))
            (splice [rows]
              ;; The header's last field is dropped with pop. When there is
              ;; no header at all (no rows), fall back to no columns rather
              ;; than popping an empty vector, which would throw.
              (let [columns (if-let [header (first rows)]
                              (pop (vec header))
                              [])
                    ;; Drop the last field of every row, number the rows
                    ;; from 0 and skip the header row.
                    lines   (->> rows
                                 (map pop)
                                 (map vector (range))
                                 rest)]
                (map ok? (map rip (repeat columns) lines))))]
      ;; io! makes this throw if it is ever run inside a transaction.
      (io! "Reading files here!"
           (with-open [^BufferedReader r (reader source :encoding encoding)]
             (processor (splice (csv/parse-csv r :delimiter \space))))))))

(defn records
  "Builds a Records input spec: one data point per data-line in the file.

  Needs a complete member to dimension mapping in m->d and the correct number
  of data storing dimensions in dim-count.

  The processor is handed a seq of [{dimension member} {column value}]; the
  default one returns them as a vector.

  An ex-info is thrown if any member is mapped to nil.

  Without arguments returns the defaults (empty m->d, no dim-count, UTF-8
  encoding); with spec, its entries are laid over those defaults.
  "
  ([]
   (->Records vec {} nil "UTF-8"))
  ([spec]
   (reduce conj (records) spec)))

;; Input spec that explodes every record produced by Records into one cell
;; per column: [{dimension member} value], where the column member is added
;; to the dimension map under the dimension m->d gives for it.
;;
;; Fields have the same meaning as in Records; the actual reading is
;; delegated to a Records spec built from this one with the processor
;; swapped for the cell splitter. Each cell keeps the metadata of the record
;; it came from.
;;
;; Throws ex-info when a column member maps to a nil dimension, matching the
;; check Records applies to the leading members of each data line.
(defrecord Cells [processor m->d dim-count encoding]
  io/Input
  (io/in! [spec source]
    (letfn [(dimension-of [record column]
              ;; Column members are not seen by the Records nil check, so
              ;; they are validated here instead of ending up under a nil key.
              (let [dimension (m->d column)]
                (when (nil? dimension)
                  (throw (ex-info (format "Unknown dimension of: %s" column)
                                  {:record    record
                                   :data-line (-> record meta :data-line)})))
                dimension))
            (split [record]
              (for [[column value] (second record)]
                (with-meta [(assoc (first record)
                                   (dimension-of record column)
                                   column)
                            value]
                  (meta record))))
            (splitter [rows]
              (processor (mapcat split rows)))]
      (io/in! (records (assoc spec :processor splitter)) source))))

(defn cells
  "Builds a Cells input spec: one data point per cell in the file.

  Needs a complete member to dimension mapping in m->d and the correct number
  of data storing dimensions in dim-count.

  The processor is handed a seq of [{dimension member} value]; the default
  one returns the cells as a vector.

  An ex-info is thrown if any member is mapped to nil.

  Without arguments returns the defaults (empty m->d, no dim-count, UTF-8
  encoding); with spec, its entries are laid over those defaults.
  "
  ([]
   (->Cells vec {} nil "UTF-8"))
  ([spec]
   (reduce conj (cells) spec)))

;; Helper functions -- Cells

(defn sniff-dimensions
  "Cells processor: returns the set of dimension names found in the first
  cell. Gives an empty set when there are no cells.
  "
  [cells]
  (let [coordinates (ffirst cells)]
    (set (keys coordinates))))

(defn dump->tsv
  "Consolidates one or more dump files into a single, row-expanded TSV at
  out-path, one row per cell.

  Each row holds the cell's members in the order given by `order`, followed
  by the cell value. The first file is written through a plain TSV sink; any
  further files go through a sink created with {:append true}.

  Needs a complete member to dimension mapping in m->d and the complete list
  of dimension names in `order`; the length of that list is used as
  dim-count.
  "
  [m->d order out-path in-path & in-paths]
  (letfn [(cell->row [[coordinates value]]
            (conj (mapv coordinates order) value))
          (pump [tsv-spec]
            ;; One sink per spec, wrapped in a Cells spec that writes rows.
            (let [sink (io/sink tsv-spec out-path)]
              (cells {:dim-count (count order)
                      :processor (fn [batch] (sink (map cell->row batch)))
                      :m->d      m->d})))]
    (io/in! (pump (io/tsv)) in-path)
    (when (seq in-paths)
      (let [appending (pump (io/tsv {:append true}))]
        (doseq [path in-paths]
          (io/in! appending path))))))
