; Copyright (c) Sławek Gwizdowski
;
; Permission is hereby granted, free of charge, to any person obtaining
; a copy of this software and associated documentation files (the "Software"),
; to deal in the Software without restriction, including without limitation
; the rights to use, copy, modify, merge, publish, distribute, sublicense,
; and/or sell copies of the Software, and to permit persons to whom the
; Software is furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included
; in all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
; OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
; IN THE SOFTWARE.
;
(ns ^{:author "Sławek Gwizdowski"
      :doc "Essbase BSO export.

* Space separated.
* Quoted member names, non-quoted values.
* Max 2GB size.
* COLUMNS are specified in first line of the file:
  - List of quoted member names of single dense dimension, last field is empty.
  - No members from this dimension ever appear in the file again.
  - N members are specified, up to these many figures can appear in data lines.
* POV lines appear periodically, those signal complete POV update:
  - List of quoted member names from distinct sparse dimensions.
* DATA line consists of both partial POV updates and figures:
  - Quoted members of remaining dense dimensions (not present in POV lines).
  - Figures are non-quoted #Mi and numeric values, up to N occurrences per line.
  - Last field is always empty string, so last figure is followed by space.
  - Missing values from the left are marked as #Mi, from the right - skipped.

To parse the export file you need to know two basic things:

* Number of data storing dimensions in the cube.
* Complete mapping of member name to dimension name in those dimensions.
"}
 szew.essbase.bso
  (:gen-class)
  (:require [szew.io :as io]
            [clojure.java.io :as clj.io :refer [reader]]
            [clojure-csv.core :as csv])
  (:import [java.io BufferedReader]))

(defn data-field?
  "Take field, see if it's a possible Essbase value (empty, #Mi or number).

  Returns true for the empty string, for the #Mi missing-value marker and for
  anything java.lang.Double/parseDouble accepts; false otherwise.

  Anything that is not a string (nil included) is never a value, so false is
  returned instead of letting parseDouble throw NullPointerException, which
  the NumberFormatException handler would not catch.

  NOTE: parseDouble also accepts spellings like NaN, Infinity or 1d, so
  members with such names are indistinguishable from figures here.
  "
  [field]
  (cond
    ;; nil shows up when a caller asks about a field that does not exist.
    (not (string? field))         false
    (contains? #{"" "#Mi"} field) true
    :else
    (try
      (Double/parseDouble ^String field)
      true
      (catch NumberFormatException _ false))))

(defn data-line?
  "Check if last field is empty and penultimate field is a data-field.

  Takes a parsed row (sequence of string fields). Rows with fewer than two
  fields, e.g. the single empty field produced by a blank line, have no
  penultimate field and are therefore not data lines; false is returned
  for them instead of failing.

  The row is viewed as a vector first, so end access is cheap and behaves
  the same for vectors and other sequential inputs.
  "
  [row]
  (let [fields (vec row)
        n      (count fields)]
    (and (> n 1)
         (= "" (peek fields))
         (data-field? (nth fields (- n 2))))))

;; Input spec that feeds `processor` every member name occurrence of a file.
;;
;; * processor -- function of one argument, the seq of member names.
;; * dim-count -- number of data storing dimensions, must be set.
;; * encoding  -- encoding handed to the reader.
;;
;; The processor is called while the reader is still open, so it has to
;; consume everything it needs before it returns.
(defrecord Members [processor dim-count encoding]
  io/Input
  (io/in! [spec source]
    (when (nil? dim-count)
      (throw (ex-info "Set dim-count first!" {:members spec})))
    (letfn [(rip [block]
              ;; block is [pov-rows data-rows]; members are the first POV row
              ;; plus the leading `nip` fields of every data row.
              (let [pov  (first (first block))
                    data (second block)
                    ;; One dimension lives in the columns line, POV covers
                    ;; (count pov) more, the rest lead each data row.
                    nip  (- (dec dim-count) (count pov))
                    nipr (comp vec (partial take nip))]
                (into (vec pov) (mapcat nipr data))))
            (splice [rows]
              (let [header  (vec (first rows))
                    ;; Drop the trailing empty field of the columns line.
                    ;; An empty source has no columns line at all and pop
                    ;; throws on an empty vector, so guard it.
                    columns (if (seq header) (pop header) header)
                    blocks  (->> rows
                                 rest
                                 ;; Alternating runs of POV and data rows...
                                 (partition-by data-line?)
                                 ;; ...paired up; nil pad keeps a trailing
                                 ;; run that has no partner.
                                 (partition 2 2 nil))]
                (concat columns (mapcat rip blocks))))]
      (io! "Reading files here!"
           (with-open [^BufferedReader r (reader source :encoding encoding)]
             (processor (splice (csv/parse-csv r :delimiter \space))))))))

(defn members
  "Builds a Members input spec, optionally overriding defaults with spec.

  The processor of the spec is handed one member name for each occurrence
  in the file. Out of the box it collects them into a hash-set.

  Figures are left out, as long as dim-count is right, so only member names
  reach the processor.

  Defaults: set-building processor, no dim-count, UTF-8 encoding.
  "
  ([]
   (new Members (partial into #{}) nil "UTF-8"))
  ([spec]
   (let [defaults (members)]
     (reduce conj defaults spec))))

;; Input spec that feeds `processor` one record per data line of a file.
;;
;; * processor -- function of one argument, the seq of records.
;; * m->d      -- member name to dimension name lookup, called as a function.
;; * dim-count -- number of data storing dimensions, must be set.
;; * encoding  -- encoding handed to the reader.
;;
;; A record is [{dimension member} {column value}] with metadata holding
;; :pov-line and :data-line (zero-based row indexes), :source, :pov and :data.
;;
;; The processor is called while the reader is still open, so it has to
;; consume everything it needs before it returns.
(defrecord Records [processor m->d dim-count encoding]
  io/Input
  (io/in! [spec source]
    (when (nil? dim-count)
      (throw (ex-info "Set dim-count first!" {:members spec})))
    (letfn [(ok? [record]
              ;; A nil key means m->d did not know some member of the record.
              (if (contains? (first record) nil)
                (-> (format "Unknown dimension of: %s" (get (first record) nil))
                    (ex-info {:record    record
                              :pov-line  (-> record meta :pov-line)
                              :data-line (-> record meta :data-line)})
                    (throw))
                record))
            (rip [columns block]
              ;; block is [pov-rows data-rows], each row being [index fields].
              ;; Only the first POV row of the run is used.
              (let [pov-line (first (first (first block)))
                    pov      (second (first (first block)))
                    datas    (second block)
                    ;; One dimension lives in the columns line, POV covers
                    ;; (count pov) more, the rest lead each data row.
                    nip      (- (dec dim-count) (count pov))
                    nipr     (comp vec (partial take nip))
                    tuckr    (comp vec (partial drop nip))]
                (for [data datas]
                  (with-meta [(->> (concat pov (nipr (second data)))
                                   (mapv (juxt m->d identity))
                                   (into (hash-map)))
                              ;; butlast drops the trailing empty field;
                              ;; pairing stops at the shorter of columns
                              ;; and figures.
                              (->> (butlast (tuckr (second data)))
                                   (mapv vector columns)
                                   (into (hash-map)))]
                             {:pov-line  pov-line
                              :data-line (first data)
                              :source    source
                              :pov       pov
                              :data      (second data)}))))
            (splice [rows]
              (let [header  (vec (first rows))
                    ;; Drop the trailing empty field of the columns line.
                    ;; An empty source has no columns line at all and pop
                    ;; throws on an empty vector, so guard it.
                    columns (if (seq header) (pop header) header)
                    blocks  (->> rows
                                 ;; Number the rows, columns line being 0.
                                 (map vector (range))
                                 rest
                                 (partition-by (comp data-line? second))
                                 ;; nil pad keeps a trailing unpaired run.
                                 (partition 2 2 nil))]
                (map ok? (mapcat rip (repeat columns) blocks))))]
      (io! "Reading files here!"
           (with-open [^BufferedReader r (reader source :encoding encoding)]
             (processor (splice (csv/parse-csv r :delimiter \space))))))))

(defn records
  "Builds a Records input spec, optionally overriding defaults with spec.

  The processor of the spec is handed one data point for each data-line in
  the file, as a seq of [{dimension member} {column value}].

  Needs a complete member to dimension mapping in m->d and the correct count
  of data storing dimensions in dim-count.

  Out of the box the processor gathers the data points into a vector.

  An ex-info is thrown when any member maps to nil.

  Defaults: vec processor, empty m->d, no dim-count, UTF-8 encoding.
  "
  ([]
   (new Records vec {} nil "UTF-8"))
  ([spec]
   (let [defaults (records)]
     (reduce conj defaults spec))))

;; Input spec that feeds `processor` one cell per figure of a file.
;;
;; Reading is delegated to a Records spec built from this one; each record
;; [{dimension member} {column value}] is expanded into one
;; [{dimension member} value] cell per column, carrying the record's metadata.
;;
;; A column member that m->d maps to nil raises ex-info, same as any other
;; unmapped member, instead of being stored under a nil dimension.
(defrecord Cells [processor m->d dim-count encoding]
  io/Input
  (io/in! [spec source]
    (letfn [(split [record]
              (for [[column value] (second record)]
                (let [dimension (m->d column)]
                  (when (nil? dimension)
                    (-> (format "Unknown dimension of: %s" column)
                        (ex-info {:record    record
                                  :column    column
                                  :pov-line  (-> record meta :pov-line)
                                  :data-line (-> record meta :data-line)})
                        (throw)))
                  (with-meta [(assoc (first record) dimension column) value]
                    (meta record)))))
            (splitter [records]
              ;; Runs as the Records processor, so still inside the open
              ;; reader; the user processor sees the lazily expanded cells.
              (processor (mapcat split records)))]
      (io/in! (records (assoc spec :processor splitter)) source))))

(defn cells
  "Builds a Cells input spec, optionally overriding defaults with spec.

  The processor of the spec is handed one data point for each cell in the
  file, as a seq of [{dimension member} value].

  Needs a complete member to dimension mapping in m->d and the correct count
  of data storing dimensions in dim-count.

  Out of the box the processor gathers the cells into a vector.

  An ex-info is thrown when any member maps to nil.

  Defaults: vec processor, empty m->d, no dim-count, UTF-8 encoding.
  "
  ([]
   (new Cells vec {} nil "UTF-8"))
  ([spec]
   (let [defaults (cells)]
     (reduce conj defaults spec))))

;; Helper functions -- Members

(defn sniff-unknown
  "Given member to dimension lookup m->d, returns a processor for Members.

  That processor takes the seq of member names and returns a seq of the
  distinct ones that m->d maps to nil, in order of first appearance.

  Input is consumed eagerly before anything is returned.
  "
  [m->d]
  (letfn [(unmapped? [candidate]
            (nil? (m->d candidate)))
          (keep-unmapped [found candidate]
            (if (unmapped? candidate)
              (conj found candidate)
              found))]
    (fn distinct-unknown [names]
      (let [unknown (reduce keep-unmapped [] names)]
        (distinct unknown)))))

;; Helper functions -- Cells

(defn sniff-dimensions
  "Processor for Cells: returns the set of dimensions used by the first cell.

  Takes the seq of [{dimension member} value] cells and looks only at the
  keys of the first one. Yields an empty set when there are no cells.
  "
  [cells]
  (let [coordinates (ffirst cells)]
    (set (keys coordinates))))

(defn dump->tsv
  "Reads dump files and writes them out as one row-expanded TSV at out-path.

  Each cell becomes one row: its member for every dimension listed in order,
  followed by the cell value.

  Needs a complete member to dimension mapping in m->d and the complete list
  of dimension names in order; the length of order is used as dim-count.

  The first input goes through a sink made with default TSV settings, any
  further inputs through a sink made with :append set to true.
  "
  [m->d order out-path in-path & in-paths]
  (let [dim-count (count order)
        ->line    (fn [cell]
                    (let [coordinates (first cell)]
                      (conj (mapv coordinates order) (last cell))))
        ;; Cells spec whose processor pushes rows into the given sink.
        pour-into (fn [sink]
                    (cells {:dim-count dim-count
                            :processor (comp sink (partial map ->line))
                            :m->d      m->d}))
        starter   (pour-into (io/sink (io/tsv) out-path))]
    (io/in! starter in-path)
    (when (seq in-paths)
      (let [appender (pour-into (io/sink (io/tsv {:append true}) out-path))]
        (doseq [extra-path in-paths]
          (io/in! appender extra-path))))))
