(ns witan.phyrexian.dclg-ingester
  (:require [clojure.string :as str]
            [clojure.set :as st]
            [schema.core :as s]
            [schema.utils :as su]
            [witan.phyrexian.utils :as u]
            [witan.phyrexian.gss-harmonizer :as gss]))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DCLG households data                          ;;
;;                                               ;;
;; Manual step before processing 2014 data:      ;;
;;   - Save sheet in Excel workbook as csv       ;;
;;                                               ;;
;; Numeric data without nils; 2014 = large files ;;
;; 2012 & 2014 versions have different columns   ;;
;; Household model uses 2014 version             ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Schema for one gathered record of the `:hh-repr-rates` dataset
;; (presumably "household representative rates" -- confirm naming).
;; ingest-dclg-2012 / ingest-dclg-2014 pass it as `schema-in`, and
;; format-dclg-data hands it to u/schema-coercion for every gathered row.
;; The value column is wrapped in s/maybe, so a nil rate is accepted.
(def HRRInputSchema
  {:gss-code s/Str
   :age-group s/Str
   :sex (s/enum "M" "F")
   :relationship s/Str
   :year s/Int
   :hh-repr-rates (s/maybe double)})

;; Keys passed as the third argument to u/write-ordered-csv when the
;; `:hh-repr-rates` output is written (see ingest-dclg-2012 / -2014).
;; Presumably this fixes the column order of the csv -- confirm against
;; u/write-ordered-csv.
(def HRROutputCols [:gss-code :age-group :sex
                    :year :relationship :hh-repr-rates])

;; Schema for one gathered record of the `:dclg-household-popn` dataset.
;; ingest-dclg-2012 / ingest-dclg-2014 pass it as `schema-in`, and
;; format-dclg-data hands it to u/schema-coercion for every gathered row.
;; The value column is wrapped in s/maybe, so a nil value is accepted.
(def HHpopnInputSchema
  {:gss-code s/Str
   :age-group s/Str
   :sex (s/enum "M" "F")
   :relationship s/Str
   :year s/Int
   :dclg-household-popn (s/maybe double)})

;; Keys passed as the third argument to u/write-ordered-csv when the
;; `:dclg-household-popn` output is written (see ingest-dclg-2012 / -2014).
;; Presumably this fixes the column order of the csv -- confirm against
;; u/write-ordered-csv.
(def HHpopnOutputCols
  [:gss-code :age-group :sex
   :year :relationship :dclg-household-popn])

;; Schema for one gathered record of the `:dclg-institutional-popn` dataset.
;; ingest-dclg-2012 / ingest-dclg-2014 pass it as `schema-in`, and
;; format-dclg-data hands it to u/schema-coercion for every gathered row.
;; The value column is wrapped in s/maybe, so a nil value is accepted.
(def InstPopnInputSchema
  {:gss-code s/Str
   :age-group s/Str
   :sex (s/enum "M" "F")
   :relationship s/Str
   :year s/Int
   :dclg-institutional-popn (s/maybe double)})

;; Keys passed as the third argument to u/write-ordered-csv when the
;; `:dclg-institutional-popn` output is written (see ingest-dclg-2012 /
;; -2014). Presumably this fixes the column order of the csv -- confirm
;; against u/write-ordered-csv.
(def InstPopnOutputCols
  [:gss-code :age-group :sex
   :year :relationship :dclg-institutional-popn])

;; Transducer cleaning raw rows of the 2012 DCLG household files.
;; Step 1 drops every row matched by u/age-group-equals-total?; the
;; predicate runs before the rename, so it sees the raw column keys.
;; Step 2 renames the raw column keys to the keys used by the schemas.
(def scrub-data-2012
  (let [raw->clean {:ONScode :gss-code
                    :Age :age-group
                    :Gender :sex
                    :Relationship :relationship}]
    (comp
     (remove u/age-group-equals-total?)
     (map (fn [row] (st/rename-keys row raw->clean))))))

;; Transducer cleaning raw rows of the 2014 DCLG household files.
;; Step 1 drops every row matched by u/age-group-equals-total?; the
;; predicate runs before the rename, so it sees the raw column keys.
;; Step 2 renames the raw column keys (the sex column is `:Sex` in this
;; version) and discards the `:Region` entry.
(def scrub-data-2014
  (let [raw->clean {:ONScode :gss-code
                    :Age :age-group
                    :Sex :sex
                    :Relationship :relationship}]
    (comp
     (remove u/age-group-equals-total?)
     (map (fn [row]
            (-> row
                (st/rename-keys raw->clean)
                (dissoc :Region)))))))

(defn format-dclg-data
  "Returns a transducer that reshapes scrubbed DCLG household rows.

  Each incoming row is expanded with u/gather-by-year, keeping
  :gss-code, :age-group, :sex and :relationship as the fixed columns and
  using `val-keyname` as the value key. Every resulting record is run
  through u/schema-coercion against `schema-in`. Records for which
  su/error? is true are then removed, followed by records matched by
  u/year-before-2011?."
  [schema-in val-keyname]
  (let [gather (fn [row]
                 (u/gather-by-year
                  [:gss-code :age-group :sex :relationship] row val-keyname))
        coerce (fn [record]
                 (u/schema-coercion schema-in record))]
    (comp
     (mapcat gather)
     (map coerce)
     (remove su/error?)
     (remove u/year-before-2011?))))

(defn- process-dclg-household-data
  "Shared implementation behind process-dclg-data-2012 and
  process-dclg-data-2014, which differ only in the scrub transducer.

  Loads `filename` with u/load-csv, runs the rows through `scrub-xf`
  composed with (format-dclg-data schema-in val-keyname), passes the
  realised vector to gss/harmonize with `val-keyname` as the only value
  column, and returns the harmonized records sorted by the key columns."
  [scrub-xf filename schema-in val-keyname]
  (let [;; Used both as the harmonize key columns and as the sort order.
        key-cols [:gss-code :year :sex :relationship :age-group]
        csv (u/load-csv filename)
        xf (comp scrub-xf (format-dclg-data schema-in val-keyname))
        harmonized (gss/harmonize key-cols [val-keyname] (into [] xf csv))]
    (sort-by (apply juxt key-cols) harmonized)))

(defn process-dclg-data-2012
  "Processes one 2012-format DCLG household csv file.

  `filename` is the csv path, `schema-in` the schema each gathered record
  is coerced against, and `val-keyname` the key of the value column.
  Rows are cleaned with scrub-data-2012. Returns the harmonized records
  sorted by :gss-code, :year, :sex, :relationship, :age-group."
  [filename schema-in val-keyname]
  (process-dclg-household-data scrub-data-2012 filename schema-in val-keyname))

(defn process-dclg-data-2014
  "Processes one 2014-format DCLG household csv file.

  `filename` is the csv path, `schema-in` the schema each gathered record
  is coerced against, and `val-keyname` the key of the value column.
  Rows are cleaned with scrub-data-2014. Returns the harmonized records
  sorted by :gss-code, :year, :sex, :relationship, :age-group."
  [filename schema-in val-keyname]
  (process-dclg-household-data scrub-data-2014 filename schema-in val-keyname))

(defn ingest-dclg-2012
  "Runs the three 2012 DCLG household inputs through
  process-dclg-data-2012 and writes each result with u/write-ordered-csv.
  This data is no longer being used in the household projections because
  more recent data is available."
  []
  (let [jobs [{:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2012_hh_repr_rates_darlington.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2012_hh_repr_rates_darlington.csv"
               :schema HRRInputSchema
               :value-key :hh-repr-rates
               :cols HRROutputCols}
              {:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2012_hh_popn_proj_darlington.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2012_hh_popn_proj_darlington.csv"
               :schema HHpopnInputSchema
               :value-key :dclg-household-popn
               :cols HHpopnOutputCols}
              {:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2012_inst_popn_proj_darlington.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2012_inst_popn_proj_darlington.csv"
               :schema InstPopnInputSchema
               :value-key :dclg-institutional-popn
               :cols InstPopnOutputCols}]
        ;; mapv is eager: all three inputs are processed, in order, before
        ;; any output file is written.
        processed (mapv (fn [{:keys [in schema value-key]}]
                          (process-dclg-data-2012 in schema value-key))
                        jobs)]
    ;; Write in job order; the value of the final write is returned.
    (last (mapv (fn [{:keys [out cols]} rows]
                  (u/write-ordered-csv out rows cols))
                jobs
                processed))))

(defn ingest-dclg-2014
  "Runs the three 2014 DCLG household inputs through
  process-dclg-data-2014 and writes each result with u/write-ordered-csv,
  ready for loading into the household projection model."
  []
  (let [jobs [{:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2014_hh_repr_rates.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2014_hh_repr_rates.csv"
               :schema HRRInputSchema
               :value-key :hh-repr-rates
               :cols HRROutputCols}
              {:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2014_hh_popn_proj.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2014_hh_popn_proj.csv"
               :schema HHpopnInputSchema
               :value-key :dclg-household-popn
               :cols HHpopnOutputCols}
              {:in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_2014_inst_popn_proj.csv"
               :out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_2014_inst_popn_proj.csv"
               :schema InstPopnInputSchema
               :value-key :dclg-institutional-popn
               :cols InstPopnOutputCols}]
        ;; mapv is eager: all three inputs are processed, in order, before
        ;; any output file is written.
        processed (mapv (fn [{:keys [in schema value-key]}]
                          (process-dclg-data-2014 in schema value-key))
                        jobs)]
    ;; Write in job order; the value of the final write is returned.
    (last (mapv (fn [{:keys [out cols]} rows]
                  (u/write-ordered-csv out rows cols))
                jobs
                processed))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DCLG dwellings data                                      ;;
;;                                                          ;;
;; Needs manual processing first to make machine readable:  ;;
;;   - Delete blank lines, footnotes, superscripts          ;;
;;   - Delete parent geography rows                         ;;
;;   - Delete rows with totals for England, London          ;;
;;   - Remove spaces in column names (& superscripts)       ;;
;;   - Save as csv file; make sure no commas in numbers     ;; 
;;                                                          ;;
;; Manually processed data contains blanks/missing data:    ;;
;;   - Code below converts these values to nil              ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Schema for one gathered record of the dwellings dataset; passed to
;; process-dclg-data-dwellings by ingest-dclg-dwellings and used by
;; format-dwellings-data for u/schema-coercion.
;; `:dwellings` is wrapped in s/maybe because scrub-dwellings-data turns
;; blank and ".." cells into nil.
(def DclgDwellingsSchema
  {:gss-code s/Str
   :year s/Int
   :dwellings (s/maybe s/Int)})

;; Schema for one gathered record of the vacant dwellings dataset; passed
;; to process-dclg-data-dwellings by ingest-dclg-dwellings and used by
;; format-dwellings-data for u/schema-coercion.
;; `:vacancy-dwellings` is wrapped in s/maybe because scrub-dwellings-data
;; turns blank and ".." cells into nil.
(def DclgVacantDwellingsSchema
  {:gss-code s/Str
   :year s/Int
   :vacancy-dwellings (s/maybe s/Int)})

;; Transducer cleaning raw rows of the (manually pre-processed) dwellings
;; csv files:
;;   1. renames :NewONScode to :gss-code
;;   2. replaces every cell equal to ".." or "" with nil
;;   3. drops rows matched by u/missing-gss-code?
(def scrub-dwellings-data
  (let [missing-marker? #{".." ""}
        blanks->nil (fn [row]
                      (into {}
                            (map (fn [[k v]]
                                   [k (if (missing-marker? v) nil v)]))
                            row))]
    (comp
     (map (fn [row] (st/rename-keys row {:NewONScode :gss-code})))
     (map blanks->nil)
     (remove u/missing-gss-code?))))

(defn format-dwellings-data
  "Returns a transducer that reshapes scrubbed dwellings rows.

  Each incoming row is expanded with u/gather-by-year, keeping :gss-code
  as the only fixed column and using `val-keyname` as the value key.
  Every resulting record is run through u/schema-coercion against
  `schema`. Records for which su/error? is true are then removed,
  followed by records matched by u/year-before-2011?."
  [schema val-keyname]
  (let [gather (fn [row]
                 (u/gather-by-year [:gss-code] row val-keyname))
        coerce (fn [record]
                 (u/schema-coercion schema record))]
    (comp
     (mapcat gather)
     (map coerce)
     (remove su/error?)
     (remove u/year-before-2011?))))

(defn process-dclg-data-dwellings
  "Processes one dwellings csv file.

  Loads `filename` with u/load-csv, runs the rows through
  scrub-dwellings-data composed with (format-dwellings-data schema
  val-keyname), passes the realised vector to gss/harmonize keyed on
  :gss-code and :year with `val-keyname` as the only value column, and
  returns the harmonized records sorted by :gss-code then :year."
  [filename schema val-keyname]
  (let [raw-rows (u/load-csv filename)
        pipeline (comp scrub-dwellings-data
                       (format-dwellings-data schema val-keyname))
        records (into [] pipeline raw-rows)]
    (->> records
         (gss/harmonize [:gss-code :year] [val-keyname])
         (sort-by (juxt :gss-code :year)))))

;; Processes the cleaned dwellings and vacant-dwellings csv files with
;; process-dclg-data-dwellings and writes each result with u/write-csv.
;;
;; NOTE(review): the docstring below says only 2015 data is kept and that
;; missing 2015 values are back-filled from the most recent year with data.
;; Nothing visible in this function, process-dclg-data-dwellings,
;; scrub-dwellings-data or format-dwellings-data does that; the only
;; visible year filter is u/year-before-2011?. Confirm whether this happens
;; inside gss/harmonize or u/write-csv, or relies on the manually cleaned
;; input, or whether the docstring is stale.
(defn ingest-dclg-dwellings 
  "Scrub and format DCLG dwellings data for loading into household projection
  model. Only keeps data for 2015. Note that some local authorities do not have data
  for 2015; in this case the value from the most recent year with data is used."
  []
  (let [dwel-in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_dwellings_cleaned.csv"
        dwel-out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_dwellings_2015.csv"
        vdwel-in "data/data_to_ingest/dclg_household_model_inputs/data_inputs_raw/dclg_vacant_dwellings_cleaned.csv"
        vdwel-out "data/data_to_ingest/dclg_household_model_inputs/data_inputs_prepped/dclg_vacant_dwellings_2015.csv"
        ;; Both inputs are processed before either output is written.
        dwellings (process-dclg-data-dwellings dwel-in DclgDwellingsSchema :dwellings)
        vac-dwellings (process-dclg-data-dwellings vdwel-in DclgVacantDwellingsSchema :vacancy-dwellings)]
    (u/write-csv dwel-out dwellings)
    (u/write-csv vdwel-out vac-dwellings)))

(defn -main
  "Entry point: runs the 2012 household, 2014 household and dwellings
  ingests, in that order. Takes no arguments."
  []
  ;; no longer used in household model
  (ingest-dclg-2012)
  ;; slow - large files
  (ingest-dclg-2014)
  ;; nils in data
  (ingest-dclg-dwellings))
