(ns toy-app.ml.models
  )




(defn actions_per_day
  "For each phrase row selected for user `id`, returns the result of
  `(fav_fol_per_day id :phrase <phrase>)`. Each result carries metadata
  `{:phrase <phrase>}` in which every double quote of the phrase has been
  replaced by two backticks."
  [id]
  (for [{phrase-text :phrase} (select phrases (where {:user_id id}))]
    (with-meta (fav_fol_per_day id :phrase phrase-text)
      {:phrase (clojure.string/replace phrase-text "\"" "``")})))

;; TODO: dates on which no user has a row with fav_count > 0 are missing from the result
(defn avg_follows_per_day
  "Aggregates `fav_fol_per_day` rows of all active users by :date.

  Only rows with a :fav_count greater than 0 are counted. For every date the
  result holds :user_count (number of such rows), :follow_count (sum of their
  :follow_count) and :avg_count (follow_count / user_count as a float).
  The rows are sorted by :date and carry :title / :fields metadata."
  []
  (let [all-rows (flatten (map fav_fol_per_day (map :id (active_users))))
        with-favs (filter #(< 0 (:fav_count %)) all-rows)
        per-day (for [[day entries] (group-by :date with-favs)
                      :let [n-users (count entries)
                            n-follows (reduce + 0 (map :follow_count entries))]]
                  {:date day
                   :user_count n-users
                   :follow_count n-follows
                   :avg_count (float (/ n-follows n-users))})]
    (with-meta (sort-by :date per-day)
      {:title "Daily Average Follows", :fields [:date :user_count :follow_count :avg_count]})))








(defn model-num-for-user
  "Runs one grouped query over the favorites of `uid` (left-joined with the
  user's follows) and returns a map keyed by :experiment_id. Each value is the
  remaining columns of that row: :fav_count, :fol_count and :last_at.
  Rows without an experiment id are excluded by the query."
  [uid]
  (let [rows (select (favorites uid)
                     (fields :experiment_id)
                     (aggregate (count :tweet_id) :fav_count)
                     (aggregate (count :follows.a_id) :fol_count)
                     (aggregate (max :favorites.at) :last_at)
                     (join :left (follows uid) (= :follows.a_id :twitter_user_id))
                     (where (raw "experiment_id is not null"))
                     (group :experiment_id))]
    ;; index every row by its experiment id; a later duplicate id wins
    (reduce (fn [by-experiment row]
              (assoc by-experiment
                     (:experiment_id row)
                     (dissoc row :experiment_id)))
            {}
            rows)))

(defn model-experiments-total-stats
  "Combines `model-num-for-user` across all active users.

  Returns a map of experiment_id -> {:fav_count :fol_count :last_at :efficacy}.
  Counts are summed across users, :last_at is the greatest value seen, and
  :efficacy is fol_count / fav_count as a float (0 when it cannot be computed,
  e.g. on division by zero)."
  []
  (let [;; `max` only accepts numbers; :last_at comes from max(favorites.at)
        ;; and is presumably a timestamp (TODO confirm). `compare` handles any
        ;; Comparable and treats nil as smallest, so a missing value never wins.
        latest (fn [a b] (if (pos? (compare a b)) a b))
        combine (fn [x y]
                  {:fav_count (+ (:fav_count x) (:fav_count y))
                   :fol_count (+ (:fol_count x) (:fol_count y))
                   :last_at (latest (:last_at x) (:last_at y))})
        fav-fol (->> (active_users)
                     (map :id)
                     (map model-num-for-user)
                     (apply merge-with combine))]
    (->> fav-fol
         (map (fn [[experiment_id {:keys [fav_count fol_count] :as whole}]]
                (let [efficacy (try (float (/ fol_count fav_count))
                                    (catch Exception _ 0))]
                  [experiment_id (assoc whole :efficacy efficacy)])))
         (into {}))))

(defn total_fav_follows_per_day
  "Sums :fav_count and :follow_count of the `fav_fol_per_day` rows of all
  active users per :date. Returns the per-date rows sorted by :date, with
  :title / :fields metadata attached."
  []
  (let [all-rows (flatten (map fav_fol_per_day (map :id (active_users))))
        total-of (fn [k rows] (reduce + 0 (map k rows)))
        per-day (for [[day rows] (group-by :date all-rows)]
                  {:date day
                   :fav_count (total-of :fav_count rows)
                   :follow_count (total-of :follow_count rows)})]
    (with-meta (sort-by :date per-day)
      {:title "Daily Follows", :fields [:date :fav_count :follow_count]})))

; list the overall stats per day
(defn total_efficacy_per_day
  "Adds :efficacy (follow_count / fav_count as a float, or 0 when the division
  fails) to every row of `total_fav_follows_per_day`. The result carries
  :title / :fields metadata."
  []
  (let [with-efficacy (fn [day-row]
                        (let [ratio (try
                                      (float (/ (:follow_count day-row) (:fav_count day-row)))
                                      (catch Exception _ 0))]
                          (assoc day-row :efficacy ratio)))
        rows (map with-efficacy (total_fav_follows_per_day))]
    (with-meta rows
      {:title "Daily Efficacy", :fields [:date :fav_count :follow_count :efficacy]})))

(defn accum_followers_per_day
  "Running total of :follow_count over `stats` (defaults to
  `(fav_fol_per_day id)`), in the order the rows are given.

  Returns nil when `stats` is nil or empty; otherwise one map per input row:
  {:date (prettify_date <row date>) :follow_count <cumulative sum>}.

  Every output row has this same shape. A plain `reductions` without an initial
  value would pass the first input row through untouched (raw :date and any
  extra keys such as :fav_count), unlike all later rows."
  ([id] (accum_followers_per_day id (fav_fol_per_day id)))
  ([id stats]
     (when (seq stats)
       (map (fn [row running-total]
              {:date (prettify_date (:date row))
               :follow_count running-total})
            stats
            (reductions + (map :follow_count stats))))))

(defn fav_fol_last_days
  "Returns one row per day for the window that starts `days_ago` days before
  now and spans `days_ago` + 1 days, in ascending date order.

  For a day present in `stats` (defaults to `(fav_fol_per_day id)`) the first
  matching row is used; otherwise a zero row
  {:date <day key> :fav_count 0 :follow_count 0} is produced. Day keys are
  built with `comparable_date` applied to the epoch-millis of the date."
  ([id days_ago] (fav_fol_last_days id days_ago (fav_fol_per_day id)))
  ([id days_ago stats]
     (let [window-start (minus (now) (days days_ago))
           day-key-of (fn [dt] (comparable_date (to-long dt)))
           first-key (day-key-of window-start)
           rows-by-day (group-by :date
                                 (filter #(>= (:date %) first-key) stats))]
       (for [offset (range (inc days_ago))
             :let [day-key (day-key-of (plus window-start (days offset)))]]
         (if-let [rows (get rows-by-day day-key)]
           (first rows)
           {:date day-key :fav_count 0 :follow_count 0})))))

(defn summary_stats_for_user
  "Totals :fav_count and :follow_count over `stats` (defaults to
  `(fav_fol_per_day user_id)`) and returns
  {:fav_count <sum> :follow_count <sum> :efficacy <follows / favs as float>}.
  :efficacy is 0.0 when the division fails (e.g. no favorites)."
  ([user_id] (summary_stats_for_user user_id (fav_fol_per_day user_id)))
  ([user_id stats]
     (let [sum-by (fn [k] (apply + (map k stats)))
           favs (sum-by :fav_count)
           fols (sum-by :follow_count)]
       {:fav_count favs
        :follow_count fols
        :efficacy (try (float (/ fols favs))
                       (catch Exception _ 0.0))})))

(defn bayes_positive_observations
  "Favorites of `user_id` for keyword `p` whose :twitter_user_id appears among
  the :a_id values of the follows rows with :b_id = `user_id`.
  Each one is returned as
  {:target 1 :data (fav_to_bayes_features user_id fav) :id (:id fav)}."
  [p user_id]
  (let [favs (select (favorites user_id) (where {:keyword p :user_id user_id}))
        follow-rows (select (follows user_id) (where {:b_id user_id}))
        friend-ids (set (map :a_id follow-rows))]
    (for [fav favs
          :when (contains? friend-ids (:twitter_user_id fav))]
      {:target 1 :data (fav_to_bayes_features user_id fav) :id (:id fav)})))
(defn bayes_negative_observations
  "Favorites of `user_id` for keyword `p` whose :twitter_user_id does NOT appear
  among the :a_id values of the follows rows with :b_id = `user_id`.
  Each one is returned as
  {:target 0 :data (fav_to_bayes_features user_id fav) :id (:id fav)}."
  [p user_id]
  (let [favs (select (favorites user_id) (where {:keyword p :user_id user_id}))
        follow-rows (select (follows user_id) (where {:b_id user_id}))
        friend-ids (set (map :a_id follow-rows))
        followed? (fn [fav] (contains? friend-ids (:twitter_user_id fav)))]
    (map (fn [fav]
           {:target 0 :data (fav_to_bayes_features user_id fav) :id (:id fav)})
         (remove followed? favs))))


(defn bayes_observations
  "All bayes observations for keyword `p` and `user_id`: the positive ones
  followed by the negative ones."
  [p user_id]
  (let [positives (bayes_positive_observations p user_id)
        negatives (bayes_negative_observations p user_id)]
    (concat positives negatives)))
 
(defn id_positive_observations
  "The :id of every positive bayes observation for keyword `p` and `user_id`."
  [p user_id]
  (->> (bayes_positive_observations p user_id)
       (map :id)))
(defn id_negative_observations
  "The :id of every negative bayes observation for keyword `p` and `user_id`.
  Reads from `bayes_negative_observations`; the previous version called the
  positive variant by mistake and so returned the positive ids."
  [p user_id]
  (map :id (bayes_negative_observations p user_id)))
(defn id_observations
  "The :id of every bayes observation (positive then negative) for keyword `p`
  and `user_id`."
  [p user_id]
  (->> (bayes_observations p user_id)
       (map :id)))

(defn positive_observations
  "Favorites of `user_id` for keyword `p` whose :twitter_user_id appears among
  the :a_id values of the follows rows with :b_id = `user_id`.
  Each one is returned as {:target 1 :data (fav_to_features fav)}."
  [p user_id]
  (let [favs (select (favorites user_id) (where {:keyword p :user_id user_id}))
        follow-rows (select (follows user_id) (where {:b_id user_id}))
        friend-ids (set (map :a_id follow-rows))]
    (for [fav favs
          :when (contains? friend-ids (:twitter_user_id fav))]
      {:target 1 :data (fav_to_features fav)})))
(defn negative_observations
  "Favorites of `user_id` for keyword `p` whose :twitter_user_id does NOT appear
  among the :a_id values of the follows rows with :b_id = `user_id`.
  Each one is returned as {:target 0 :data (fav_to_features fav)}."
  [p user_id]
  (let [favs (select (favorites user_id) (where {:keyword p :user_id user_id}))
        follow-rows (select (follows user_id) (where {:b_id user_id}))
        friend-ids (set (map :a_id follow-rows))
        followed? (fn [fav] (contains? friend-ids (:twitter_user_id fav)))]
    (map (fn [fav] {:target 0 :data (fav_to_features fav)})
         (remove followed? favs))))

(defn writelines
  "Writes `lines` to `file-path`, replacing any existing content.

  - a map is written as its values joined with \",\" plus a newline
  - any sequential collection (list, lazy seq, vector) is written as its
    elements joined with \",\" plus a newline
  - anything else is written as `(str line)` with NO newline appended, so a
    caller passing plain strings controls line endings itself

  Returns nil. The writer is closed even when writing throws."
  [file-path lines]
  (with-open [^java.io.Writer wtr (clojure.java.io/writer file-path)]
    (doseq [line lines]
      (let [^String text (cond
                           ;; map rows end with a newline like sequence rows do;
                           ;; without it consecutive maps ran together on one line
                           (map? line) (str (clojure.string/join "," (vals line)) "\n")
                           ;; `sequential?` also covers vectors, which `seq?` rejects
                           (sequential? line) (str (clojure.string/join "," line) "\n")
                           :else (str line))]
        (.write wtr text)))))

; data format
; ({:y 1, :xs (1.0 16.0 0.0 0.0 0.0 1.0 400.0 682.0 215.0 8.0 95.0 470.0 4.0 0.0 0.0 0.0 250.0 1694.0 357.0 484.0 80.0 10.0 720.0 0.0 0.0)} {:y 1, :xs (1.0 17.0 0.0 0.0 0.0 0.0 1142.0 8.0 919.0 6.0 14.0 44.0 4.0 0.0 1.0 0.0 250.0 1694.0 357.0 484.0 80.0 10.0 720.0 0.0 0.0)} 
(defn write-data-to-file
  "Writes `data`, a sequence of {:y label :xs features} rows, to `filename`
  through `writelines`, one comma-separated line per row with the label first.

  An optional header may be passed as the first extra argument. It is written
  as the first line: a sequential header is comma-joined, anything else is
  written as its string form followed by a single newline. A nil or missing
  header writes no header line."
  [filename data & header]
  (let [header-arg (first header)
        header-row (cond
                     (nil? header-arg) nil
                     (sequential? header-arg) (seq header-arg)
                     ;; plain values get exactly one trailing newline here,
                     ;; independent of how `writelines` treats non-sequences
                     :else (str (clojure.string/trim-newline (str header-arg)) "\n"))
        ;; `cons` keeps the label in the first column for every kind of :xs;
        ;; `conj` would append it at the end when :xs is a vector
        rows (map (fn [row] (cons (:y row) (:xs row))) data)
        lines (if header-row (cons header-row rows) rows)]
    (writelines filename (doall lines))))

; returns header and rest of the data
(defn read-data-from-file
  "Reads the lines of `filename` via `readlines` and returns
  {:header <string or nil> :data <seq of {:y Integer :xs (Double ...)}>}.

  The first line is the header. Both the header and every data line are split
  on \",\": the first column is kept (as :y for data rows) and columns 2-4 are
  dropped; the remaining columns become :xs (parsed as doubles) or, for the
  header, are re-joined with \",\".

  An empty input yields {:header nil :data ()} and blank data lines are
  skipped instead of failing during parsing."
  [filename]
  (let [input (readlines filename)
        header-line (first input)
        split-csv (fn [line] (clojure.string/split line #","))
        parse-row (fn [line]
                    (let [parts (split-csv line)]
                      {:y (Integer/valueOf ^String (first parts))
                       :xs (map (fn [^String cell] (Double/valueOf cell))
                                (drop 4 parts))}))]
    {:header (when header-line
               (let [parts (split-csv header-line)]
                 (str (first parts) "," (clojure.string/join "," (drop 4 parts)))))
     :data (map parse-row
                (remove clojure.string/blank? (rest input)))}))

;given a phrase, get all the observations
(defn observations
  "All observations for keyword `p` and `user_id`: the positive ones followed
  by the negative ones."
  [p user_id]
  (let [positives (positive_observations p user_id)
        negatives (negative_observations p user_id)]
    (concat positives negatives)))

; call with e.g. all_observations as input file
(defn system_model_stats
  "Runs `k-crossfold-validation` with k = 10, the given `model_builder` and
  `model_predictor`, and the :data rows read from `data_file`."
  [model_builder model_predictor data_file]
  (k-crossfold-validation 10
                          model_builder
                          model_predictor
                          (:data (read-data-from-file data_file))))

; (model_creation k_train_data)
; NOTE(review): returns the result map of the shell call, not the model file
; name, although the predictor takes a model name — confirm what
; k-crossfold-validation hands on to the predictor.
(defn build-r-random-forest
  "Writes `train_data` to a file named \"data.<n>\" (n random, below 999) and
  runs src/system_model.rb in -build mode through bash, targeting the model
  file \"model.<n>\". Returns the value of the `sh` call.

  Options may be given as a single map or as key/value pairs; only :header is
  read and it is passed on to `write-data-to-file`."
  [train_data & params]
  (let [opts (cond
               (map? (first params)) (first params)
               (even? (count params)) (apply hash-map params)
               :else {})
        rand_int (rand-int 999)
        ;; `_` instead of `-`: binding `-` would shadow clojure.core/- in this let
        _ (println "params: " params)
        data_file (str "data." rand_int)
        ;; the training rows have to be written; previously only
        ;; (:header params) was passed as the data, which is nil for a rest-seq
        _ (write-data-to-file data_file train_data (:header opts))
        model-file (str "model." rand_int)
        ratio 1.5
        command (str "APP_ENV=" environment " /usr/bin/env ruby src/system_model.rb -build -m" model-file " -r" ratio " " data_file)]
    (sh "bash" :in command)))

;(predictor model (x :xs))
(defn predict-r-random-forest
  "Writes `test_data` to a scratch file and runs src/system_model.rb in
  -classify mode through bash against `model`. Returns the value of the `sh`
  call.

  NOTE(review): assumes `test_data` is a sequence of {:y :xs} rows as expected
  by `write-data-to-file` — confirm against what k-crossfold-validation passes.
  The scratch file name \"data.<n>.test\" is a local choice made here."
  [model test_data]
  (let [data_file (str "data." (rand-int 999) ".test")
        ;; the previous zero-argument call to write-data-to-file always threw
        ;; and never wrote `test_data`
        _ (write-data-to-file data_file test_data)
        command (str "APP_ENV=" environment " /usr/bin/env ruby src/system_model.rb -classify -m" model " " data_file)]
    (sh "bash" :in command)))

;; similar to system_model_stats but takes care of the header
(defn test-r-random-forest
  "Reads `data_file` and runs `k-crossfold-validation` with k = 5 using
  `build-r-random-forest` and `predict-r-random-forest` over the :data rows.
  The last argument is an options map built from `params` (a single map or
  key/value pairs) with :header set to the header read from the file."
  [data_file & params]
  (let [data (read-data-from-file data_file)
        ;; `params` is a rest-seq; it must become a map before `assoc`
        opts (cond
               (map? (first params)) (first params)
               (even? (count params)) (apply hash-map params)
               :else {})]
    (k-crossfold-validation
     5
     build-r-random-forest
     predict-r-random-forest
     (:data data)
     ;; was (first :data), which throws because a keyword is not seqable
     (assoc opts :header (:header data)))))

(defn scikit-candidates
  "Classifies `candidates` for user `uid` with the python model script
  (src/ml/model.py).

  Options (single map argument):
    :uid              user id written into every candidate row
    :candidates       candidate maps; :id and [:user :id] are written out
    :filter?          true (default): return only candidates labelled 1;
                      false: return [candidate label] pairs
    :feature-function string naming the var that computes the features
    :ratio            passed to the script as --ratio (default 1.5)
    :training-file    observations file; by default a /tmp path derived from
                      the md5 of ratio, model and feature function
    :model            python expression passed to the script as --model

  Steps: write the training file if it does not exist, write the candidates
  file, run the script in train mode, run it in run mode, and parse one
  integer label per output line. Returns [] when `candidates` is empty.

  Throws IllegalArgumentException when :feature-function does not resolve to
  a var."
  [{:keys [uid candidates filter? feature-function ratio training-file model]
       :or {filter? true
            ratio 1.5
            model "model = RandomForestClassifier(n_estimators = 501, n_jobs = 1)"
            feature-function "ads.ml/fav_to_features_discrete"}}]
  (if (empty? candidates)
    []
    (let [;; only a symbol is expected here; keep the reader from evaluating
          ;; #=(...) forms that may be embedded in the option string
          feature-symbol (binding [*read-eval* false] (read-string feature-function))
          feature-function-resolved (or (resolve feature-symbol)
                                        (throw (IllegalArgumentException.
                                                (str "scikit-candidates: cannot resolve feature function: "
                                                     feature-function))))
          processed (map #(feature-function-resolved uid %) candidates)
          tag (pid/current)
          candidates-file (str "/tmp/" uid "_" tag ".candidates")
          pre-hash-str (str ratio model feature-function-resolved)
          md5-prefix (str (md5-hash pre-hash-str))
          model-file (str "/tmp/" md5-prefix ".py.model")
          training-file (if (nil? training-file) (str "/tmp/" md5-prefix "_py_observations.csv") training-file)
          command-train (str "APP_ENV=" environment " /usr/bin/env python src/ml/model.py"
                             " --model \""  model "\" "
                             " --mode train "
                             " --ratio " ratio
                             " --file_model " model-file
                             " --file_train " training-file)
          command-run (str "APP_ENV=" environment " /usr/bin/env python src/ml/model.py"
                           " --mode run "
                           " --file_model " model-file
                           " --ratio " ratio
                           " --file_data " candidates-file)]
      (log/info (str "scikit-candidates, uid: " uid ", command_train: " command-train))
      (log/info (str "scikit-candidates, uid: " uid ", command_run: " command-run))
      (when-not (.exists (File. training-file))
        (output-all-observations feature-function-resolved training-file))
      (with-open [wrtr (clojure.java.io/writer candidates-file)]
        ;; header row: the feature function is called with a third argument
        ;; `true` and its result is used as the column names
        (.write wrtr (str "label,advertiser_id,tweet_id,candidate_id," (clojure.string/join "," (map name (feature-function-resolved uid (first candidates) true))) "\n"))
        ;; one row per candidate, label column fixed to -1
        (doseq [[p c] (map vector processed candidates)]
          (.write wrtr (str "-1," uid "," (:id c)  "," (-> c :user :id) ","  (clojure.string/join "," p) "\n"))))
      (let [train-result (sh "bash" :in command-train)
            _ (let [log_str (str "train rf python: " train-result)]
                (if (not= (:exit train-result) 0) (log/error log_str) (log/info log_str)))
            run-result (sh "bash" :in command-run)
            _ (let [log_str (str "run rf python err: " (:err run-result))]
                (if (not= (:exit run-result) 0) (log/error log_str) (log/info log_str)))
            ;; one integer label per stdout line, in candidate order
            results (map #(Integer. %) (-> run-result :out (clojure.string/split #"\n")))
            _ (log/info (str uid " python model filter " (reduce + results) " out of " (count results)))]
        (if filter?
          (map first (filter #(= (last %) 1) (map vector candidates results)))
          (map vector candidates results))))))



