(ns antistock.db.features
  (:refer-clojure :exclude [distinct group-by update])
  (:require [antistock.db.quotes :as quotes]
            [antistock.json :as json]
            [clj-time.coerce :refer [to-date-time to-sql-time]]
            [clojure.data.csv :as csv]
            [clojure.java.io :as io]
            [clojure.string :as str]
            [datumbazo.core :refer :all :exclude [columns]]))

;; Names of the per-day feature columns. `features` hands this vector to
;; `feature-window` to build the lagged select expressions, and
;; `column-keywords` derives the output column names from it.
(def feature-keywords
  [:adj-close
   :close
   :high
   :low
   :open
   :tgeo
   :thtg
   :tid
   :turl
   :tusm
   :uflw
   :ufrn
   :uid
   :volume])

(defn- constrain-subgraph
  "Build the `where` clause that limits a tweet query to `quotes` and to
  tweets created at or after `start` and before `end`. Every constraint
  is optional; returns nil when none of them is given."
  [& {:keys [quotes start end]}]
  (let [quotes? (seq quotes)]
    (when (or quotes? start end)
      (let [since (when start
                    `(>= :twitter.tweets.created-at ~(to-date-time start)))
            until (when end
                    `(< :twitter.tweets.created-at ~(to-date-time end)))
            among (when quotes?
                    `(in :quote-id ~(map :id quotes)))
            ;; Only the constraints that were actually requested end up
            ;; in the conjunction.
            conditions (remove nil? [since until among])]
        (where `(and ~@conditions))))))

(defn- group-order-by-quote-id-date
  "Return the chained clauses that group rows by quote id and tweet
  creation date and order them by the same two expressions."
  []
  (let [day '(date :twitter.tweets.created-at)]
    (chain-state
     [(group-by :quote-id day)
      (order-by :quote-id day)])))

(defn number-of-retweets
  "Return the RTID feature (number of retweets in G).

  Not implemented yet: every call throws an AssertionError."
  [db & [{:keys [quotes start end]}]]
  ;; TODO: Implement the RTID query.
  ;; Thrown explicitly instead of via `(assert false)`: assertions are
  ;; compiled away when *assert* is false, which would make this stub
  ;; silently return nil.
  (throw (AssertionError. "number-of-retweets (RTID) is not implemented yet.")))

(defn number-distinct-users-with-retweets
  "Return the RTU (number of different users that have re-tweeted in G).

  Not implemented yet: every call throws an AssertionError."
  [db & [{:keys [quotes start end]}]]
  ;; TODO: Implement the RTU query.
  ;; Thrown explicitly instead of via `(assert false)`: assertions are
  ;; compiled away when *assert* is false, which would make this stub
  ;; silently return nil.
  (throw (AssertionError.
          "number-distinct-users-with-retweets (RTU) is not implemented yet.")))

(defn number-of-tweets-with-geolocation
  "Return the TGEO feature (number of tweets with geo location in G).

  Counts, per quote id and tweet date, the joined rows whose `location`
  is neither NULL nor matched by the whitespace-only pattern. The
  optional `quotes`, `start` and `end` narrow the tweets considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        exprs [:quote-id day (as '(count *) :tgeo)]
        ;; `~` is the unquote reader macro, so the operator keyword is
        ;; built with `keyword` instead of being written as a literal.
        not-matching (keyword "!~")
        ;; NOTE(review): `location` is unqualified; presumably it is the
        ;; column of the joined twitter.users table - confirm.
        has-location `(and (is-not-null :location)
                           (~not-matching :location "^[[:space:]]*$"))]
    (select db exprs
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (where has-location :and)
      (group-order-by-quote-id-date))))

(defn number-of-tweets
  "Return the TID feature (number of tweets in G).

  Counts the tweet/quote rows per quote id and tweet date. The optional
  `quotes`, `start` and `end` narrow the tweets considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        total (as '(count *) :tid)]
    (select db [:quote-id day total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn number-of-tweets-mentioning-user
  "Return the TUSM feature (number of tweets that mention any user in G).

  Counts the joined rows per quote id and tweet date. The optional
  `quotes`, `start` and `end` narrow the tweets considered."
  [db & [{:keys [quotes start end]}]]
  ;; NOTE(review): the query joins each tweet with twitter.users on the
  ;; tweet's user-id and applies no filter on mentions, so it appears to
  ;; count every tweet rather than only those mentioning a user - confirm
  ;; against the schema.
  (let [day '(date :twitter.tweets.created-at)
        total (as '(count *) :tusm)]
    (select db [:quote-id day total]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn avg-number-of-friends
  "Return the UFRN feature (average number of friends for user that posted in G).

  Averages `friends-count` over the joined rows per quote id and tweet
  date. The optional `quotes`, `start` and `end` narrow the tweets
  considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        ;; NOTE(review): `friends-count` is unqualified; presumably a
        ;; column of the joined twitter.users table - confirm.
        friends (as '(avg :friends-count) :ufrn)]
    (select db [:quote-id day friends]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn number-of-hash-tags-in-tweets
  "Return the THTG feature (number of hash tags used in all the tweets in G).

  Counts, per quote id and tweet date, the rows produced by joining the
  tweets with `twitter.hash-tags-tweets`. The optional `quotes`, `start`
  and `end` narrow the tweets considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        hash-tags (as '(count *) :thtg)]
    (select db [:quote-id day hash-tags]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.hash-tags-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn number-of-tweets-with-urls
  "Return the TURL feature (number of tweets with urls in G).

  Counts, per quote id and tweet date, the distinct tweets that have at
  least one row in `twitter.links-tweets`. The optional `quotes`, `start`
  and `end` narrow the tweets considered."
  [db & [{:keys [quotes start end]}]]
  ;; The join with twitter.links-tweets yields one row per matching link
  ;; row, so a plain `count(*)` would count a tweet once per URL. Counting
  ;; distinct tweet ids keeps the feature a number of tweets.
  (select db [:quote-id '(date :twitter.tweets.created-at)
              (as '(count distinct :twitter.tweets.id) :turl)]
    (from :twitter.tweets)
    (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
    (join :twitter.links-tweets.tweet-id :twitter.tweets-quotes.tweet-id)
    (constrain-subgraph :quotes quotes :start start :end end)
    (group-order-by-quote-id-date)))

(defn avg-number-of-followers-for-users
  "Return the UFLW feature (average number of followers for user that posted in G).

  Averages `followers-count` over the joined rows per quote id and tweet
  date. The optional `quotes`, `start` and `end` narrow the tweets
  considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        ;; NOTE(review): `followers-count` is unqualified; presumably a
        ;; column of the joined twitter.users table - confirm.
        followers (as '(avg :followers-count) :uflw)]
    (select db [:quote-id day followers]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn number-of-distinct-users
  "Return the UID feature (number of different users that posted in G).

  Counts the distinct `twitter.tweets.user-id` values per quote id and
  tweet date. The optional `quotes`, `start` and `end` narrow the tweets
  considered."
  [db & [{:keys [quotes start end]}]]
  (let [day '(date :twitter.tweets.created-at)
        users (as '(count distinct :twitter.tweets.user-id) :uid)]
    (select db [:quote-id day users]
      (from :twitter.tweets)
      (join :twitter.tweets-quotes.tweet-id :twitter.tweets.id)
      (join :twitter.users.id :twitter.tweets.user-id)
      (constrain-subgraph :quotes quotes :start start :end end)
      (group-order-by-quote-id-date))))

(defn prices
  "Select the price rows of `quotes` dated at or after `start` and before
  `end`. Every constraint is optional; without any of them all rows of
  the prices table are selected."
  [db & [{:keys [quotes start end]}]]
  (let [since (when start
                `(>= :date ~(to-date-time start)))
        until (when end
                `(< :date ~(to-date-time end)))
        among (when (seq quotes)
                `(in :quote-id ~(map :id quotes)))
        conditions (remove nil? [since until among])]
    (select db [:quote-id :date :open :close :high :low :volume :adj-close]
      (from :prices)
      ;; No where clause at all when nothing was constrained.
      (when (seq conditions)
        (where `(and ~@conditions))))))

(defn date-series
  "Select a `date` column generated with `generate-series` from `start`
  to `end` in steps of one day, each value truncated to the day and cast
  to a date."
  [db start end]
  (let [first-ts (to-sql-time start)
        last-ts (to-sql-time end)
        series `(generate-series
                 (cast ~first-ts :timestamp)
                 (cast ~last-ts :timestamp)
                 (cast "1 day" :interval))
        day (as '(cast (date-trunc "day" :time) :date) :date)]
    (select db [day]
      (from (as series :time)))))

(defn- join-features
  "Return a left join of `feature-2` whose condition requires the
  `quote-id` and `date` columns of `feature-1` and `feature-2` to be
  equal."
  [feature-1 feature-2]
  (let [column (fn [table field]
                 (keyword (str (name table) "." field)))
        same-quote `(= ~(column feature-1 "quote-id")
                       ~(column feature-2 "quote-id"))
        same-date `(= ~(column feature-1 "date")
                      ~(column feature-2 "date"))]
    (join feature-2
          `(on (and ~same-quote ~same-date))
          :type :left)))

(defn feature-alias
  "Return the alias keyword of `feature` for lag `day`: the last
  dot-separated segment of the feature's name, a dash, and `day`. For
  example `:prices.close` with day 2 gives `:close-2`."
  [day feature]
  (-> (name feature)
      (str/split #"\.")
      last
      (str "-" day)
      keyword))

(defn- feature-window
  "Return one aliased `lag` expression over `window` for every
  combination of a day offset in 0 (inclusive) to `days` (exclusive) and
  a feature in `features`. Expressions are ordered by day first, then by
  feature."
  [window days features]
  (mapcat (fn [day]
            (map (fn [feature]
                   (as `(over (lag ~feature ~day) ~window)
                       (feature-alias day feature)))
                 features))
          (range 0 days)))

(defn features
  "Return machine learning features.

  Builds one query with a common table expression for the prices and for
  each tweet feature, all restricted by the same `opts` (`quotes`,
  `start`, `end`). The result has the quote id, the price date formatted
  as YYYY-MM-DD, and, for every day offset from 0 up to `days` (default
  1, exclusive) and every entry of `feature-keywords`, that feature
  lagged by the offset. Rows are ordered by quote id and price date."
  [db & [{:keys [days quotes start end] :as opts}]]
  ;; Each binding names a common table expression; the name is also the
  ;; alias of the feature column the sub-query selects.
  (with db [:prices (prices db opts)
            :tgeo (number-of-tweets-with-geolocation db opts)
            :thtg (number-of-hash-tags-in-tweets db opts)
            :tid (number-of-tweets db opts)
            :turl (number-of-tweets-with-urls db opts)
            :tusm (number-of-tweets-mentioning-user db opts)
            :uflw (avg-number-of-followers-for-users db opts)
            :ufrn (avg-number-of-friends db opts)
            :uid (number-of-distinct-users db opts)]
    (select db (concat
                [:prices.quote-id
                 (as '(to-char :prices.date "YYYY-MM-DD") :date)]
                ;; Lagged copies of every feature over the window :w
                ;; declared below.
                (feature-window :w (or days 1) feature-keywords))
      (from :prices)
      ;; Left joins on quote id and date: every price row is kept even
      ;; when a feature has no row for that quote and day.
      (join-features :prices :tgeo)
      (join-features :prices :thtg)
      (join-features :prices :tid)
      (join-features :prices :turl)
      (join-features :prices :tusm)
      (join-features :prices :uflw)
      (join-features :prices :ufrn)
      (join-features :prices :uid)
      ;; The window the lag expressions run over: one partition per
      ;; quote, ordered by price date.
      (window (as '(partition-by
                    :prices.quote-id
                    (order-by :prices.date)) :w))
      (order-by :prices.quote-id :prices.date))))

(defn column-keywords
  "Return the output column names for `days` day offsets: `:quote-id` and
  `:date` followed by the sorted `<feature>-<day>` keywords for every
  entry of `feature-keywords` and every day in 0 (inclusive) to `days`
  (exclusive)."
  [days]
  (let [lagged (for [day (range 0 days)
                     feature feature-keywords]
                 (keyword (str (name feature) "-" day)))]
    (->> (sort lagged)
         (concat [:quote-id :date]))))

(defn- save-csv-features
  "Save the rows in `result-set` in CSV format to `filename`.

  The `:columns` entry of `opts` is required: it fixes the column order
  and provides the header line. Each row is looked up with every column
  to produce its values."
  [result-set filename & [opts]]
  (let [cols (:columns opts)]
    (assert (not-empty cols) "Can't save rows in CSV format without columns.")
    (with-open [out (io/writer filename)]
      (let [header (map name cols)
            records (map (fn [row] (map row cols)) result-set)]
        (csv/write-csv out (cons header records))))))

(defn- save-edn-features
  "Save the rows in `result-set` in EDN format to `filename`, one row per
  line. `opts` is accepted for symmetry with the other writers and is
  not used. I/O errors raised while writing or closing the file
  propagate to the caller."
  [result-set filename & [opts]]
  ;; Write through the `io/writer` directly: a java.io.PrintWriter never
  ;; throws on write failures, so a full disk would silently produce a
  ;; truncated file.
  (with-open [^java.io.Writer out (io/writer filename)]
    (let [eol (System/lineSeparator)]
      (doseq [row result-set]
        (.write out ^String (pr-str row))
        (.write out ^String eol)))))

(defn- save-json-features
  "Save the rows in `result-set` in JSON format to `filename`, one row per
  line. `opts` is accepted for symmetry with the other writers and is
  not used. I/O errors raised while writing or closing the file
  propagate to the caller."
  [result-set filename & [opts]]
  ;; Write through the `io/writer` directly: a java.io.PrintWriter never
  ;; throws on write failures, so a full disk would silently produce a
  ;; truncated file.
  (with-open [^java.io.Writer out (io/writer filename)]
    (let [eol (System/lineSeparator)]
      (doseq [row result-set]
        (.write out ^String (str (json/json-str row)))
        (.write out ^String eol)))))

(defn save-features
  "Select features from `db` and write them to `filename`.

  `format` selects the output format and must be `:csv`, `:edn` or
  `:json`; anything else throws an IllegalArgumentException before the
  query is run. `days` defaults to 1. `quotes`, `start` and `end` are
  passed on to `features`.

  Returns a map with the size of the written file in `:bytes`, the
  `:filename`, and the number of `:rows` in the result set."
  [db filename & [{:keys [format days quotes start end] :as opts}]]
  (let [;; Resolve the writer up front so that an unsupported format fails
        ;; fast instead of only after the query has been executed.
        save-fn (case format
                  :csv save-csv-features
                  :edn save-edn-features
                  :json save-json-features
                  (throw (IllegalArgumentException.
                          (str "Unsupported feature format: " (pr-str format)))))
        opts (update-in opts [:days] #(or % 1))
        opts (assoc opts :columns (column-keywords (:days opts)))
        ;; The result set is written from inside :result-set-fn, which
        ;; then returns the row count as a single-row result.
        rows (-> (run (features db opts)
                   {:fetch-size 1000
                    :result-set-fn
                    (fn [result-set]
                      (save-fn result-set filename opts)
                      [{:count (count result-set)}])})
                 first :count)]
    {:bytes (.length (io/file filename))
     :filename filename
     :rows rows}))
