;; Copyright (c) Sławek Gwizdowski
;;
;; Permission is hereby granted, free of charge, to any person obtaining
;; a copy of this software and associated documentation files (the "Software"),
;; to deal in the Software without restriction, including without limitation
;; the rights to use, copy, modify, merge, publish, distribute, sublicense,
;; and/or sell copies of the Software, and to permit persons to whom the
;; Software is furnished to do so, subject to the following conditions:
;;
;; The above copyright notice and this permission notice shall be included
;; in all copies or substantial portions of the Software.
;;
;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
;; OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
;; IN THE SOFTWARE.
;;
(ns ^{:author "Sławek 'smg' Gwizdowski"
      :doc "File tree indexer."}
  szew.fi
  (:gen-class)
  (:require [hugsql.core :as hugsql]
            [clojure.java.jdbc :as jdbc]
            [clojure.java.io :as clj.io]
            [clojure.string :as string]
            [clojure.tools.logging :as log]
            [hiccup.page :refer [html5]]
            [szew [io :as io]
             [h2 :as h2]]
            [szew.io.util :as util]
            [szew.fi.diff :as diff])
  (:import [java.io File FileNotFoundException]
           [java.nio.file Files]
           [java.util Date]))

;; Index creation, indexing, selectors defined in raw SQL.
;; hugsql will look for this file in resources/
(hugsql/def-db-fns "fi.sql")


;; ## Do we want it? (Setting bar really low, but Java7 required.)

(let [empty-opts (into-array java.nio.file.LinkOption [])]
  (defn of-interest?
    "a-file -> bool

    Simplest criteria: is it a file, can we read it.
    "
    [^java.io.File f]
    (let [path (.toPath f)]
      (and (Files/isRegularFile path empty-opts)
           (Files/isReadable path)))))


;; ## Reading the filesystem

(defrecord Entry [tag top-path hash-bytes store-up-to]
  io/Input
  (in! [spec source]
    (io! "Happily mutable."
         (let [a-file ^java.io.File (-> source
                                        (clj.io/as-file)
                                        (.getCanonicalPath)
                                        (clj.io/as-file))
               parent (-> a-file (.getParentFile)  (.getCanonicalPath))
               size   (.length a-file)
               path   (.getCanonicalPath a-file)]
           (try
             (assert tag "tag is required!")
             (assert top-path "top-path is required!")
             (assert (.contains path top-path) "top-path must be in path")
             (assert (or (contains? #{:skip :full} hash-bytes)
                         (pos? hash-bytes))
                     "hash-bytes must be a positive num, :skip or :full!")
             (assert (or (contains? #{:skip :full} store-up-to)
                         (pos? store-up-to))
                     "store-up-to must be a positive num, :skip or :full!")
             {:tag        tag
              :short_hash (when (not= :skip hash-bytes)
                            (io/in! (io/hasher {:hash-name "SHA1"
                                                :sample-size hash-bytes})
                                    a-file))
              :size       size
              :hashed_b   (cond
                            (= :skip hash-bytes) 0
                            (= :full hash-bytes) size
                            (> hash-bytes size) size
                            :else hash-bytes)
              :stored_b   (cond
                            (= :full store-up-to) size
                            (or (= :skip store-up-to)
                                (< store-up-to size)) 0
                            (>= store-up-to size) size
                            :else store-up-to)
              :write_time (Date. (.lastModified a-file))
              :path       path
              :name       (.getName a-file)
              :parent     parent
              :content    (when (and (not= :skip store-up-to)
                                     (or (= :full store-up-to)
                                         (>= store-up-to size)))
                            (slurp a-file))
              :top_path   top-path
              :rel_path   (string/replace path top-path "")
              :rel_parent (string/replace parent top-path "")}
             (catch FileNotFoundException _
               nil)
             (catch AssertionError ae
               (throw (ex-info "Argument error." {:spec spec :path path} ae)))
             (catch Exception ex
               (throw (ex-info "Entry io error." {:spec spec :path path} ex))))))))

(defn entry
  "Entry be like:

    {:tag String
     :top-path String | File
     :hash-bytes positive integer | :full | :skip
     :store-up-to positive integer | :full | :skip }

  The top-path will be removed from CanonicalPath of file and field calculated
  this way will be used for tag vs tag comparison in the db (relative path).

  Initially this is only planned for small (<16KiB) files so we are also trying
  to capture contents of said files (as text!) and SHA1 to that size limit.
  You can parametrize both settings separately, but please be vary of using
  :full on uncertain inputs. It really works best for wee files.
  "
  ([]
   (map->Entry {:hash-bytes 16384 :store-up-to 16384}))
  ([spec]
   {:post [(or (nil? (:tag %)) (string? (:tag %)))
           (or (nil? (:top-path %)) (string? (:top-path %)))
           (or (= :full (:hash-bytes %))
               (= :skip (:hash-bytes %))
               (and (number? (:hash-bytes %))
                    (pos? (:hash-bytes %))))
           (or (= :full (:store-up-to %))
               (= :skip (:store-up-to %))
               (and (number? (:store-up-to %))
                    (pos? (:store-up-to %))))]}
   (into (entry) spec)))


;; ## Persisting index

(defrecord Index [part-size processor]
  io/Input
  (in! [spec db]
    (processor (list-index db {})))
  io/Output
  (sink [spec db]
    (assert (pos? part-size) "Index requires positive num as part-size.")
    (fn persistor-fn [a-seq]
      (let [parts   (->> a-seq
                         (filter (complement nil?))
                         (partition part-size part-size nil))
            row     (partial util/de-recordify [:tag :short_hash :size
                                                :hashed_b :stored_b :write_time
                                                :path :name :parent :content
                                                :top_path :rel_path
                                                :rel_parent])
            insert! (comp #(Long/valueOf ^Long %)
                          (partial insert-entries<! db)
                          (partial assoc {} :entries)
                          (partial mapv row))
            ]
        (io! "Database mutation like a bawss!"
             (loop [total (Long/valueOf 0) chunks parts]
               (if-not (seq chunks)
                 total
                 (recur (+ (insert! (first chunks)) total)
                        (rest chunks)))))))))

(defn index
  "Index be like:

    {:part-size positive integer
     :processor some-seq-eating-fn}
  "
  ([]
   (map->Index {:part-size 128 :processor count}))
  ([spec]
   {:post [(pos? (:part-size %))]}
   (into (index) spec)))


;; ## Processing, file system to database

(defrecord Harvest [tag
                    pre-select-fn
                    select-fn
                    hash-bytes
                    store-up-to
                    part-size
                    processor]
  io/Input
  (in! [spec db]
    (processor (list-index-by-tag db {:tag tag})))
  io/Output
  (sink [spec db]
    (assert tag "Harvest requires tag.")
    (assert db "Harvest requires db.")
    (fn harvester-fn [root]
      (let [abs-path (.getCanonicalPath (clj.io/as-file root))
            e-fn     (partial io/in! (entry {:tag tag
                                             :top-path abs-path
                                             :hash-bytes hash-bytes
                                             :store-up-to store-up-to}))
            i-fn    (io/sink (index {:part-size part-size}) db)
            pipe    (comp i-fn
                          (partial filter (complement nil?))
                          (partial map e-fn)
                          (partial filter select-fn))]
        (io/in! (io/files {:processor pipe :follow? pre-select-fn}) root)))))

(defn harvest
  "Harvest be like:

    {:tag String
     :pre-select-fn (fn java.io.File) -> bool
     :select-fn (fn java.io.File) -> bool
     :hash-bytes positive integer | :full | :skip
     :store-up-to positive integer | :full | :skip
     :part-size positive integer}
  "
  ([]
   (map->Harvest {:pre-select-fn (constantly true)
                  :select-fn of-interest?
                  :hash-bytes 16384
                  :store-up-to 16384
                  :part-size 128
                  :processor count}))
  ([spec]
   {:post [(or (nil? (:tag %)) (string? (:tag %)))
           (or (nil? (:top-path %)) (string? (:top-path %)))
           (or (= :full (:hash-bytes %))
               (= :skip (:hash-bytes %))
               (and (number? (:hash-bytes %))
                    (pos? (:hash-bytes %))))
           (or (= :full (:store-up-to %))
               (= :skip (:store-up-to %))
               (and (number? (:store-up-to %))
                    (pos? (:store-up-to %))))
           (pos? (:part-size %))]}
   (into (harvest) spec)))


;; ## Reading back from database

(defn fetch-entry
  "Get db connection, tag and path/rel_path, return entry with content
  split into vector of lines (if content captured, otherwise vector of single,
  empty string).
  "
  [db tag path]
  (jdbc/with-db-transaction [tx db]
    (when-let [entry (entry-by-tag-path tx {:tag tag :path path})]
      (if (nil? (:content entry))
        (assoc entry :content [""])
        (-> entry
            (h2/de-clob [:content])
            (update-in [:content] (fn [c] (string/split (str c) #"\r?\n"))))))))

(defn entry-diff
  "Compare contents of a file between tags: changed lines, delta and unidiff
  based patch. {:changed x :deltas x :patch x}

  * db connection, source-tag and target-tag are used to lookup entry
  of rel-path. Tag + path combo used for lookup.
  "
  [db source-tag target-tag rel-path]
  (let [source   (fetch-entry db source-tag rel-path)
        target   (fetch-entry db target-tag rel-path)
        lines    (filterv (comp (partial not= :equal) :tag)
                          (diff/line-maps (:content source) (:content target)))
        deltas   (diff/delta-maps (:content source) (:content target))
        a-patche (diff/->Patch (:content source) (:content target))
        fmt      (comp (partial apply (partial format "%s\t''%s''\t%s@%s"))
                       (juxt :rel_path :write_time :tag :fetched_at))
        a-diffie (diff/unidiff (fmt source)
                               (fmt target)
                               (:content source)
                               a-patche
                               1)]
    {:changed lines
     :deltas  deltas
     :patch   (string/join "\r\n" a-diffie)}))

(defn tag-match
  "Returns a hash-map of rel_path := tag-v-tag summary.

  * db is a database connection
  * source-tag and target-tag to generate the differences.
  "
  [db source-tag target-tag]
  (->> (tag-v-tag db {:tag1 source-tag :tag2 target-tag})
       (mapv (juxt :rel_path identity))
       (into (hash-map))))

(defn tag-match-manifest
  "Generate a manifest of all files from source and target tags, with last
  modification date for entries.

  * tag-diffs is a result of tag-diff
  * advisor is a function of tag-diffs entry, that will suggest action.
  "
  [tag-matches]
  (->> (vals tag-matches)
       (mapv (juxt :rel_path
                   :presence
                   (comp str :source_time)
                   (comp str :target_time)
                   (comp str #(= (:source_hash %) (:target_hash %)))
                   (comp str :source_hash)
                   (comp str :target_hash)))
       (sort-by first)
       (cons ["Relative Path" "Present In" "Source TS" "Target TS"
              "Hash Match?" "Source Hash" "Target Hash"])))

(defn tag-diff
  "Returns a hash-map of rel_path := difference summary.

  * db is a database connection
  * source-tag and target-tag to generate the differences.
  "
  [db source-tag target-tag]
  (let [differ (partial entry-diff db source-tag target-tag)]
    (->> (tag-v-tag db {:tag1 source-tag :tag2 target-tag})
         (mapv (comp (partial apply merge)
                     (juxt identity (comp differ :rel_path))))
         (mapv (juxt :rel_path identity))
         (into (hash-map)))))

(defn tag-diff-manifest
  "Generate a manifest of all files from source and target tags, with last
  modification date for entries.

  * tag-diffs is a result of tag-diff
  * advisor is a function of tag-diffs entry, that will suggest action.
  "
  [tag-diffs advisor]
  (->> (vals tag-diffs)
       (mapv (juxt :rel_path
                   :presence
                   (comp str not empty? :patch)
                   advisor
                   (comp str :source_time)
                   (comp str :target_time)))
       (sort-by first)
       (cons ["Relative Path" "Present In" "Diff?" "Advisory"
              "Source TS" "Target TS"])))

(defn super-good-advice
  "Get tags and entry, give Super Good Advice(TM).
  "
  [entry]
  (cond (and (= "BOTH" (:presence entry)) (empty? (:changed entry)))
        "IGNORE (Reason: No diff)"
        (= (:target_tag entry) (:presence entry))
        "INVESTIGATE (Reason: Only in target)"
        (= (:source_tag entry) (:presence entry))
        "DELETE (Reason: Not in target)"
        :else
        "INVESTIGATE (Reason: Diff in content)"))


;; ## And now some abstraction, that shall not be tested, lots of mutability.

(defprotocol IndicesAndReports
  (seed! [spec] "Create tables and indexes.")
  (sync! [spec] "Dump in-memory database into GZip file.")
  (sanitize! [spec] "Delete the GZip file.")
  (erase! [spec] "Drop tables and indexes.")
  (harvest! [spec tag] "Harvests tag, returns a future.")
  (harvest-all! [spec] "Clear and run all harvests, returns a future.")
  (summary! [spec] "Return summary from spec.")
  (write-matches! [spec definition]"Get [tag1 tag2], write matches.tsv.")
  (write-manifest! [spec definition advisor] [spec definition]
                   "Get [tag1 tag2], write manifest.tsv")
  (write-html-diff! [spec definition] "Get [tag1 tag2], write diff.html")
  (write-all-matches! [spec] "Execute all matches.")
  (write-all-manifests! [spec advisor] [spec] "Execute all manifests.")
  (write-all-html-diffs! [spec] "Execute all diffs.")
  (write-all! [spec advisor] [spec] "Do everything. EVERYTHING."))

;; ## We're soooo imperative, boohoo. And it gets worse as we go!

(defn kickstart!
  "Check if exists or create in memory H2 instance."
  [db-path]
  (try (h2/conn! db-path)
       (catch Exception _
         (h2/make! (h2/spec db-path {:method :mem})
                   db-path))))

;; ## Showdown at House of...

(defrecord BlueLeaves [proto-harvest
                       db-path
                       gz-path
                       styles
                       tags-paths
                       matches
                       manifests
                       html-diffs]

  IndicesAndReports

  (seed! [spec]
    (assert db-path "BlueLeaves requires db-path.")
    (assert gz-path "BlueLeaves requires gz-path.")
    (io! "And lots of it!"
         (kickstart! db-path)
         (try (tag-counts (h2/conn! db-path) {})
              (catch Exception _
                (if (.exists ^java.io.File (clj.io/as-file gz-path))
                  (do (try (log/info "Trying to repopulate from GZip.")
                           (h2/pump! (h2/conn! db-path) gz-path true)
                           (catch Exception _
                             (log/info "Failed! Trying to create fi table!")
                             (create-table! (h2/conn! db-path) {})
                             (log/info "Trying to index fi table!")
                             (index-table! (h2/conn! db-path) {}))))
                  (do (log/info "Trying to create fi table!")
                      (create-table! (h2/conn! db-path) {})
                      (log/info "Trying to index fi table!")
                      (index-table! (h2/conn! db-path) {})))))))

  (sync! [spec]
    (assert db-path "BlueLeaves requires db-path.")
    (assert gz-path "BlueLeaves requires gz-path.")
    (io! "And lots of it!"
         (kickstart! db-path)
         (h2/dump! (h2/conn! db-path) gz-path true)))

  (sanitize! [spec]
    (assert gz-path "BlueLeaves requires gz-path.")
    (io! "A little bit of it."
         (.delete ^File (clj.io/as-file (:gz-path spec)))))

  (erase! [spec]
    (assert db-path "BlueLeaves requires db-path.")
    (io! "And lots of it!"
         (kickstart! db-path)
         (h2/raze! (h2/conn! db-path))))

  (harvest! [spec tag]
    (assert db-path "BlueLeaves requires db-path.")
    (future
      (io! "And lots of it!"
           (seed! spec)
           (let [path      (get tags-paths tag)
                 start     (System/nanoTime)
                 harvester (io/sink (assoc proto-harvest :tag tag)
                                    (h2/conn! db-path))]
             (log/info "Deleting tag:" tag)
             (delete-by-tag! (h2/conn! db-path) {:tag tag})
             (log/info "Harvesting tag:" tag)
             (harvester path)
             (log/info "Done tag:" tag)
             {:tag tag
              :time (/ (double (- (System/nanoTime) start)) 1e9)
              :items (:count (tag-count (h2/conn! db-path) {:tag tag}))}))))

  (harvest-all! [spec]
    (assert db-path "BlueLeaves requires db-path.")
    (assert gz-path "BlueLeaves requires gz-path.")
    (future
      (io! "And lots of it!"
           (erase! spec)
           (sanitize! spec)
           (seed! spec)
           (deindex-table! (h2/conn! db-path) {})
           (let [jobs (mapv (partial harvest! spec) (keys tags-paths))
                 outs (mapv deref jobs)]
             (log/info "Done harvesting all!")
             (index-table! (h2/conn! db-path) {})
             (log/info "Dumping at:" gz-path)
             (sync! spec)
             outs))))

  (summary! [spec]
    (assert db-path "BlueLeaves requires db-path.")
    (seed! spec)
    (->> (tag-counts (h2/conn! db-path) {})
         (clojure.pprint/print-table [:tag :count])))

  (write-matches! [spec [source target :as definition]]
    (assert db-path "BlueLeaves requires db-path.")
    (seed! spec)
    (io! "And lots of it!"
         (log/info "Producing matches for:" source "->" target)
         (let [t-m  (tag-match (h2/conn! db-path) source target)
               path (get matches definition)]
           (assert path "Undefined matches!")
           ((io/sink (io/tsv) path) (tag-match-manifest t-m))
           (log/info "Wrote matches to:" path))))

  (write-manifest! [spec [source target :as definition] advisor]
    (assert db-path "BlueLeaves requires db-path.")
    (seed! spec)
    (io! "And lots of it!"
         (log/info "Producing manifest for:" source "->" target)
         (let [t-d  (tag-diff (h2/conn! db-path) source target)
               path (get manifests definition)]
           (assert path "Undefined matches!")
           ((io/sink (io/tsv) path) (tag-diff-manifest t-d advisor))
           (log/info "Wrote manifest to:" path))))

  (write-manifest! [spec [source target :as definition]]
    (assert db-path "BlueLeaves requires db-path.")
    (seed! spec)
    (io! "And lots of it!"
         (let [t-d  (tag-diff (h2/conn! db-path) source target)
               path (get manifests definition)]
           (assert path "Undefined matches!")
           (log/info "Producing manifest for:" source "->" target)
           ((io/sink (io/tsv) path) (tag-diff-manifest t-d super-good-advice))
           (log/info "Wrote manifest to:" path))))

  (write-html-diff! [spec [source target :as definition]]
    (assert db-path "BlueLeaves requires db-path.")
    (letfn [(path [] (get html-diffs definition))
            (style []
              (-> (str "* {margin:0; padding:0; background:%s;"
                       "font-scale:85%%;\n"
                       "font-family:Inconsolata, 'Droid Sans Mono', monospace;}\n"
                       "h1 {font-size:3em; text-align:center; margin:1em;}\n"
                       "ul {list-style:none; margin:4em; font-size:1.2em;}\n"
                       "li {line-height:1.4; white-space:pre;}\n"
                       "hr {border:1px dotted silver;background-color:snow;}\n"
                       ".in {color:%s;}\n"
                       ".out {color:%s;}\n")
                  (format (-> spec :style (:bg "snow"))
                          (-> spec :style (:in "limegreen"))
                          (-> spec :style (:out "orangered")))))
            (page [source target content]
              (html5
                [:head
                 [:title (format "diff: %s v. %s" source target)]
                 [:meta {"charset" "utf-8"}]
                 [:style (style)]]
                [:body
                 [:h1 "diff: " [:span {"class" "out"} source]
                  " v. "
                  [:span {"class" "in"} target]]
                 [:hr]
                 content]))
            (lines-to-li [content]
              (when-not (empty? content)
                (let [lines (-> content
                                (string/escape {\< "&lt;" \> "&gt;" \& "&amp;"})
                                (string/split #"\r?\n"))]
                  [:div
                   (into [:ul] (for [line lines]
                                 (cond
                                   (= (first line) \+)
                                   [:li {"class" "in"} line]
                                   (= (first line) \-)
                                   [:li {"class" "out"} line]
                                   :else
                                   [:li line])))
                   [:hr]])))]
      (io! "And lots of it!"
           (log/info "Producing HTML diff for:" source "->" target)
           (->> (tag-diff (h2/conn! db-path) source target)
                (vals)
                (filter (comp (partial = "BOTH") :presence))
                (sort-by :rel_path)
                (map (comp lines-to-li :patch))
                (page source target)
                (spit (path)))
           (log/info "Wrote HTML diff to:" (path)))))

  (write-all-matches! [spec]
    (log/info "Writing all matches.")
    (doseq [source-target (keys matches)]
      (write-matches! spec source-target))
    (log/info "Done writing all matches."))

  (write-all-manifests! [spec advisor]
    (log/info "Writing all manifests.")
    (doseq [source-target (keys manifests)]
      (write-manifest! spec source-target advisor))
    (log/info "Done writing all manifests."))

  (write-all-manifests! [spec]
    (write-all-manifests! spec super-good-advice))

  (write-all-html-diffs! [spec]
    (log/info "Writing all diffs.")
    (doseq [source-target (keys html-diffs)]
      (write-html-diff! spec source-target))
    (log/info "Done writing all diffs."))

  (write-all! [spec advisor]
    (log/info "Writing everything!")
    (write-all-matches! spec)
    (write-all-manifests! spec advisor)
    (write-all-html-diffs! spec)
    (log/info "Wrote everything!"))

  (write-all! [spec]
    (write-all! spec super-good-advice)))

(defn blue-leaves
  "BlueLeaves be like:

    {:db-path String
     :gz-path String
     :proto-harvest Harvest
     :tags-paths {String String}
     :matches {[String String] String}
     :manifests {[String String] String}
     :html-diffs {[String String] String}}

  "
  ([]
   (map->BlueLeaves {:proto-harvest (harvest {:tag "proto-harvest"})
                     :styles {:in  "limegreen"
                              :out "orangered"
                              :bg  "snow"}
                     }))
  ([spec]
   (into (blue-leaves) spec)))

