(ns deploy.component.scrape
  (:require [com.stuartsierra.component :as component]
            [deploy.serialize :refer [serialize string-serialize]]
            [ring.mock.request :as mock]
            [clojure.set :as set]
            [clojure.java.io :as io]
            [clojure.spec :as s]
            [deploy.component.config :as config]))



(defn- links
  "Extracts the link targets referenced in `body` (a string of HTML/CSS).

  Targets are taken from double-quoted `href`/`src` attributes and from
  single-quoted `url('...')` references; each target is cut at the first
  `#` or `?`. Targets that start with `http://`, `https://`, `mailto:`
  or `//` are dropped. Attribute targets come first, then `url(...)`
  targets, each group in order of appearance."
  [body]
  (let [external?  (fn [target]
                     (or (re-matches #"https?://.*" target)
                         (re-matches #"mailto:.*" target)
                         (re-matches #"//.*" target)))
        attr-hits  (re-seq #"(href|src)=\"([^\"#\?]+)[\"#\?]" body)
        style-hits (re-seq #"(url)\('([^\'#\?]+)[\'#\?]" body)]
    ;; Every match is [whole-match keyword target]; only the target is kept.
    (for [[_ _ target] (concat attr-hits style-hits)
          :when (not (external? target))]
      target)))

(defn- status-digit?
  "Returns a truthy value (the matched string) when the `:status` of
  `response`, rendered as text, is the digit `n` followed by exactly two
  more digits; returns nil otherwise."
  [n response]
  (let [pattern (re-pattern (str n "\\d\\d"))
        code    (str (:status response))]
    (re-matches pattern code)))

(defn- status
  "Classifies `resp` by the leading digit of its status code:
  2xx -> :ok, 3xx -> :redirect, 4xx -> :client-error, anything else ->
  :unexpected."
  [resp]
  (cond
    (status-digit? 2 resp) :ok
    (status-digit? 3 resp) :redirect
    (status-digit? 4 resp) :client-error
    :else                  :unexpected))

(defn- get-suffix
  "Returns the file suffix of `s` (including the leading dot), or nil when
  the last path segment has no dot.

  Only the final path segment is considered: the captured suffix may not
  contain `/` or another `.`, so a dot inside a directory name (for
  example \"/v1.0/docs/\") is not mistaken for a file extension."
  [s]
  (second (re-matches #".*(\.[^./]*)" s)))

(defn- get-links
  "Requests `uri` from `handler` via a mock GET request and returns the
  links found in the response body.

  Returns nil without calling the handler when the suffix of `uri` is
  anything other than nil, \"\" or \".html\", and returns nil when the
  response is not classified as :ok."
  [handler uri]
  (let [suffix (get-suffix uri)]
    (when (or (nil? suffix) (= "" suffix) (= ".html" suffix))
      (let [resp (handler (mock/request :get uri))]
        ;; TODO: handle redirects
        (when (= :ok (status resp))
          (links (string-serialize (:body resp))))))))

(defn- spider-step
  "Advances the crawl by one URI.

  Takes one URI out of `:tovisit`, adds it to `:visited`, and queues every
  link found on it that was not already in `:visited` before this step.
  Returns `state` unchanged when there is no URI to take."
  [{:keys [handler visited tovisit] :as state}]
  (if-let [visit-uri (first tovisit)]
    (let [discovered (set (get-links handler visit-uri))
          unseen     (set/difference discovered visited)]
      (assoc state
             :visited (conj visited visit-uri)
             ;; The current URI is removed last, so a page that links to
             ;; itself is not queued again.
             :tovisit (-> tovisit
                          (set/union unseen)
                          (set/difference #{visit-uri}))))
    state))

(defn- reachable
  "Returns the set of URIs visited when crawling `handler` starting from
  `uri`, repeating `spider-step` until `:tovisit` is empty."
  [handler uri]
  (loop [state {:handler handler
                :visited #{}
                :tovisit #{uri}}]
    (if (seq (:tovisit state))
      (recur (spider-step state))
      (:visited state))))



(defn- copy
  "Requests `uri` from `handler` and writes the serialized response body
  to a file under `destdir`.

  The output path is `destdir` followed by `uri`; a `uri` ending in `/`
  gets `index.html` appended. Parent directories are created as needed.
  Prints a line on success, and prints any `Exception` raised along the
  way together with the uri.

  NOTE(review): a non-200 status is reported through `assert`, which
  raises an AssertionError; that is not an `Exception`, so the catch
  clause below does not intercept it. Confirm whether aborting on a bad
  status is intended."
  [handler uri destdir]
  (try
    (let [resp (handler (mock/request :get uri))]
      (assert (= (:status resp) 200) uri)
      (let [relative-path (if (re-matches #".*/$" uri)
                            (str uri "index.html")
                            uri)
            outfile       (io/file (str destdir relative-path))
            data          (serialize (:body resp))]
        (io/make-parents outfile)
        (io/copy data outfile)
        (println (format "Copied %d bytes to %s" (count data) outfile))))
    (catch Exception e
      (println "Exception" e " on uri" uri))))

(defn- copy-reachable
  "Copies every URI reachable from `start-uri` through `handler` into
  `destdir`. Returns nil."
  [handler start-uri destdir]
  (run! (fn [uri] (copy handler uri destdir))
        (reachable handler start-uri)))




;; A handler is anything that can be invoked. `ifn?` is used rather than
;; `fn?` so that vars (e.g. #'app) and other IFn implementations are
;; accepted as well as plain functions.
(s/def ::handler ifn?)
;; The :load entry of the component must carry a :handler.
(s/def ::load (s/keys :req-un [::handler]))

;; The component map must carry both :common (specified in the config
;; namespace) and :load.
(s/def ::component
  (s/keys :req-un [::config/common ::load]))

;; exec* takes the component and returns nil.
(s/fdef exec*
        :args (s/cat :component ::component)
        :ret nil?)



(defn- exec
  "Scrapes everything reachable from the root URI \"/\" of `handler` into
  `destdir`."
  [handler destdir]
  (let [start-uri "/"]
    (copy-reachable handler start-uri destdir)))

(defn- exec*
  "Runs the scrape for `component`, reading the handler from
  `[:load :handler]` and the destination directory from
  `[:common :destdir]`."
  [component]
  (let [handler (get-in component [:load :handler])
        destdir (get-in component [:common :destdir])]
    (exec handler destdir)))

;; Lifecycle component whose start runs the scrape via `exec*` and whose
;; stop only logs. Both return the component itself.
(defrecord Scrape []
  component/Lifecycle
  (start [this]
    (println "[scrape] start")
    ;; The scrape runs to completion before start returns.
    (exec* this)
    this)
  (stop [this]
    (println "[scrape] stop")
    this))

(defn scrape
  "Builds a Scrape component from the optional map given as the first
  argument; any further arguments are ignored."
  [& args]
  (let [m (first args)]
    (map->Scrape m)))
