;;; Crawler for Dianping (大众点评) merchant/shop pages
(ns com.kahui.spiders.dianping.spider
  (:import [java.util.concurrent BlockingQueue LinkedBlockingQueue]
           [java.util.concurrent.atomic AtomicBoolean AtomicInteger])
  (:require [clj-http.client :as http-client]
            [clojure.tools.logging :as logging]
            [clojure.java.io :as io]
            [clojure.data.codec.base64 :as b64]
            [com.kahui.spiders.tools.utils :as utils]))

(def default-headers
  "Browser-like HTTP request headers. down-business merges these into the
  :headers of every request it sends."
  {"User-Agent"      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36"
   "Accept"          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
   "Accept-Encoding" "gzip,deflate"})

;; :headers is intentionally not part of this map; down-business builds the
;; headers per request (default-headers plus optional proxy authorization)
;; and merges them in.
(def opts
  "Base clj-http request options shared by every request: all timeouts set to
  10000, no exception thrown for error status codes, at most 3 redirects."
  {:socket-timeout   10000
   :conn-timeout     10000
   :read-timeout     10000
   :throw-exceptions false
   :max-redirects    3})

;; Requests to Dianping carry cookies so that the crawler is less likely to be
;; blocked by the site. All vars below default to nil and are meant to be
;; rebound with `binding`.
(def ^:dynamic *my-cs*
  "Cookie store passed to clj-http as :cookie-store by down-business."
  nil)

(def ^:dynamic *pre-cookies*
  "Preset cookies; down-business sends them as :cookies when the cookie store
  is nil or still empty."
  nil)

;; Proxy pool
(def ^:dynamic *proxy-pool*
  "Pool of proxies. down-business takes one with .get and hands it back with
  .put; when nil, requests are sent without a proxy."
  nil)

;; Task bookkeeping
(def ^:dynamic *task-queue*
  "Queue onto which failed tasks are put again for another attempt."
  nil)

(def ^:dynamic *task-count*
  "Counter of outstanding tasks; incremented when a task is re-enqueued."
  nil)

;; Agent used by `retry`: the queue `.put` that re-enqueues a failed task is
;; dispatched to this agent, so it runs on an agent thread rather than on the
;; thread that called `retry`. The agent's state itself carries no data.
(defonce helper-agent (agent nil))


(defn- retry
  "Hand `task-item` back to the queue for another download attempt.

  task-item  - map with :id and :process-count (an AtomicInteger holding the
               number of attempts made so far)
  task-queue - blocking queue consumed by the workers; may be nil
  task-count - AtomicInteger counting outstanding tasks; may be nil

  Does nothing (returns nil) when `task-queue` or `task-count` is nil.
  A task whose attempt counter is greater than 10 is dropped with a warning
  instead of being re-enqueued."
  [task-item task-queue task-count]
  (let [{business-id :id process-count :process-count} task-item]
    (when (and task-queue task-count)
      (if (<= (.get process-count) 10)
        (do
          ;; Count the task as outstanding before it becomes visible to workers.
          (.incrementAndGet task-count)
          ;; `.put` blocks while the bounded queue is full. Blocking actions
          ;; belong on `send-off` (growable pool); `send` would tie up a
          ;; thread of the fixed-size agent pool.
          (send-off helper-agent
                    (fn [state]
                      (.put task-queue task-item)
                      ;; keep the agent state unchanged
                      state)))
        (logging/warn "Drop id " business-id " " "after process count" process-count)))))

(defn- create-store-file
  "Build the File under which the page of `business-id` is stored:
  <data-dir>/<id quot 10000>/<id>.html.gz

  `business-id` may be any value whose string form parses as an int.
  Only the File object is built; nothing is created on disk."
  [data-dir business-id]
  (let [id-num    (Integer/parseInt (str business-id))
        ;; ids are grouped into sub-directories of 10000 ids each
        bucket    (str (quot id-num 10000))
        file-name (str id-num ".html.gz")]
    (io/file data-dir bucket file-name)))


(defn down-business
  "Download the Dianping shop page of one task and store it gzip-compressed.

  task-item - map with :id (shop id) and :process-count (an AtomicInteger that
              is incremented on every attempt)
  data-dir  - root directory for stored pages (layout: see create-store-file)

  Reads the dynamic vars *my-cs*, *pre-cookies*, *proxy-pool*, *task-queue*
  and *task-count*. When *proxy-pool* is non-nil a proxy is taken from it for
  the request and always put back afterwards.

  Outcomes:
    - status 404: logged, not retried
    - status 200 and the body contains the expected site marker: body is
      written to the store file
    - anything else: logged, proxy error count incremented, task passed to retry
    - exception: logged, proxy error count incremented only for IOException,
      task passed to retry"
  [task-item data-dir]
  (let [{business-id :id process-count :process-count} task-item
        my-cs *my-cs*
        pre-cookies *pre-cookies*
        proxy (when-not (nil? *proxy-pool*) (.get *proxy-pool*))
        proxy-opts (if (nil? proxy) {} {:proxy-host (.getHost proxy) :proxy-port (.getPort proxy)})
        ;; each proxy carries its own session id, sent as the JSESSIONID cookie
        cookie-session-id (if (nil? proxy) {} {"JSESSIONID" {:discard true, :path "/", :value (.getSessionId proxy), :version 0}})]
    (try
      (.incrementAndGet process-count)
      (let [url (str "http://www.dianping.com/shop/" business-id)
            ;; Basic proxy authorization header, only when the proxy has both
            ;; a user and a password
            auth (when-not (nil? proxy)
                   (let [user (.getUser proxy) pass (.getPassword proxy)]
                     (if-not (or (empty? user) (empty? pass))
                       {"Proxy-Authorization" (str "Basic " (String. (b64/encode (.getBytes (str user ":" pass)))))})))
            headers {:headers (merge default-headers auth)}
            base-opts (merge opts headers {:cookie-store my-cs} proxy-opts)
            ;; explicit cookies are only sent while the cookie store is
            ;; missing or still empty
            request-opts (if (or (nil? my-cs) (= 0 (count (.getCookies my-cs))))
                           (merge base-opts {:cookies (merge pre-cookies cookie-session-id)})
                           base-opts)
            resp (http-client/get url request-opts)
            status (:status resp)
            body (:body resp)]
        (cond
          (= 404 status) (logging/info "Not found" business-id)
          ;; The page only counts as downloaded when it contains the site
          ;; marker text; otherwise it is treated as a failure and retried.
          ;; The string? guard routes a missing body to the failure branch
          ;; below instead of throwing a NullPointerException here.
          (and (= 200 status)
               (string? body)
               (>= (.indexOf ^String body "大众点评网") 0))
          (let [f (create-store-file data-dir business-id)]
            (io/make-parents f)
            (utils/write-gzip-file f body))
          :else (do
                  (logging/warn "id:" (str business-id) "," "proxy:" proxy "," "resp:" (str resp))
                  (when-not (nil? proxy)
                    (.incrementErrorCount proxy))
                  (retry task-item *task-queue* *task-count*)))
        ;; Without a proxy, throttle: pause 5000-5100 ms before returning.
        ;; Coerce to long explicitly so the call always resolves to
        ;; Thread.sleep(long), regardless of other one-argument overloads.
        (when (nil? proxy)
          (Thread/sleep (long (+ 5000 (* (Math/random) 100))))))
      (catch Exception e
        (logging/error e "id:" (str business-id) "," "proxy:" proxy)
        ;; only I/O failures are counted against the proxy
        (when (and (instance? java.io.IOException e)
                   (not (nil? proxy)))
          (.incrementErrorCount proxy))
        (retry task-item *task-queue* *task-count*))
      (finally
        ;; always hand the proxy back to the pool
        (when-not (nil? proxy)
          (.put *proxy-pool* proxy))))))

(defn batch-down
  "Download the shop pages for all `ids` and block until every task is done.

  ids      - seq of shop ids
  threads  - forwarded to utils/agent-batch-process (presumably the number of
             workers; confirm in utils)
  cookie   - optional raw cookie string with entries separated by a semicolon;
             nil or blank means no preset cookies
  data-dir - root directory for stored pages

  Ids whose store file already exists and is larger than 1024 bytes are
  skipped. Every task runs down-business with a fresh cookie store and the
  shared queue/counter bound, so failed tasks can be re-enqueued. Once the
  outstanding-task counter reaches zero the workers' running flag is cleared;
  this also happens when enqueueing or waiting throws, so workers are not
  left running after a failure."
  [ids threads cookie data-dir]
  (logging/info "*proxy-poll*" *proxy-pool*)
  (let [queue (LinkedBlockingQueue. 100)
        task-count (AtomicInteger. 0)
        pre-cookies (when (and (not (nil? cookie)) (pos? (count (.trim cookie))))
                      (clj-http.cookies/decode-cookies (into [] (.split (.trim cookie) ";"))))
        worker (fn [task-item]
                 (binding [*my-cs* (clj-http.cookies/cookie-store)
                           *pre-cookies* pre-cookies
                           *task-queue* queue
                           *task-count* task-count]
                   (try
                     (down-business task-item data-dir)
                     (finally
                       ;; the task is finished (or re-counted by retry)
                       (.decrementAndGet task-count)))))
        ;; NOTE(review): :running is expected to be a mutable flag with a
        ;; .set method (e.g. AtomicBoolean) - confirm in utils
        agent-running (:running (utils/agent-batch-process threads worker queue))]
    (try
      (doseq [id ids]
        (let [store-file (create-store-file data-dir id)]
          ;; skip ids whose file already exists and is larger than 1K
          (if (and (.exists store-file) (> (.length store-file) 1024))
            (logging/info "Skip id " id ",because exists")
            (do
              ;; count first, then publish - same order as in retry
              (.incrementAndGet task-count)
              (.put queue {:id id :process-count (AtomicInteger. 0)})))))
      ;; wait until every task, including retries, has been processed
      (while (not= (.get task-count) 0)
        (logging/info "task-count" (.get task-count))
        (Thread/sleep 1000))
      (finally
        ;; stop the workers even if enqueueing or waiting failed
        (.set agent-running false)
        (Thread/sleep 1000)))))
