;;; Parses Dianping merchant (shop) pages, extracts their data, and emits it as JSON.
(ns com.kahui.spiders.dianping.mr.parser
  (:require [clojure.string :as str]
            [clojure.data.json :as json]
            [clojure.core.reducers :as r]
            [clojure.tools.logging :as logging]
            [parkour.conf :as conf]
            [parkour.fs :as fs]
            [parkour.mapreduce :as mr]
            [parkour.graph :as pg]
            [parkour.tool :as tool]
            [parkour.io.text :as text]
            [parkour.io.dseq :as dseq]
            [parkour.io.dux :as dux]
            [parkour.io.avro :as avro]
            [com.kahui.spiders.dianping.jsoup-parser :as jsoup-parser]
            [com.kahui.spiders.dianping.tools :as tools])
  (:import [org.apache.hadoop.io Text LongWritable NullWritable]
           [com.cdg.kahui.avro MergedFile])
  (:gen-class))

;; Status codes used as the intermediate MapReduce key for each parsed page.
;; They are strings because they are emitted as the first element of the
;; [status json] pairs produced by `extract`.
(defonce close-status "1")
(defonce open-status "2")
(defonce pause-status "3")
(defonce error-status "4")

(def statuses
  "Maps each status code string to the keyword naming the output it is
  written to."
  {close-status :close
   open-status  :open
   pause-status :pause
   error-status :error})

(defn gen-error-source
  "Builds the error-bucket record for a page that could not be classified.

  Returns a two-element vector: `error-status` followed by a JSON object
  string holding `:source` (the stringified `input`) and `:name`."
  [input name]
  (let [payload {:source (str input)
                 :name   name}]
    [error-status (json/write-str payload)]))

(defn extract
  "Parses one merged-file record into a `[status json-string]` pair.

  `input` is a map-like record with `:content` (the raw page bytes, decoded
  as UTF-8) and `:name` (used for logging and error records).

  Returns:
  - nil when `:content` is missing;
  - `[pause-status|close-status|open-status json]` when the page yields a
    shop id and a recognisable pause/close flag;
  - the result of `gen-error-source` (keyed by `error-status`) when the shop
    id is missing, the flags are not recognisable, or parsing throws."
  [input]
  (when-let [content (:content input)]
    (let [name (:name input)
          ;; The ^bytes hint lets the compiler resolve String(byte[], String)
          ;; statically instead of reflecting on every record. The constructor
          ;; never returns nil, so no truthiness check is needed afterwards.
          input-text (String. ^bytes content "UTF-8")]
      (try
        (let [json-obj (jsoup-parser/parse-all input-text)
              shop-id (-> json-obj :script-data :shopID)
              close (:close json-obj)
              pause (:pause json-obj)
              loc (tools/parse-dp-store-poi json-obj)
              ;; Serialised before the status is decided so that a
              ;; serialisation failure also lands in the catch below.
              json-str (json/write-str (merge json-obj {:loc loc}))
              ;; `=` (not `true?`/`false?`) is kept on purpose: it also
              ;; matches non-canonical Boolean instances.
              target (cond
                       (nil? shop-id) error-status
                       (= true pause) pause-status
                       (= true close) close-status
                       (= false close) open-status
                       :else error-status)]
          (if (= error-status target)
            (gen-error-source input-text name)
            [target json-str]))
        (catch Exception e
          (logging/error e "name:" name)
          (gen-error-source input-text name))))))

(defn mapper
  "Map task: runs `extract` over every input element and drops results that
  are nil or whose first element (the status key) is nil.

  The `::mr/source-as :keys` metadata asks Parkour to hand this function the
  keys of the input tuples."
  {::mr/source-as :keys}
  [input]
  (let [unusable? (fn [pair]
                    (or (nil? pair)
                        (nil? (first pair))))]
    (r/remove unusable? (r/map extract input))))

(defn reducer
  "Reduce task using the `mr/contextfn` adapter: given `conf` (unused here),
  returns a function of the task context and its input.

  For every key group, each value is written through `dux/write` to the
  output named by `(statuses key)`, with a `NullWritable` key so that only
  the value is emitted."
  {::mr/adapter mr/contextfn}
  [conf]
  (fn [context input]
    (doseq [[status-code json-strs] (mr/keyvalgroups input)
            json-str json-strs]
      (dux/write context (statuses status-code) (NullWritable/get) json-str))))

(defn parse-html
  "Configures and runs the \"parse-html\" job graph.

  `conf`    - job configuration; mutated in place with the settings below.
  `workdir` - output root; results go to the `open`, `close`, `pause` and
              `error` sub-paths as text.
  `lines`   - input dseq fed to `mapper`.

  Returns the first element of the result of `pg/execute`."
  [conf workdir lines]
  (let [out-path (fs/path workdir)
        open-dsink (text/dsink (fs/path out-path "open"))
        close-dsink (text/dsink (fs/path out-path "close"))
        pause-dsink (text/dsink (fs/path out-path "pause"))
        error-dsink (text/dsink (fs/path out-path "error"))]
    ;; Four reducers, matching the four status keys. The JVM-reuse key was
    ;; previously misspelled without the trailing "s", so Hadoop ignored it;
    ;; -1 means a task JVM may be reused without limit.
    (conf/assoc! conf
                 "mapred.reduce.tasks" 4
                 "mapred.job.reuse.jvm.num.tasks" -1)
    (-> (pg/input lines)
      (pg/map #'mapper)
      (pg/partition [Text Text])
      (pg/reduce #'reducer)
      (pg/output :open open-dsink
                 :close close-dsink
                 :pause pause-dsink
                 :error error-dsink)
      (pg/execute conf "parse-html")
      first)))

(defn tool
  "Tool entry point invoked via `tool/run`.

  The first positional argument is the output working directory; all
  remaining arguments are input paths, read as Avro data using the
  `MergedFile` class schema. Delegates to `parse-html`."
  [conf & args]
  (let [workdir (first args)
        inpaths (next args)
        schema  (MergedFile/getClassSchema)
        lines   (apply avro/dseq [schema] inpaths)]
    (parse-html conf workdir lines)))

(defn -main
  "Command-line entry point: runs `tool` through `tool/run` with the given
  arguments and exits the JVM with the value it returns."
  [& args]
  (let [exit-code (tool/run tool args)]
    (System/exit exit-code)))
