;;; 对比两个批次的商户数据的变更记录
(ns com.kahui.spiders.dianping.mr.storediff
  (:require [clojure.string :as str]
            [clojure.data.json :as json]
            [clojure.core.reducers :as r]
            [clojure.tools.logging :as logging]
            [parkour.conf :as conf]
            [parkour.fs :as fs]
            [parkour.mapreduce :as mr]
            [parkour.graph :as pg]
            [parkour.tool :as tool]
            [parkour.io.text :as text]
            [parkour.io.dseq :as dseq]
            [parkour.io.dux :as dux]
            [com.kahui.spiders.dianping.tools :as tools])
  (:import [org.apache.hadoop.io Text LongWritable NullWritable]
           [org.apache.hadoop.mapreduce TaskInputOutputContext InputSplit]
           [org.apache.hadoop.mapreduce.lib.input FileSplit]
           [org.apache.hadoop.fs Path]
           [java.util.regex Pattern])
  (:gen-class))

;; Constants.
;; Configuration keys under which `tool` stores the regex patterns for old and
;; new input paths; `check-input-type` reads them back from the job conf.
(defonce old-input-pattern-key "diff.old.pattern")
(defonce new-input-pattern-key "diff.new.pattern")
;; Batch tags. `parse-input-type` stores one of these under :_type on every
;; mapped record and `diff-input` uses it to tell the two batches apart.
(defonce type-old "old")
(defonce type-new "new")

;; Caches for the compiled old/new input-path patterns. Both start out as nil
;; and are populated by `get-atom-regex` the first time `check-input-type`
;; needs them.
(def mapper-old-regex (atom nil))
(def mapper-new-regex (atom nil))

(defn get-atom-regex
  "Return the compiled pattern cached in `atom-ref`.

  When the atom still holds nil, `regex` is compiled with Pattern/compile and
  stored in the atom first. Once a pattern is cached, `regex` is ignored.

  Throws IllegalArgumentException when the atom is empty and `regex` is nil."
  [atom-ref regex]
  (when-not (some? @atom-ref)
    ;; Nothing cached yet: we need a regex source string to compile.
    (when (nil? regex)
      (throw (IllegalArgumentException. (str "No regex pattern " atom-ref))))
    (reset! atom-ref (Pattern/compile regex)))
  @atom-ref)

(defn check-input-type
  "Classify `input-path` as belonging to the old or the new batch.

  The old/new regex sources are read from `conf` (under
  `old-input-pattern-key` / `new-input-pattern-key`) and compiled once via
  `get-atom-regex`. The old pattern is tried first. Returns `type-old` or
  `type-new`; throws IllegalStateException when neither pattern is found in
  the path."
  [input-path conf]
  (let [old-pattern (get-atom-regex mapper-old-regex (.get conf old-input-pattern-key))
        new-pattern (get-atom-regex mapper-new-regex (.get conf new-input-pattern-key))
        found-in-path? (fn [pattern] (.find (.matcher pattern input-path)))]
    (if (found-in-path? old-pattern)
      type-old
      (if (found-in-path? new-pattern)
        type-new
        (throw (IllegalStateException. (str "Can't match the pattern for input path " input-path " old-pattern:" old-pattern " new-pattern:" new-pattern)))))))

(defn parse-input-type
  "Parse one JSON input line and tag it with the batch it came from.

  The batch (`type-old` / `type-new`) is derived from the path of the file
  split currently being processed, looked up through the task context bound
  in `parkour.mapreduce/*context*`.

  Returns `[shop-id json-string]`, where `json-string` is the original record
  with an extra :_type entry, or nil when the record has no
  [:script-data :shopID] value."
  [input]
  (let [;; getInputSplit is declared on MapContext, not TaskInputOutputContext;
        ;; hinting the right type avoids a reflective call for every record.
        ^org.apache.hadoop.mapreduce.MapContext context parkour.mapreduce/*context*
        ^FileSplit split (.getInputSplit context)
        ^String input-path (.toString (.getPath split))
        conf (.getConfiguration context)
        input-type (check-input-type input-path conf)
        ;; read-str with :key-fn keyword is the non-deprecated equivalent of
        ;; read-json on a string.
        json-obj (json/read-str input :key-fn keyword)
        shop-id (get-in json-obj [:script-data :shopID])]
    (when-not (nil? shop-id)
      [shop-id (json/write-str (assoc json-obj :_type input-type))])))

(defn mapper
  "Map task: turns each input value into a `[shop-id tagged-json]` pair via
  `parse-input-type`, dropping the values for which it returns nil."
  {::mr/source-as :vals}
  [input]
  (let [parsed (r/map parse-input-type input)]
    (r/filter (complement nil?) parsed)))

;; Key paths (as accepted by get-in) of the fields that `diff-input` compares
;; between the old and the new record; a difference in any one of them marks
;; the record as changed.
;; NOTE(review): :telphone is presumably the spelling used by the upstream
;; JSON records — confirm before "fixing" it.
(defonce field-ks [[:name]
                   [:telphone]
                   [:address]
                   [:script-data :cityID]
                   [:script-data :regionID]
                   [:script-data :shopGroupName]
                   [:script-data :shopGroupID]
                   [:script-data :poi]
                   [:script-data :categoryID]
                   [:brand-id]
                   [:category]
                   [:sub-category1]
                   [:sub-category2]
                   [:business-district]])

(defn diff-input
  "Detect the change between the old and the new record of one shop.

  `k` is the grouping key (unused here) and `vals` is the collection of JSON
  strings emitted by the mapper for that key, each carrying a :_type of
  `type-old` or `type-new`. If several values share a type, the last one wins.

  Returns one of:
    [:delete old-record]  only an old record exists
    [:add new-record]     only a new record exists
    [:change new-record]  both exist and differ on a path in `field-ks`
    nil                   both exist and are equal on every `field-ks` path,
                          or neither exists
  The returned record has its :_type entry removed."
  [k vals]
  (let [;; Index the parsed records by batch tag without mutable local vars.
        latest-by-type (reduce (fn [acc v]
                                 (let [record (json/read-str v :key-fn keyword)]
                                   (assoc acc (:_type record) record)))
                               {}
                               vals)
        ov (get latest-by-type type-old)
        nv (get latest-by-type type-new)
        untag (fn [record] (dissoc record :_type))]
    (cond
      ;; old present, new missing: delete
      (and (not (nil? ov)) (nil? nv)) [:delete (untag ov)]
      ;; old missing, new present: add
      (and (nil? ov) (not (nil? nv))) [:add (untag nv)]
      ;; both present: report a change only when one of the compared
      ;; fields listed in field-ks differs
      (and (not (nil? ov)) (not (nil? nv)))
      (let [old-record (untag ov)
            new-record (untag nv)
            differs? (fn [path]
                       (not= (get-in old-record path) (get-in new-record path)))]
        (when (some differs? field-ks)
          [:change new-record])))))

(defn reducer
  "Reduce task factory (parkour contextfn adapter): takes the job `conf` and
  returns a function of the task context and the grouped input.

  For every key group it runs `diff-input`; when that yields a
  `[kind record]` pair, the record is written as JSON under the key to the
  named output `kind` through dux/write. Groups without a diff are skipped."
  {::mr/adapter mr/contextfn}
  [conf]
  (fn [context input]
    (doseq [[shop-id grouped-vals] (mr/keyvalgroups input)]
      (when-let [[change-kind record] (diff-input shop-id grouped-vals)]
        (dux/write context change-kind shop-id (json/write-str record))))))

(defn mr
  "Build and execute the store-diff job graph.

  conf:    job configuration (mutated: the JVM-reuse setting is added)
  workdir: output directory; results go to the `add`, `del` and `changed`
           sub-directories as text
  lines:   input dseq of JSON lines

  Returns the first element of the result of pg/execute."
  [conf workdir lines]
  (let [out-path (fs/path workdir)
        add-sink (text/dsink (fs/path out-path "add"))
        delete-sink (text/dsink (fs/path out-path "del"))
        change-sink (text/dsink (fs/path out-path "changed"))]
    ;; The Hadoop property is spelled "...num.tasks"; the previous
    ;; "...num.task" key was not a known property, so the setting was
    ;; silently ignored. -1 requests unlimited JVM reuse.
    (conf/assoc! conf "mapred.job.reuse.jvm.num.tasks" -1)
    (->
      (pg/input lines)
      (pg/map #'mapper)
      (pg/partition [LongWritable Text])
      (pg/reduce #'reducer)
      (pg/output :add add-sink :delete delete-sink :change change-sink)
      (pg/execute conf "storediff")
      first)))

(defn tool
  "Tool entry point for the store-diff job.

  Positional arguments (after `conf`):
    workdir:           working/output directory
    old-input-pattern: regex identifying paths of the old batch
    new-input-pattern: regex identifying paths of the new batch
    input-paths:       one or more files/directories to compare

  Stores both patterns in `conf`, logs the arguments and runs `mr`.
  Throws IllegalArgumentException when a required argument is missing, so the
  mistake is reported before any job is submitted."
  [conf & args]
  (let [[workdir old-input-pattern new-input-pattern & input-paths] args]
    ;; Fail fast: without these the job would only break later, e.g. in the
    ;; mapper when no pattern can be found in the configuration.
    (when (or (nil? workdir)
              (nil? old-input-pattern)
              (nil? new-input-pattern)
              (empty? input-paths))
      (throw (IllegalArgumentException.
               "Usage: storediff <workdir> <old-input-pattern> <new-input-pattern> <input-path> [<input-path> ...]")))
    (let [lines (apply text/dseq input-paths)
          conf (conf/assoc! conf old-input-pattern-key old-input-pattern new-input-pattern-key new-input-pattern)]
      (logging/info "old-input-pattern:" old-input-pattern "new-input-pattern:" new-input-pattern "input-paths:" input-paths)
      (mr conf workdir lines))))

(defn -main
  "Command-line entry point: runs `tool` through parkour's tool/run with the
  given arguments and exits the JVM with the status it returns."
  [& args]
  (let [exit-status (tool/run tool args)]
    (System/exit exit-status)))
