;;; Process Dianping shop (merchant) data: group shops by brand id.
(ns com.kahui.spiders.dianping.mr.groupbrand
  (:require [clojure.string :as str]
            [clojure.data.json :as json]
            [clojure.core.reducers :as r]
            [clojure.tools.logging :as logging]
            [parkour.conf :as conf]
            [parkour.fs :as fs]
            [parkour.mapreduce :as mr]
            [parkour.graph :as pg]
            [parkour.tool :as tool]
            [parkour.io.text :as text]
            [parkour.io.dseq :as dseq]
            [parkour.io.dux :as dux])
  (:import [org.apache.hadoop.io Text LongWritable NullWritable])
  (:gen-class))

(defn extract
  "Parse one JSON-encoded shop record and derive its grouping key.

  The key is the record's \"brand-id\" when it is non-nil; otherwise the
  \"shopGroupID\" found inside the \"script-data\" object is used.

  Returns a vector [brand-id-as-string input], where input is the original,
  unmodified JSON text. Returns nil when input is nil, when the record has no
  \"shopID\" under \"script-data\", or when neither id is present.

  Malformed JSON is not handled here: the exception thrown by json/read-str
  propagates to the caller."
  [input]
  (when input
    (let [json-obj (json/read-str input)
          ;; `get` returns nil for a missing key or a nil/non-map target,
          ;; whereas invoking a nil "script-data" as a function would throw.
          script-data (get json-obj "script-data")
          shop-id (get script-data "shopID")
          ;; First non-nil candidate, brand-id taking precedence.
          brand-id (first (remove nil? [(get json-obj "brand-id")
                                        (get script-data "shopGroupID")]))]
      ;; Test for nil BEFORE stringifying: (str nil) is "", which would let
      ;; records without any id through under an empty-string key.
      (when (and (not (nil? shop-id)) (not (nil? brand-id)))
        [(str brand-id) input]))))

(defn mapper
  "Map task: run `extract` over every input element and discard the nil
  results, yielding a reducible of [brand-id json-line] pairs.

  The ::mr/source-as :vals metadata presumably asks Parkour to hand this
  function only the values of the input tuples (confirm against the Parkour
  documentation)."
  {::mr/source-as :vals}
  [input]
  (r/remove nil? (r/map extract input)))

(defn reducer
  "Reduce task: for each [brand-id records] group, parse every record as
  JSON, collect the parsed records into a vector, and emit
  [brand-id json-array-string].

  The ::mr/source-as :keyvalgroups metadata presumably makes Parkour present
  the input as key / grouped-values pairs (confirm against the Parkour
  documentation)."
  {::mr/source-as :keyvalgroups}
  [input]
  (letfn [(merge-group [[brand-id records]]
            ;; Accumulate the parsed records in arrival order.
            (let [parsed (into [] (r/map json/read-str records))]
              [brand-id (json/write-str parsed)]))]
    (r/map merge-group input)))

(defn group-brand
  "Build and execute the job that groups shop records by brand id.

  conf    - job configuration; mutated in place via conf/assoc!.
  workdir - base output location; results are written through a text sink
            at the \"group-brand\" path beneath it.
  lines   - input passed to pg/input (the caller in this file supplies a
            text dseq).

  Returns the first element of the value returned by pg/execute."
  [conf workdir lines]
  (let [out-path (fs/path workdir)
        dsink (text/dsink (fs/path out-path "group-brand"))]
    ;; The JVM-reuse property name ends in \"tasks\"; the previous spelling
    ;; (\"...num.task\") was not a recognised key, so the setting was ignored.
    (conf/assoc! conf
                 "mapred.reduce.tasks" 8
                 "mapred.job.reuse.jvm.num.tasks" -1)
    (-> (pg/input lines)
      (pg/map #'mapper)
      ;; Shuffle key and value are both Hadoop Text.
      (pg/partition [Text Text])
      (pg/reduce #'reducer)
      (pg/output dsink)
      (pg/execute conf "group-store-by-brandid")
      first)))

(defn tool
  "Tool entry point: the first positional argument is the work directory and
  every remaining argument is an input path. Builds a text dseq over the
  input paths and hands it to `group-brand`."
  [conf & args]
  (let [workdir (first args)
        inpaths (rest args)]
    (group-brand conf workdir (apply text/dseq inpaths))))

(defn -main
  "Command-line entry point: run `tool` through tool/run with the raw
  arguments and exit the JVM with the value it returns."
  [& args]
  (let [exit-code (tool/run tool args)]
    (System/exit exit-code)))
