;;; Count the number of lines (input records).
(ns com.kahui.spiders.dianping.mr.countline
  (:require [clojure.string :as str]
            [clojure.data.json :as json]
            [clojure.core.reducers :as r]
            [clojure.tools.logging :as logging]
            [parkour.conf :as conf]
            [parkour.fs :as fs]
            [parkour.mapreduce :as mr]
            [parkour.graph :as pg]
            [parkour.tool :as tool]
            [parkour.io (text :as text) (seqf :as seqf) (dseq :as dseq) (dseq :as dseq) (avro :as avro)])
  (:import [org.apache.hadoop.io Text LongWritable NullWritable]
           [com.cdg.kahui.avro MergedFile]
           [org.apache.hadoop.io LongWritable])
  (:gen-class))


(defn mapper
  "Map task: emits one `[\"count\" 1]` pair for every element of `input`.

  The content of each element is ignored; only the number of elements
  matters. The `::mr/source-as :vals` metadata is the Parkour hint selecting
  how the task input is presented (presumably values only -- confirm against
  the Parkour docs)."
  {::mr/source-as :vals}
  [input]
  ;; Every record contributes the same constant pair under the shared key.
  (r/map (constantly ["count" 1]) input))

(defn reducer
  "Reduce task: collapses each `[key values]` group in `input` into
  `[key sum-of-values]`.

  An empty group sums to 0. Because the output has the same shape as a
  single mapper pair, the function can be applied repeatedly to partial sums."
  {::mr/source-as :keyvalgroups}
  [input]
  (letfn [(sum-group [[k counts]]
            [k (reduce + 0 counts)])]
    (r/map sum-group input)))

(defn count-line
  "Builds and executes the counting job over `lines`.

  Arguments:
    conf    - job configuration; it is mutated in place via `conf/assoc!`.
    workdir - base directory; the result is written as text to the
              \"count\" path beneath it.
    lines   - the input handed to `pg/input`.

  Returns the first element of whatever `pg/execute` returns
  (presumably the dseq for the single output sink -- confirm against
  the Parkour docs)."
  [conf workdir lines]
  (let [out-path (fs/path workdir "count")
        out-dsink (text/dsink out-path)]
    ;; One reducer so the whole total lands in a single output record.
    ;; The JVM-reuse property is spelled `...num.tasks` (plural) in Hadoop;
    ;; the former `...num.task` key was silently ignored. -1 means no limit
    ;; on the number of tasks run per JVM.
    (conf/assoc! conf
                 "mapred.reduce.tasks" 1
                 "mapred.job.reuse.jvm.num.tasks" -1)
    (-> (pg/input lines)
        (pg/map #'mapper)
        (pg/partition [Text LongWritable])
        ;; The same summing function serves as combiner and final reducer.
        (pg/combine #'reducer)
        (pg/reduce #'reducer)
        (pg/output out-dsink)
        (pg/execute conf "count")
        first)))

(defn tool
  "Tool entry point: the first element of `args` is the working directory
  and the remaining elements are the input paths.

  Builds an Avro dseq over the input paths using the `MergedFile` class
  schema and passes it to `count-line`, returning that call's result."
  [conf & args]
  (let [workdir (first args)
        inpaths (next args)
        schema  (MergedFile/getClassSchema)
        lines   (apply avro/dseq [schema] inpaths)]
    (count-line conf workdir lines)))

(defn -main
  "Command-line entry point: runs `tool` through `tool/run` with the
  command-line `args`, then exits the JVM with the value it returns."
  [& args]
  (let [exit-code (tool/run tool args)]
    (System/exit exit-code)))
