(ns com.kahui.spiders.dianping.mr.text
  (:require [parkour (conf :as conf) (fs :as fs)]
            [parkour.io (dseq :as dseq) (dsink :as dsink)])
  (:import [org.apache.hadoop.mapreduce Job]
           [org.apache.hadoop.mapreduce.lib.input FileInputFormat]
           [org.apache.hadoop.mapreduce.lib.input TextInputFormat]
           [org.apache.hadoop.mapreduce.lib.output TextOutputFormat]
           [org.apache.hadoop.mapreduce.lib.output FileOutputFormat]
           [org.apache.hadoop.io Text NullWritable]
           [com.kahui.hadoop NonSplittableTextInputFormat]))

(defn dseq-nosplit
  "Distributed sequence of text file lines read from `paths` through
`NonSplittableTextInputFormat` rather than the stock `TextInputFormat`.
Tuples are expected to consist of (file offset, text line), as with
`dseq`.

NOTE(review): judging by the class name, each input file is presumably
handled as a single unsplit unit -- confirm against
`com.kahui.hadoop.NonSplittableTextInputFormat`."
  [& paths]
  (letfn [(configure-input [^Job j]
            ;; Pick the input format first, then register the input paths;
            ;; the path array is built only when the job is configured.
            (.setInputFormatClass j NonSplittableTextInputFormat)
            (FileInputFormat/setInputPaths j (fs/path-array paths)))]
    (dseq/dseq configure-input)))

(defn dseq
  "Distributed sequence over the lines of the text files at `paths`,
read with Hadoop's `TextInputFormat`.  Each tuple consists
of (file offset, text line)."
  [& paths]
  (let [configure-input
        (fn [^Job j]
          ;; Input format first, then the input paths; the path array is
          ;; built lazily, at job-configuration time.
          (.setInputFormatClass j TextInputFormat)
          (FileInputFormat/setInputPaths j (fs/path-array paths)))]
    (dseq/dseq configure-input)))

(defn dsink
  "Distributed sink for line-delimited text output written to `path`.
Configures the job with `TextOutputFormat`, `NullWritable` output keys
and `Text` output values; the sink reads back as `(dseq path)`.
Intended to produce one line per tuple containing TAB-separated results
of invoking the `.toString` method of tuple members.

NOTE(review): with `NullWritable` keys, `TextOutputFormat` is documented
to omit the key and separator, so only the value may appear on each
line -- confirm this matches the intended output."
  [path]
  (letfn [(configure-output [^Job j]
            (.setOutputFormatClass j TextOutputFormat)
            (.setOutputKeyClass j NullWritable)
            (.setOutputValueClass j Text)
            (FileOutputFormat/setOutputPath j (fs/path path))
            ;; Hand the configured job back, as the `doto` form did.
            j)]
    (dsink/dsink (dseq path) configure-output)))