package com.ifesdjeen.cascading.cassandra;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;

import org.apache.cassandra.thrift.*;
import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.db.Column;
import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * This counts the occurrences of words in ColumnFamily Standard1, that has a single column (that we care about)
 * "text" containing a sequence of words.
 * <p/>
 * For each word, we output the total number of occurrences across all texts.
 * <p/>
 * When outputting to Cassandra, we write the word counts as a {word, count} column/value pair,
 * with a row key equal to the name of the source column we read the words from.
 */
public class CountTest extends Configured implements Tool {
  private static final Logger logger = LoggerFactory.getLogger(CountTest.class);

  static final String KEYSPACE = "cascading_cassandra";
  static final String COLUMN_FAMILY = "libraries";

  static final String OUTPUT_REDUCER_VAR = "output_reducer";
  static final String OUTPUT_COLUMN_FAMILY = "output_words";
  private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";

  private static final String CONF_COLUMN_NAME = "application";

  public static void main(String[] args) throws Exception {
    // Let ToolRunner handle generic command-line options
    ToolRunner.run(new Configuration(), new CountTest(), args);
    System.exit(0);
  }

  public static class TokenizerMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, Column>, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private ByteBuffer sourceColumn;

    protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
            throws IOException, InterruptedException {
    }

    public void map(ByteBuffer key, SortedMap<ByteBuffer, Column> columns, Context context) throws IOException, InterruptedException {
      logger.info("yo here");
      word.set("some word");
      context.write(word, one);
    }
  }

  public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
      logger.info("One");
      Text a = new Text();
      a.set("asdasd");
      for (IntWritable w : values) {
        context.write(a, new IntWritable(1));
      }
    }
  }

  public int run(String[] args) throws Exception {
    String outputReducerType = "filesystem";

    // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better
    ConfigHelper.setRangeBatchSize(getConf(), 50);
    ConfigHelper.setInputSplitSize(getConf(), 50);
    String columnName = CONF_COLUMN_NAME;

    Job job = new Job(getConf(), "wordcount");
    job.setJarByClass(CountTest.class);
    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(ReducerToFilesystem.class);
    job.setReducerClass(ReducerToFilesystem.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));

    job.setInputFormatClass(ColumnFamilyInputFormat.class);

    ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
    ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.60.15");
    ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
    SlicePredicate predicate = new SlicePredicate().setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

    // this will cause the predicate to be ignored in favor of scanning everything as a wide row
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);

    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
    ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");

    job.waitForCompletion(true);
    return 0;
  }
}
