# Note: by default the model file for the R model is /tmp/models
# Build model: ruby system_model.rb -build (-m<model file to output>) (-r<ratio of neg to positive>) <training data>
# Run model: ruby system_model.rb -classify (-m<model file to use>) <test data>
#    this will output a 0 or 1 for each candidate, in order
# The entry point at the bottom of this file handles: -train <model file> <training data> <ratio of neg to positive>

require "csv"
require "shellwords"

# Default path for model files.
@root = "/tmp/models"

# Builds the (optionally under-sampled) training files for a model.
#
# file_train must be a comma-separated file with one header line. Data rows
# are classified by their first field: rows starting with "1," are positives
# and rows starting with "0," are negatives. The first four fields of every
# line are left out of the feature files.
#
# When there are more than neg_pos_limit negatives per positive, a random
# subset of the negatives is kept so that at most neg_pos_limit * positives
# negatives remain; positives are always kept.
#
# Files written, each named file_model plus the suffix:
#   .header            header line, fields 5 onward
#   .all_train         every data row (header removed)
#   .all_train.random  the data rows in random order
#   .all_train.sorted  shuffled rows, negatives first, positives last
#   .train             the sampled rows
#   .train_x           .header followed by fields 5 onward of .train
#   .train_y           field 1 (the label) of .train
#
# file_model    - path prefix for all generated files
# file_train    - path of the labelled training CSV
# neg_pos_limit - maximum negatives per positive; nil falls back to 1.5
#
# Raises ArgumentError when file_train has no positive rows, and the usual
# Errno errors when file_train cannot be read.
def gen_train_file(file_model, file_train, neg_pos_limit=1.5)
  neg_pos_limit ||= 1.5

  # Count rows in Ruby rather than with `wc`: `wc` misses a final line that
  # has no trailing newline, which used to drop a data row.
  train_length = 0
  pos = 0
  File.foreach(file_train).with_index do |line, index|
    next if index.zero? # header line

    train_length += 1
    pos += 1 if line.start_with?("1,")
  end
  raise ArgumentError, "no positive ('1,') rows found in #{file_train}" if pos.zero?

  neg = train_length - pos
  # Float division: integer division would truncate 5/3 to 1 and skip the
  # under-sampling even though the real ratio exceeds the limit.
  ratio = neg.to_f / pos
  STDERR.puts "neg to pos ratio:  #{neg}/#{pos}=#{ratio.round(2)}"

  sample = train_length
  if ratio > neg_pos_limit
    sample = pos + (pos * neg_pos_limit).floor
    STDERR.puts "** Reducing neg size to #{neg_pos_limit}x of pos **"
  end
  STDERR.puts "sample: #{sample}"

  # Escape every path so spaces or shell metacharacters cannot break (or
  # inject into) the commands below.
  src = Shellwords.escape(file_train)
  out = lambda { |suffix| Shellwords.escape("#{file_model}#{suffix}") }
  header   = out.call(".header")
  all_rows = out.call(".all_train")
  shuffled = out.call(".all_train.random")
  sorted   = out.call(".all_train.sorted")
  train    = out.call(".train")
  train_x  = out.call(".train_x")
  train_y  = out.call(".train_y")

  # Feature names: the header without the first four fields.
  `head -n 1 #{src} | cut -d, -f5- > #{header}`
  # All data rows, i.e. everything after the header line.
  `tail -n +2 #{src} > #{all_rows}`
  # Shuffle before under-sampling so the negatives that survive are random.
  `awk 'BEGIN { srand() } { print rand() "\t" $0 }' #{all_rows} | sort -n | cut -f2- > #{shuffled}`

  # Negatives first, positives last: taking the tail keeps every positive and
  # trims only negatives.
  `grep '^0,' #{shuffled} > #{sorted}`
  `grep '^1,' #{shuffled} >> #{sorted}`

  `tail -n #{sample} #{sorted} > #{train}`
  `cat #{header} > #{train_x}`
  `cut -d, -f5- #{train} >> #{train_x}`
  `cut -d, -f1 #{train} > #{train_y}`
end

# Command-line entry point, run only when this file is executed directly:
#   -train <model file> <training data> [<ratio of neg to positive>]
if $PROGRAM_NAME == __FILE__
  if ARGV[0] == "-train"
    model_file = ARGV[1]
    train_file = ARGV[2]
    if model_file.nil? || train_file.nil?
      # Without both paths the shell commands would run with empty file names.
      abort "usage: ruby #{$PROGRAM_NAME} -train <model file> <training data> [<ratio of neg to positive>]"
    end

    # Pass nil, not nil.to_f (0.0), when the ratio is omitted: 0.0 is truthy
    # in Ruby, so it would bypass gen_train_file's default and discard every
    # negative row.
    ratio = ARGV[3] ? ARGV[3].to_f : nil
    gen_train_file(model_file, train_file, ratio)
  end
end
