(ns emplan.factor
  "Maintaining probabilities for planning and policy learning"
  (:use [embang runtime emit]))

;; Policy learning relies on the `factor' pseudo-distribution
;; to manipulate log weights.

(defdist factor
  "Parameterless pseudo-distribution used to inject log weights:
  `sample' always yields 0.0;
  `observe' hands back the observed value unchanged, so that
  value itself is used as the log-probability."
  []
  ;; sampling carries no information: constant zero
  (sample [_] 0.0)
  ;; the observed value is taken verbatim as the log-probability
  (observe [_ log-weight] log-weight))

;; Since `factor' takes no parameters, we define the singleton
;; +factor+ to avoid re-creating the distribution object.

(def +factor+
  "Shared instance of the parameterless `factor' pseudo-distribution,
  created once so that callers need not construct a new one."
  (factor))

;; The log weight of random choices is accumulated during the run
;; and used to compute the final log weight, taking the reward
;; into account.

(defm init-log-wrc
  "Resets the accumulated log weight of random choices:
  stores 0.0 under the ::log-wrc key of the program state.
  Takes no arguments."
  []
  ;; The docstring precedes the argument vector (as in observe-reward)
  ;; so that it is attached to the var instead of being evaluated as a
  ;; discarded expression in the body.
  (store ::log-wrc 0.0))

(defm add-log-wrc
  "Updates the log weight of random choices: adds the value of
  (embang.runtime/observe dist s) to the number stored under
  ::log-wrc and stores the sum back under the same key.

  dist - distribution object passed to embang.runtime/observe
  s    - value whose log weight under dist is added

  Expects ::log-wrc to hold a number already (see init-log-wrc)."
  [dist s]
  ;; NOTE(review): the runtime function is called by its fully
  ;; qualified name; presumably a bare `observe' inside a defm body
  ;; is treated as the probabilistic form rather than a plain
  ;; function call -- confirm against the embang documentation.
  (store ::log-wrc (+ (retrieve ::log-wrc)
                      (embang.runtime/observe dist s))))

(defm observe-reward
  "Observes the total reward of the policy instantiation.

  Reads the accumulated log weight of random choices from ::log-wrc
  and observes, on the +factor+ singleton, the value
  reward * exp(log-wrc) - log-wrc.

  reward - total reward of the run (a number)"
  [reward]
  (let [lw (retrieve ::log-wrc)
        ;; reward scaled by the (non-log) weight of random choices
        scaled-reward (* reward (exp lw))]
    (observe +factor+ (- scaled-reward lw))))
