(ns signal.data-platform-telemetry.metrics
  (:require [iapetos.core :as prometheus]
            [iapetos.collector.jvm :as jvm]
            [iapetos.collector.ring :as ring]
            [java-time.convert :refer [to-millis-from-epoch]]))

(defn- ms->seconds
  "Converts a value expressed in milliseconds into seconds.
   Divides by 1000.0, so the result is a double."
  [time]
  (/ time 1000.0))

(defn take-while-and-n-more
  "Returns a lazy sequence of the leading items of coll for which pred is truthy,
   followed by up to n further items starting at the first item that fails pred.
   Returns an empty sequence when coll is empty."
  [pred n coll]
  (lazy-seq
   (when-let [[head :as remaining] (seq coll)]
     (if-not (pred head)
       ;; first failing item: emit it and its successors, capped at n items
       (take n remaining)
       (cons head (take-while-and-n-more pred n (rest remaining)))))))

(defn- exponential-buckets
  "Builds a vector of bucket bounds start, start*exponent, start*exponent^2, ...
   Every bound that is <= max-value is kept, followed by the first bound that exceeds it.
   Bounds are computed with Math/pow, so they are doubles."
  [start exponent max-value]
  (let [bound-at    (fn [i] (* start (Math/pow exponent i)))
        within-max? (fn [bound] (<= bound max-value))]
    (->> (range)
         (map bound-at)
         (take-while-and-n-more within-max? 1)
         vec)))

;; Registry used by `init` when the caller does not pass a :registry option.
;; Declared ^:dynamic so it can be rebound with `binding`.
(def ^:dynamic *default-registry* prometheus/default-registry)

;; the histogram keywords contain fake namespaces to make sure they are not prefixed by "default-"
;; note that counters must end with _total to be valid in future Prometheus versions (i.e. under OpenMetrics)
(defn init
  "Initialises a Prometheus metrics registry and returns it.

   Accepts an optional options map:
     :registry - registry to initialise (defaults to *default-registry*)
     :jvm?     - when truthy (default true), applies jvm/initialize
     :ring?    - when truthy (default true), applies ring/initialize"
  ([]
   (init nil))
  ([{:keys [registry ring? jvm?]
     :or   {registry *default-registry*
            jvm?     true
            ring?    true}}]
   (cond-> registry
     jvm?  jvm/initialize
     ring? ring/initialize)))

;; Default histogram bucket bounds (seconds, per the names) used by
;; get-default-histogram-buckets:
;;  - processing-stage: read, result-write, publish and ack timings
;;  - results-stage:    process and total timings
(def ^:private default-processing-stage-buckets-seconds [0.01 0.025 0.05 0.075 0.1 0.25 0.5 1.0 2.0])
(def ^:private default-results-stage-buckets-seconds [0.1 0.25 0.5 0.75 1.0 2.0 4.0 8.0 16.0])


(defn add-exponential-buckets-past-maximum
  "Extends base-buckets with doubling buckets when max-time-seconds exceeds the
   largest base bucket.

   The extra buckets start at twice the last base bucket and double until the
   first bucket larger than max-time-seconds (inclusive). When extended, the
   result is always a vector. base-buckets is returned unchanged when
   max-time-seconds is nil or not supplied, when it does not exceed the last
   base bucket, or when base-buckets is empty (there is nothing to extend from)."
  ([base-buckets]
   base-buckets)
  ([base-buckets max-time-seconds]
   (let [largest (last base-buckets)]
     (if (and max-time-seconds
              largest ;; nil for an empty base-buckets; avoids comparing against nil
              (> max-time-seconds largest))
       (into (vec base-buckets)
             (exponential-buckets (* 2 largest) 2 max-time-seconds))
       base-buckets))))

(defn get-default-histogram-buckets
  "Returns the default bucket boundaries for every histogram in this namespace,
   keyed by the names accepted in :histogram-bucket-overrides.

   Options:
     :max-batch-size - largest expected batch size; a missing or nil value is
                       treated as 50. For 50 the batch-size buckets are the even
                       numbers 0..50, otherwise exponential buckets starting at 25.

   :pipeline-document-latency has no default (nil) and must be supplied by the user."
  [{:keys [max-batch-size]}]
  ;; `or` rather than destructuring :or, so an explicit nil also falls back to the default
  (let [max-batch-size (or max-batch-size 50)]
    {:message-read-time default-processing-stage-buckets-seconds
     :message-process-time default-results-stage-buckets-seconds
     :message-total-time default-results-stage-buckets-seconds
     :message-batch-size (if (= max-batch-size 50)
                           (vec (range 0 51 2))
                           (exponential-buckets 25 2 max-batch-size))

     :message-result-write-time default-processing-stage-buckets-seconds
     :message-publish-time default-processing-stage-buckets-seconds
     :message-ack-time default-processing-stage-buckets-seconds

     :pipeline-document-latency nil ;; needs to be set by user
     }))


;; REPL scratchpad: sample invocations of the bucket helpers.
;; Forms inside `comment` are not evaluated when the namespace loads.
(comment
  (exponential-buckets 25 2 50000)

  (add-exponential-buckets-past-maximum default-processing-stage-buckets-seconds 8))

(defn queue-reader
  "Registers queue reader metrics on the given registry and returns the result
   of the registration calls.

   Bucket boundaries come from get-default-histogram-buckets (called with config);
   entries in :histogram-bucket-overrides take precedence over those defaults.
   The :read/time histogram is registered lazily, the remaining metrics eagerly."
  [registry {:keys [histogram-bucket-overrides]
             :as config}]
  (let [{:keys [message-read-time
                message-process-time
                message-total-time
                message-batch-size]} (merge (get-default-histogram-buckets config)
                                            histogram-bucket-overrides)
        read-time    (prometheus/histogram :read/time
                                           {:buckets message-read-time
                                            :description "The time taken to read the data for the message from S3."})
        process-time (prometheus/histogram :message/process-time
                                           {:buckets message-process-time
                                            :description "The time taken to run the core process on the message in seconds."})
        total-time   (prometheus/histogram :message/total-time
                                           {:buckets message-total-time
                                            :description "The total time for message processing in seconds."})
        batch-size   (prometheus/histogram :message/batch-size
                                           {:buckets message-batch-size
                                            :description "The number of documents in the message."})
        in-flight    (prometheus/gauge :queue-worker/messages-in-flight
                                       {:description "The number of messages in flight being processed."})
        errors       (prometheus/counter :queue-worker/errors-total
                                         {:description "Counter of errors occurred when processing messages."})]
    (-> registry
        (prometheus/register-lazy read-time)
        (prometheus/register process-time total-time batch-size in-flight errors))))

(defn queue-writer
  "Registers queue writer metrics on the given registry and returns the result
   of the registration calls.

   Bucket boundaries come from get-default-histogram-buckets (called with config);
   entries in :histogram-bucket-overrides take precedence over those defaults.
   :write/time and :sqs/write-time are registered eagerly, :ack/time lazily."
  [registry {:keys [histogram-bucket-overrides]
             :as config}]
  (let [{:keys [message-result-write-time
                message-publish-time
                message-ack-time]} (merge (get-default-histogram-buckets config)
                                          histogram-bucket-overrides)
        write-time   (prometheus/histogram :write/time
                                           {:buckets message-result-write-time
                                            :description "The time taken to write the output to S3."})
        publish-time (prometheus/histogram :sqs/write-time
                                           {:buckets message-publish-time
                                            :description "The time taken to publish the output message in seconds. Named sqs for legacy compatibility."})
        ack-time     (prometheus/histogram :ack/time
                                           {:buckets message-ack-time
                                            :description "The time taken to acknowledge a message in seconds."})]
    (-> registry
        (prometheus/register write-time publish-time)
        (prometheus/register-lazy ack-time))))

;; Namespace-wide state for the high-water-mark gauge.
;; record-pipeline-metrics-for-complete-document writes entries keyed by the
;; vector [mode measured-from], holding the largest received epoch time (seconds)
;; seen so far. pipeline-latency resets this atom and attaches a watch that
;; copies its entries to the gauge.
(def ^:private high-watermark-time-seconds
  (atom {}))

(defn pipeline-latency
  "Registers metrics to track latency from point-to-point in a pipeline on the
   given registry, and returns the updated registry.

   config must contain :histogram-bucket-overrides with a non-empty
   :pipeline-document-latency bucket sequence (checked with `assert`) - be sure
   to include the explicit value of any SLOs in your bucket distribution.

   Side effects: resets the namespace-private high-watermark atom to {} and
   installs (replacing any previous one under the same key) a watch that copies
   every entry of the atom to the
   :pipeline/document-latency-high-water-mark-seconds gauge of this registry."
  [registry {:keys [histogram-bucket-overrides]
             :as config}]
  (assert (seq (:pipeline-document-latency histogram-bucket-overrides)) "Field :pipeline-document-latency should be provided in :histogram-bucket-overrides")
  (let [histogram-buckets (merge (get-default-histogram-buckets config) histogram-bucket-overrides)
        registry (-> registry
                     (prometheus/register
                      (prometheus/histogram :pipeline/document-latency-seconds {:description "A histogram of the age of the document from when Signal received the data. (time it was received by Signal - time it was output by the Sink)."
                                                                                :buckets (:pipeline-document-latency histogram-buckets)
                                                                                :labels [:measured_from
                                                                                         :mode]})

                      (prometheus/gauge :pipeline/document-latency-high-water-mark-seconds {:description "The received epoch time in seconds of the last outputted document."
                                                                                            :labels [:measured_from
                                                                                                     :mode]})))]

    (reset! high-watermark-time-seconds {})
    (add-watch high-watermark-time-seconds :record-pipeline-document-processed-high-water-mark-age-seconds
               (fn [_ _ _ watermarks]
                 ;; Entries are keyed by [mode measured-from] (the order used when the
                 ;; atom is updated), so destructure in that same order; reading the key
                 ;; the other way round exports the gauge with its two labels swapped.
                 (doseq [[[mode measured-from] value] watermarks]
                   (prometheus/set registry :pipeline/document-latency-high-water-mark-seconds {:mode mode
                                                                                                :measured_from measured-from} value))))
    registry))

(defn observe-time-seconds
  "Observes, on the given metric of registry, the time in seconds between a
   start and an end timestamp that are both expressed in milliseconds.

   Any Exception raised while observing is rethrown wrapped in an ex-info
   carrying {:metric metric} and the original exception as its cause."
  [metric registry start-timestamp-ms end-timestamp-ms]
  (try
    (let [elapsed-ms (- end-timestamp-ms start-timestamp-ms)]
      (prometheus/observe registry metric (ms->seconds elapsed-ms)))
    (catch Exception cause
      (throw (ex-info "Failed to observe metric" {:metric metric} cause)))))


(defmacro message-read-time
  "Runs body inside prometheus/with-duration against the :read/time collector
   looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :read/time)
     ~@body))

(defn observe-message-read-time
  "Observes on :read/time the seconds elapsed between two millisecond timestamps.
   Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :read/time registry start-timestamp-ms end-timestamp-ms))


(defmacro message-result-write-time
  "Runs body inside prometheus/with-duration against the :write/time collector
   looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :write/time)
     ~@body))

(defn observe-message-result-write-time
  "Observes on :write/time the seconds elapsed between two millisecond timestamps.
   Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :write/time registry start-timestamp-ms end-timestamp-ms))

(defmacro message-publish-time
  "Runs body inside prometheus/with-duration against the :sqs/write-time
   collector looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :sqs/write-time)
     ~@body))

(defn observe-message-publish-time
  "Observes on :sqs/write-time the seconds elapsed between two millisecond
   timestamps. Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :sqs/write-time registry start-timestamp-ms end-timestamp-ms))

(defmacro message-ack-time
  "Runs body inside prometheus/with-duration against the :ack/time collector
   looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :ack/time)
     ~@body))

(defn observe-message-ack-time
  "Observes on :ack/time the seconds elapsed between two millisecond timestamps.
   Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :ack/time registry start-timestamp-ms end-timestamp-ms))


(defmacro message-process-time
  "Runs body inside prometheus/with-duration against the :message/process-time
   collector looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :message/process-time)
     ~@body))

(defn observe-message-process-time
  "Observes on :message/process-time the seconds elapsed between two millisecond
   timestamps. Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :message/process-time registry start-timestamp-ms end-timestamp-ms))


(defmacro message-total-time
  "Runs body inside prometheus/with-duration against the :message/total-time
   collector looked up from registry."
  [registry & body]
  `(prometheus/with-duration (~registry :message/total-time)
     ~@body))

(defn observe-message-total-time
  "Observes on :message/total-time the seconds elapsed between two millisecond
   timestamps. Delegates to observe-time-seconds."
  [registry start-timestamp-ms end-timestamp-ms]
  (observe-time-seconds :message/total-time registry start-timestamp-ms end-timestamp-ms))


(defn increment-messages-in-flight
  "Increments the :queue-worker/messages-in-flight gauge on registry by n
   (1 when n is omitted)."
  ([registry]
   (increment-messages-in-flight registry 1))
  ([registry n]
   (prometheus/inc registry :queue-worker/messages-in-flight n)))

(defn decrement-messages-in-flight
  "Decrements the :queue-worker/messages-in-flight gauge on registry by n
   (1 when n is omitted)."
  ([registry]
   (decrement-messages-in-flight registry 1))
  ([registry n]
   (prometheus/dec registry :queue-worker/messages-in-flight n)))


(defn observe-message-batch-size
  "Observes size on the :message/batch-size histogram of registry."
  [registry size]
  (prometheus/observe registry :message/batch-size size))

(defn observe-queue-worker-error
  "Increments the :queue-worker/errors-total counter on registry.
   The second argument is accepted but ignored."
  [registry _ignored]
  (prometheus/inc registry :queue-worker/errors-total))


(defn- calc-age-seconds
  "Returns how many seconds separate time-seconds (an epoch time in seconds)
   from the current system clock; negative when time-seconds lies in the future."
  [time-seconds]
  (let [now-seconds (ms->seconds (System/currentTimeMillis))]
    (- now-seconds time-seconds)))

(defn record-pipeline-metrics-for-complete-document
  "Records the pipeline latency metrics for one completed document.

   received-date-time is converted to whole epoch seconds (truncating
   milliseconds). The document's age relative to the current clock is observed
   on :pipeline/document-latency-seconds with the :mode / :measured_from labels.
   Unless the received time is in the future, the high-watermark atom entry
   keyed by [mode measured-from] is raised to that received time if it is larger
   than the stored value."
  [registry {:keys [mode measured-from]} received-date-time]
  (let [received-epoch-seconds (-> received-date-time
                                   to-millis-from-epoch
                                   (quot 1000))
        labels {:mode mode
                :measured_from measured-from}]
    (prometheus/observe registry
                        :pipeline/document-latency-seconds
                        labels
                        (calc-age-seconds received-epoch-seconds))
    ;; do not set a high watermark in the future
    (when (<= received-epoch-seconds (/ (System/currentTimeMillis) 1000))
      (swap! high-watermark-time-seconds
             update [mode measured-from]
             ;; fnil treats a missing entry as 0 before taking the max
             (fnil max 0) received-epoch-seconds))))


(defn ring-wrap-instrumentation
  "Wraps a ring application in Prometheus instrumentation backed by registry.
   The :path-fn supplied returns \"/\" for every request."
  [registry app]
  (let [options {:path-fn (constantly "/")}]
    (ring/wrap-instrumentation app registry options)))

(defn ring-wrap-metrics-expose
  "Wraps a ring application with a handler that exposes the Prometheus metrics
   of registry on /internal/metrics."
  [registry app]
  (let [options {:path "/internal/metrics"}]
    (ring/wrap-metrics-expose app registry options)))
