(ns dbscan.core
  "functional implementation of the DBSCAN clustering algorithm as per
  'https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf'. Note that the functional
  implementation is quite different from the original proposed algorithm:
  - region-query is called at the beginning of the algorithm for all the points
  - no point is marked as visited but rather each point is compared to the
    current neighborhood"
  (:require
    [clojure.set :as cset]
    [dbscan.symmat :as sym]
    [dbscan.util :refer [euclidean-distance]]))

; the return includes the target point itself
(defn- region-query
  "Based on a point to point distance matrix (p2p-dist) find all point-indexes
  whose distance to the target point is less than eps."
  [p2p-dist eps target]
  (keep-indexed
    (fn [index value] (when (< value eps) index))
    (sym/symmetric-row p2p-dist target)))

(defn- build-query-fn
  "based on the provided data, compute the distance among all points and fix
  that information on the region-query function."
  [data dist-fn]
  (let [p2p-dist (sym/compute-triangular (count data)
                   (fn [i j] (dist-fn (get data i) (get data j))))]
    (partial region-query p2p-dist)))

; This function can be parallelized provided that query-fn and dist-fn
;  don't have any side effects
(defn- find-relations
  "based on the query function and the eps distance, build a hash-map of
  [index neighbors]"
  [data query-fn eps]
  (into {}
    (map (fn [index] [index (query-fn eps index)])
      (range (count data)))))

(defn- not-clustered?
  "check if a point is not present on any cluster.
  clusters is a sequence of hash-sets"
  [clusters [index _]]
  (not-any? (fn [cluster] (cluster index)) clusters))

(defn- enough-neighbors?
  [[index neighborhood] min-pts]
  (when (< min-pts (count neighborhood))
    index))

(defn- cluster
  "Cluster points around the seed-index that have more than min-pts neighbors.
  The unclassified hash-map is used to know the neighbors of each point"
  [seed min-pts unclassified]
  (let [min-pts (inc min-pts)]
    (loop [neighbors (into #{} (unclassified seed))
           edge neighbors]                                  ; points that are currently being analyzed
      (let [add-missing-neighbor (fn [tcol neighbor]
                                   (if (neighbors neighbor)
                                     tcol
                                     (conj! tcol neighbor)))
            new-neighbors (persistent! (reduce (fn [tcol e]
                                                 (let [t (unclassified e)]
                                                   (if (<= min-pts (bounded-count min-pts t))
                                                     (reduce add-missing-neighbor tcol t)
                                                     tcol)))
                                         (transient #{})
                                         edge))]
        (if (empty? new-neighbors)
          neighbors
          (recur (cset/union neighbors new-neighbors) new-neighbors))))))

(defn dbscan
  "cluster the data points based on the distance eps and the requirement that
  at least minpts are nearby. Optionally a particular distance and query function
  can be used. Those default to the Euclidean distance and an square distance
  matrix.
  The return value is of the form (clusters noise), where clusters is a vector
  of sets and noise is a simple sequence."
  ([data eps min-pts]
   (dbscan data eps min-pts euclidean-distance (build-query-fn data euclidean-distance)))

  ([data eps min-pts dist-fn]
   (dbscan data eps min-pts dist-fn (build-query-fn data dist-fn)))

  ([data eps min-pts dist-fn query-fn]
   (loop [clusters []
          unclassified (find-relations data query-fn eps)
          seed (some #(enough-neighbors? % min-pts) unclassified)]
     (if (nil? seed)
       [clusters (map first unclassified)]                  ; clusters, noise
       (let [c (cluster seed min-pts unclassified)
             curr-clusters (conj clusters c)
             still-unclassified (into {} (filter #(not-clustered? curr-clusters %) unclassified))
             new-seed (some #(enough-neighbors? % min-pts) still-unclassified)]
         (recur curr-clusters still-unclassified new-seed))))))
