;;; SPDX-License-Identifier: MPL-2.0
;;;
;;; This Source Code Form is subject to the terms of the Mozilla Public License,
;;; v. 2.0. If a copy of the MPL was not distributed with this file, You can
;;; obtain one at http://mozilla.org/MPL/2.0/.
(ns bootstring-clj.core
  "An implementation of the bootstring encoding for unicode strings.

  Each of the [[encode]] and [[decode]] functions require being passed a
  bootstring spec, a map with namespaced keywords for all the parameters to the
  bootstring algorithm. The [[punycode]] encoding spec is made available as well
  to allow using this standard encoding more easily. If you just need to use
  punycode to encode domain names, prefer using methods on [[java.net.IDN]]
  which already handles additional work and assertions for converting domain
  names.

  A bootstring spec consists of the following keys, all of which are namespaced
  under `bootstring-clj.core`:
  - ::delimiter a string-encoded regex pattern to match the delimiter (punycode: \"-\")
  - ::digits a string with characters at indices representing their encoded value (punycode: \"abcdefghijklmnopqrstuvwxyz0123456789\")
  - ::base the numeric base the encoding is based on, should be equal to or smaller than the length of the digits string (punycode: 36)
  - ::initial-n the lowest codepoint that will be encoded as a delta value (punycode: 128)
  - ::initial-bias bias value for the generalized variable-length integer threshold function (punycode: 72)
  - ::tmin the minimum threshold value for generalized variable-length integers (punycode: 1)
  - ::tmin the maximum threshold value for generalized variable-length integers (punycode: 26)
  - ::damp the damping value used during the first iteration of adjusting the bias value (punycode: 700)
  - ::skew a value added to the bias when it is adapted (punycode: 38)

  The implementation was based on the explanation available on frereit's blog in
  two parts, one on [decoding](https://frereit.de/bootstring_decoding/), and one
  on [encoding](https://frereit.de/bootstring_encoding/)."
  (:require
   [clojure.string :as str])
  (:import
   (java.util.function IntConsumer)))

(set! *warn-on-reflection* true)

(defn- over
  [f g]
  (fn
    ([x] (f (g x)))
    ([w x] (f (g w) (g x)))
    ([w x & more] (apply f (g w) (g x) (map g more)))))

(defn- write-delta-value
  [^StringBuilder encoded value base ^String digits bias tmin tmax]
  (letfn [(threshold [idx]
            (min (max (- (* base (inc idx)) bias) tmin) tmax))
          (weight [idx]
            (if-not (zero? idx)
              (* (weight (dec idx)) (- base (threshold (dec idx))))
              1))]
    (loop [i 0
           value (long value)]
      (let [th (threshold i)]
        (if (>= value th)
          (let [v-t (- value th)
                b-t (- base th)
                digit (nth digits (+ th (mod v-t b-t)))]
            (.append encoded digit)
            (recur (inc i) (long (quot v-t b-t))))
          (.append encoded (nth digits value)))))))

(defn- adapt-bias
  [delta total base tmin tmax damp skew]
  (let [delta (quot delta damp)
        delta (+ (quot delta total) delta)
        max-delta (quot (* (- base tmin) tmax) 2)
        [delta k]
        (loop [delta delta
               k 0]
          (if (> delta max-delta)
            (recur (quot delta (- base tmin)) (inc k))
            [delta k]))]
    (+ (* base k)
       (quot (* (+ base (- tmin) 1) delta)
             (+ delta skew)))))

(defn encode
  "Returns a bootstring-encoded string using the encoding specified by `spec`."
  [spec ^String string]
  (let [{:keys [::digits ::base ::tmin ::tmax ::damp ::skew]} spec
        initial-n (::initial-n spec)
        encoded (StringBuilder.)
        delta-codepoints
        (let [idx (volatile! -1)
              acc (volatile! (sorted-set))]
          (.. string codePoints
              (forEachOrdered
               (reify IntConsumer
                 (accept [_this codepoint]
                   (let [idx (vswap! idx inc)]
                     (if (< codepoint initial-n)
                       (.appendCodePoint encoded (int codepoint))
                       (vswap! acc conj [codepoint idx])))))))
          @acc)
        insertion-points (inc (.length encoded))]
    (when (pos? (.length encoded))
      (.append encoded ^String (::delimiter spec)))
    (loop [n initial-n
           i 0
           step 0
           insertion-points insertion-points
           bias (::initial-bias spec)
           delta-codepoints delta-codepoints
           delta-ordered-codepoints
           (into (sorted-set-by (over < second)) delta-codepoints)]
      (if (seq delta-codepoints)
        (let [[codepoint idx :as target-codepoint] (first delta-codepoints)
              offset (count (take-while (complement #{target-codepoint}) delta-ordered-codepoints))
              target-n codepoint
              target-i (- idx offset)
              delta-steps (if (> target-i i)
                            (+ (* (- target-n n) insertion-points) (- target-i i))
                            (+ (* (- target-n n 1) insertion-points) (+ insertion-points (- target-i i))))]
          (write-delta-value encoded delta-steps base digits bias tmin tmax)
          (recur target-n (long (inc target-i)) (long (+ step delta-steps 1)) (inc insertion-points)
                 (adapt-bias delta-steps insertion-points base tmin tmax (if (zero? step) damp 2) skew)
                 (rest delta-codepoints)
                 (disj delta-ordered-codepoints target-codepoint)))
        (str encoded)))))

(defn- parse-delta-value
  [^String delta base ^String digits bias tmin tmax]
  (letfn [(threshold [idx]
            (min (max (- (* base (inc idx)) bias) tmin) tmax))
          (weight [idx]
            (if-not (zero? idx)
              (* (weight (dec idx)) (- base (threshold (dec idx))))
              1))]
    (loop [i 0
           value 0]
      (let [digit (try (.indexOf digits (int (.charAt delta i)))
                       (catch IndexOutOfBoundsException ioob
                         (throw (IllegalArgumentException. "Invalid bootstring encoded delta values." ioob))))
            _ (when (neg? digit)
                (throw (IllegalArgumentException. "Invalid bootstring encoded delta values.")))
            th (threshold i)
            w (weight i)
            value (int (+ (* w digit) value))]
        (if-not (< digit th)
          (recur (inc i) value)
          [value (subs delta (inc i))])))))

(defn- delim-patterns*
  [delim]
  (re-pattern (str "(?s)^((.*)" delim ")?(.*)$")))

(def ^:private delim-patterns (memoize delim-patterns*))

(defn decode
  "Returns a decoded unicode string using the bootstring encoding specified by `spec`."
  [spec ^String string]
  ;; split the string into the literal and delta portions
  (let [[_ _ ^String literal delta :as match] (re-matches (delim-patterns (::delimiter spec)) string)
        literal (or literal "")]
    ;; initialize a state machine
    (cond
      (not match) (throw (IllegalArgumentException. "Invalid bootstring encoded input."))
      (str/blank? delta) literal
      :else
      (let [{:keys [::digits ::base ::tmin ::tmax ::damp ::skew]} spec
            extended-string (StringBuilder. literal)]
        (loop [delta delta
               n (::initial-n spec)
               i 0
               steps 0
               total (.length extended-string)
               bias (::initial-bias spec)]
          ;; decode a delta value
          (let [[dv more] (parse-delta-value delta base digits bias tmin tmax)
                ;; advance the state machine to insert the code point
                i+dv (+ i dv)
                str-len+1 (inc total)
                n (+ n (quot i+dv str-len+1))
                i (int (mod i+dv str-len+1))
                total (inc total)]
            (.insert extended-string (.offsetByCodePoints extended-string 0 i) (Character/toChars (int n)))
            ;; continue while there is more content in the delta portion
            (if-not (str/blank? more)
              (recur more n (inc i) (int (+ steps dv 1)) (int total) ; inc i because after the insert need to be on next point
                     ;; adapt the bias
                     (adapt-bias dv total base tmin tmax (if (zero? steps) damp 2) skew))
              (str extended-string))))))))

(def punycode
  "A bootstring spec value for the punycode encoding."
  {::delimiter "-"
   ::digits "abcdefghijklmnopqrstuvwxyz0123456789"
   ::base 36
   ::initial-n 128
   ::initial-bias 72
   ::tmin 1
   ::tmax 26
   ::damp 700
   ::skew 38})
