;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------

(ns leafgrabber.extract.classify
  (:use [clojure.data.json :only (read-json)]
        [leafgrabber.clean :only [drop-nodes]]
        [leafgrabber.register :only [register-extractor-group]])
  (:require [clojure.string :as s]
            [leafgrabber.xpath :as x]
            [leafgrabber.page :as page]
            [jre2 :as j]))

;; In this file, ##" is a reader macro for JRE2 regexes, analogous to #" for Java Pattern regexes.
;; E.g. ##"foo\sbar" is equivalent to (j/re-pattern "foo\\sbar").


(defn spacify
  "Reduces an XHTML string to whitespace-normalized text.

  Steps, in order: every `&nbsp;` entity becomes a space, every tag
  (`<...>` with no nested angle brackets) becomes a space, each run of
  whitespace collapses to a single space, and the result is trimmed."
  [xhtml]
  (let [without-nbsp (s/replace xhtml #"&nbsp;" " ")
        without-tags (s/replace without-nbsp #"<[^<>]*>" " ")
        collapsed    (s/replace without-tags #"\s+" " ")]
    (s/trim collapsed)))

;; Page getter for the text that classification runs on: the page's :dom,
;; minus its <head> subtree, serialized to HTML and flattened by `spacify`.
;; Presumably `page/def-get` is what defines the `get-xhtml-classify`
;; accessor used elsewhere in this file -- TODO confirm against leafgrabber.page.
(page/def-get :xhtml-classify
  (fn [page]
    (let [dom (:dom @page)]
      ;; The return value is discarded, so this relies on drop-nodes
      ;; modifying `dom` in place -- NOTE(review): confirm.
      (drop-nodes ".//head" dom)
      (-> dom
          x/to-html
          spacify))))

;; Full names of the 50 US states, in alphabetical order. These are joined
;; into `state-zip-pattern` as alternatives alongside the two-letter
;; abbreviations. The list holds states only: no District of Columbia and
;; no territories appear here by full name.
(def states
  ["Alabama"
   "Alaska"
   "Arizona"
   "Arkansas"
   "California"
   "Colorado"
   "Connecticut"
   "Delaware"
   "Florida"
   "Georgia"
   "Hawaii"
   "Idaho"
   "Illinois"
   "Indiana"
   "Iowa"
   "Kansas"
   "Kentucky"
   "Louisiana"
   "Maine"
   "Maryland"
   "Massachusetts"
   "Michigan"
   "Minnesota"
   "Mississippi"
   "Missouri"
   "Montana"
   "Nebraska"
   "Nevada"
   "New Hampshire"
   "New Jersey"
   "New Mexico"
   "New York"
   "North Carolina"
   "North Dakota"
   "Ohio"
   "Oklahoma"
   "Oregon"
   "Pennsylvania"
   "Rhode Island"
   "South Carolina"
   "South Dakota"
   "Tennessee"
   "Texas"
   "Utah"
   "Vermont"
   "Virginia"
   "Washington"
   "West Virginia"
   "Wisconsin"
   "Wyoming"])

;; JRE2 pattern matching a state followed by a ZIP code.
;;   group 1: a two-letter postal abbreviation or a full state name from `states`
;;   group 2: a 5-digit ZIP, optionally followed by a 4-digit extension
;;            (separated by any mix of spaces and hyphens, or nothing)
;; State and ZIP must be separated by at least one non-word character.
(def state-zip-pattern
  (let [abbreviations-prefix "\\b(A[LKSZRAP]|C[AOT]|D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY]|"
        full-names           (s/join \| states)
        zip-suffix           ")\\W+(\\d{5}(?:[ -]*\\d{4})?)\\b"]
    (j/re-pattern (str abbreviations-prefix full-names zip-suffix))))

(defn count-state-zip
  "Returns `{:num_state_zip n}`, where n is the number of
  `state-zip-pattern` matches found in the page's classification text."
  [page]
  (let [text    (get-xhtml-classify page)
        matches (j/re-seq state-zip-pattern text)]
    {:num_state_zip (count matches)}))

;; JRE2 pattern for a 10-digit phone number written as three digit groups
;; (3, 3 and 4 digits), each separated by one or more non-word characters
;; and bounded by word boundaries. The three groups are captured as
;; groups 1, 2 and 3.
(def tel-pattern
  ##"\b([0-9]{3})\W+([0-9]{3})\W+([0-9]{4})\b")

;; Page getter for the distinct phone numbers found in the page's
;; classification text, each normalized to the form "(AAA) EEE-LLLL".
;; Duplicates are removed after normalization, so the same number written
;; with different separators is counted once.
(page/def-get :tels
  (fn [page]
    ;; Fetch the classification text once and reuse it; the original bound
    ;; it to `xhtml` but then ignored the binding and fetched it again.
    (let [xhtml (get-xhtml-classify page)]
      (distinct
       ;; Each match is indexed by capture-group number (1..3).
       (map #(str "(" (% 1) ") " (% 2) "-" (% 3))
            (j/re-seq tel-pattern xhtml))))))

(defn count-tel
  "Returns `{:num_tel n}`, where n is the number of distinct phone
  numbers reported by `get-tels` for the page."
  [page]
  (let [tels (get-tels page)]
    {:num_tel (count tels)}))

(defn classify-features
  "Classifies a page from its feature counts.

  Returns \"leaf\" when the sum of `num-tel` and `num-state-zip` is below 9,
  and \"directory\" otherwise."
  [num-tel num-state-zip]
  (let [total (+ num-tel num-state-zip)]
    (if (< total 9)
      "leaf"
      "directory")))

(defn classify-page
  "Builds the classification result map for `page`.

  The result merges the maps returned by `count-tel` and `count-state-zip`
  and adds `:page_class`. If the page's :pipeline carries a :page-class,
  that value is used as-is; otherwise the class is derived from the two
  counts via `classify-features`."
  [page]
  (let [tel-counts       (count-tel page)
        state-zip-counts (count-state-zip page)
        preset-class     (-> @page :pipeline :page-class)
        page-class       (or preset-class
                             (classify-features (:num_tel tel-counts)
                                                (:num_state_zip state-zip-counts)))]
    (merge tel-counts
           state-zip-counts
           {:page_class page-class})))


;; Registers the "classify" extractor group with classify-page as its only
;; member.
(register-extractor-group
 "classify"
 [classify-page

  ;; Not registered separately: classify-page already merges the output of
  ;; both of these into its result.
  ;; count-tel
  ;; count-state-zip
  ])
