;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------

(ns leafgrabber.extract.authority
  (:use cascalog.api
        [let-else :only [let?]]
        [clojure.walk :only [postwalk]]
        [leafgrabber.register :only [register-extractor-group]])
  (:require [leafgrabber.extract.classify :as clfy]
            [leafgrabber.page :as page]
            [leafgrabber.xpath :as x]
            [jre2 :as j])
  (:import [java.net URL]))

;; In this file ##" is a reader macro for JRE2 regexs, similar to #" for Java Pattern regexs.
;; E.g. ##"foo\sbar" is equivalent to (j/re-pattern "foo\\sbar").


(page/def-get :url-obj
  #(try
     (URL. (:url @%))
     (catch Exception e nil)))

(defn get-href [page node]
  (let? [href (x/attr "href" node) :else nil
         url-obj (get-url-obj page) :else nil
         url (try
               (URL. url-obj href)
               (catch Exception e nil))
         :else nil]
    (str url)))

;; Returns a seq of {:href ... :body ...} maps for all \"a\" tags in the dom.
(page/def-get :anchors
  (fn [page]
    (let [anchors (x/xpath ".//a" (:dom @page))]
      (remove nil?
              (map (fn [anchor]
                     (when-let [href (get-href page anchor)]
                       {:href href
                        :body (x/inner-html anchor)}))
                   anchors)))))

(def css-pattern
  ##"(?i)\bcss\b")

;; Returns a seq of hrefs for all \"link\" tags in the dom.
(page/def-get :css-links
  (fn [page]
    (let [links (:links @page)]
      (filter #(j/re-find css-pattern %)
              (remove nil?
                      (map (partial get-href page) links))))))


(def directions
  "(?:\\s+(?i:N|S|E|W|NE|NW|SE|SW)\\.?)?")

;; one of these, or a comma, is required following the street name
(def street-types
  "(?:\\s+(?:Ave|Avenue|Blvd|Boulevard|Street|Str?|Road|Rd|Drive|Dr|Highway|Hwy|Place|Pl|Lane|Ln|Plaza|Plz|Expressway|Expy|Freeway|Fwy|Parkway|Pk?wy|Way|Circle|Cir|Center|Ctr|Turnpike|Tpke|Court|Ct|Terrace|Terr?)\\.?|\\,)")

(def address-pattern
  (j/re-pattern
   (str
    "("                                 ; start street address group

    "\\b[-\\d]{1,6}"                    ; address number
    directions
    "(?:[-\\s]+[A-Z][a-z]+|\\d{1,3}(?i:st|nd|rd|th))+" ; street name
    street-types
    directions

    ")"                                 ; end street address group

    "(?:[,\\s]+((?i:unit|apt\\.?|suite|ste\\.?|#)\\s*[-\\w]{1,4}))?" ; suite
    "[,\\s]+([A-Z][a-z]+(?:[-\\s]+[A-Z][a-z]+)*)" ; city
    "[,\\s]+"
    clfy/state-zip-pattern
    )))

(def comma-pattern
  ##"^(.*?),*$")

(defn strip-commas [s]
  ((j/re-find comma-pattern s) 1))

(defn pull-addresses [page]
  ;; each address produces a vector [<full match> <street address> <suite> <city> <state> <zip>]
  (let [addresses (or (j/re-seq address-pattern
                                (clfy/get-xhtml-classify page))
                      ())
        addresses (postwalk #(cond (string? %) (strip-commas %)
                                   (nil? %) ""
                                   :else %)
                            addresses)]
    {:addresses addresses
     :num_addr (count addresses)}))

(defn pull-tels [page]
  (let [tels (or (clfy/get-tels page)
                 ())]
    {:tels tels
     :num_tel (count tels)}))

(defn classify-page [page]
  (let [{num-tel :num_tel :as num-tel-map}
        (pull-tels page)

        {num-addr :num_addr :as num-addr-map}
        (pull-addresses page)

        page-class (clfy/classify-features num-tel num-addr)
        ]
    (merge num-tel-map
           num-addr-map
           {:page_class page-class})))



(def link-patterns
  {:contact-us-links #"(?i:contact\s+us)"
   :contact-links    #"(?i:contact)"
   :about-us-links   #"(?i:about\s+us)"
   :directions-links #"(?i:directions)"
   :locat-links      #"(?i:locat)"})

(defn links-containing [re page]
  (remove nil?
          (map #(when (j/re-find re (:body %))
                  (:href %))
               (get-anchors page))))

(defn pull-links [page]
  (into {}
        (map (fn [[k re]]
               [k (links-containing re page)])
             link-patterns)))

(defn pull-css-links [page]
  {:css-links
   (get-css-links page)})


(register-extractor-group
 "authority"
 [classify-page
  pull-links
  pull-css-links])
