;;----------------------------------------------------------------------------
;; Copyright 2011 Factual, Inc.
;; All Rights Reserved.
;;
;; This is UNPUBLISHED PROPRIETARY SOURCE CODE of Factual, Inc.
;; Factual, Inc. reserves all rights in the source code as
;; delivered. The source code and all contents of this file, may not be
;; used, copied, modified, distributed, sold, disclosed to third parties
;; or duplicated in any form, in whole or in part, for any purpose,
;; without the prior written permission of Factual, Inc.
;;----------------------------------------------------------------------------


;; A "page" or "page map" is an atom containing a map with some or all of these keys:
;; :status :headers :body :xhtml :xhtml-classify :url :dom :script :img :uuid :len :overlimit


(ns leafgrabber.page
  (:use [leafgrabber.clean :only [clean-page]]
        [let-else :only [let?]]
        [clj-http.util :only [url-encode]])
  (:require [leafgrabber.xpath :as x]
            [clojure.string :as s]
            [juiceful.singleton :as sing]
            [clj-http.client :as client]
            [jre2 :as j])
  (:import [com.dappit.Dapper.parser MozillaParser]
           [java.net URL]))


;; The MozillaParser native library path should be a property in
;; /etc/hadoop/conf/mapred-site.xml on the cluster, rather than being hardcoded here.
;; See http://groups.google.com/group/cascalog-user/browse_thread/thread/8cd38114e2fa7f8e for a way
;; to read Hadoop properties in Cascalog.
;; The MOZILLA_LIB_PATH env var is not available on the cluster, hence the hardcoded default.
;; Factory for MozillaParser instances, wrapped by sing/per-thread-singleton.
;; Invoked as (mozilla-parser); presumably yields one parser per thread --
;; TODO confirm against juiceful.singleton.
(def mozilla-parser
  (sing/per-thread-singleton
   (fn []
     (let [cluster-lib-path "/usr/local/MozillaParser-v-0-3-0/dist/linux/x86_64" ; magic for the cluster
           ;; MOZILLA_LIB_PATH, when set, overrides the cluster default.
           lib-path (or (System/getenv "MOZILLA_LIB_PATH") cluster-lib-path)]
       ;; The static init call runs before each parser instance is constructed.
       (MozillaParser/init nil lib-path)
       (MozillaParser.)))))

(defn squeeze-crs
  "Strips every carriage return (\\r) from 'html'; these typically come from
   DOS-formatted pages. Left in place, dom->xml would emit them as &#13; entities.
   Returns the resulting string."
  [html]
  (let [carriage-return "\r"
        nothing ""]
    (s/replace html carriage-return nothing)))

(defn clean-html-tag
  "Remove xmlns attributes (plain 'xmlns' and prefixed 'xmlns:foo') from the html
   tag(s) because they confuse xpath queries.
   Some pages have more than one html tag to accommodate various browsers,
   e.g. http://www.groupon.com/deals/italian-kitchen-1, so every html opening tag
   is processed.
   Both double-quoted and single-quoted attribute values are handled, and tag and
   attribute names are matched case-insensitively. Only the attribute itself is
   removed; surrounding whitespace inside the tag is left untouched.
   Returns the cleaned html string."
  [html]
  ;; The outer pattern has no capturing groups, so the replacement fn receives
  ;; the matched tag as a plain string.
  (s/replace html #"(?i)<html[^>]*>"
             (fn [html-tag]
               (s/replace html-tag
                          #"(?i)xmlns(?::\w*)?=(?:\"[^\"]*\"|'[^']*')"
                          ""))))

(defn clean-html
  "Prepares raw html for parsing: first strips carriage returns (squeeze-crs),
   then removes xmlns attributes from html tags (clean-html-tag)."
  [html]
  (let [without-crs (squeeze-crs html)]
    (clean-html-tag without-crs)))

(defn parse-html
  "Runs 'html' through clean-html and parses the result with the parser
   obtained from (mozilla-parser). Returns the DOM produced by the parser."
  [html]
  (let [parser (mozilla-parser)
        cleaned (clean-html html)]
    (.parse parser cleaned)))

(defn use-public-dcache-server
  "Points dcache-server-prefix at the public dcache server (dcache.factual.com)
   by (re)defining the var's root value. Returns the var."
  []
  (def dcache-server-prefix "http://dcache.factual.com/data?url="))

(defn use-local-dcache-server
  "Points dcache-server-prefix at a dcache server on localhost:8050
   by (re)defining the var's root value. Returns the var."
  []
  (def dcache-server-prefix "http://localhost:8050/data?url="))

;; Select the local dcache server as the default when this namespace is loaded.
(use-local-dcache-server)

(defn make-dcache-url
  "Builds the dcache request URL for 'url': the current dcache-server-prefix
   followed by the url-encoded form of 'url'."
  [url]
  (->> url
       url-encode
       (str dcache-server-prefix)))

(defn fetch
  "Fetches the page at 'url' through dcache.
   Returns the response of client/get, a Ring-style map with :status, :headers,
   and :body keys.
   Returns nil whenever the request throws an Exception, which includes
   exceptional status codes. This is deliberately best-effort: the exception is
   discarded."
  [url]
  ;; The dcache URL is built outside the try, so errors from building it propagate.
  (let [request-url (make-dcache-url url)]
    (try
      (client/get request-url)
      (catch Exception _
        nil))))

(defn url->page
  "Makes a page from 'url' (trimmed of surrounding whitespace). Returns nil if
   the url is unfetchable, i.e. fetch returned nil.
   The page map is the fetch response plus :url, :uuid (from the :uuid option)
   and :len (count of the body).
   If the :max-len option is non-nil and the body length exceeds it, parsing is
   skipped and an atom of the page map with :overlimit true is returned.
   Otherwise the body is parsed into :dom and the page atom is passed through
   clean-page, whose result is returned."
  [url & {:keys [uuid max-len]}]
  (let? [url (s/trim url)
         response (fetch url) :else nil
         body (:body response)
         len (count body)
         page-map (assoc response
                    :url url
                    :uuid uuid
                    :len len)]
    (if (and max-len
             (< max-len len))
      ;; Too large: hand back the unparsed page, flagged as over the limit.
      (atom (assoc page-map :overlimit true))
      (-> page-map
          (assoc :dom (parse-html body))
          atom
          clean-page))))

(defn html->page
  "Makes a page from an html string: parses it into a DOM, wraps {:dom dom} in
   an atom and returns the result of applying clean-page to that atom.
   Per the original note, the resulting page map has only :dom and :script keys."
  [html]
  (-> {:dom (parse-html html)}
      atom
      clean-page))

(defmacro def-get
  "Defines a caching accessor function named get-<key> (e.g. :dom -> get-dom)
   that takes a page atom.
   The accessor returns the value stored under 'key' in the page map when that
   value is truthy. Otherwise it calls (creator page), stores the result under
   'key' in the atom via swap!, and returns it.
   Because the lookup tests truthiness, a stored nil or false is recomputed on
   every call."
  [key creator]
  (let [accessor-name (symbol (str "get-" (name key)))]
    `(defn ~accessor-name
       [page#]
       (if-let [cached# (~key @page#)]
         cached#
         (let [fresh# (~creator page#)]
           (swap! page# assoc ~key fresh#)
           fresh#)))))