(ns de.blubberquark.microdata.extract
  (:require [clojure.xml]
            [net.cgrand.enlive-html :as html])
  (:import (org.joda.time.format
	    DateTimeFormatter
	    ISODateTimeFormat)))

;; Sample pages containing microdata, handy for trying the extractor by hand.
;; The earmuffed name announces a dynamic var, so it is declared as one;
;; otherwise Clojure 1.3+ prints a "not declared dynamic" warning on load.
;; The long-hand ^{:dynamic true} form also reads correctly on older readers.
(def ^{:dynamic true} *testurls*
  ["http://beta.za-fobi.de/fortbildungen/2011/1/14/25.%20Berliner%20Zahn%C3%A4rztetag/"
   "http://diveintohtml5.org/examples/event-plus-microdata.html"
   "http://diveintohtml5.org/examples/person-plus-microdata.html"])

;; URL of the page currently being processed; relative URLs found in the page
;; are resolved against it (see tagurl). It is nil at the root and rebound with
;; `binding` by the get-microdata-from-url* functions.
;; It must be dynamic: since Clojure 1.3 `binding` on a non-dynamic var throws
;; IllegalStateException.
(def ^{:dynamic true} *base* nil)

(defn tagurl
  "Returns a function that reads the attribute `attr` from a node and turns it
  into a java.net.URL resolved against the current *base*.

  `attr` is applied to the node's :attrs map, so it is normally a keyword such
  as :src or :href.

  The returned function yields nil when the node does not carry the attribute,
  instead of handing nil to the URL constructor (which throws). A value that
  cannot be parsed as a URL still raises java.net.MalformedURLException."
  [attr]
  (fn [tag]
    (when-let [spec (attr (:attrs tag))]
      (new java.net.URL *base* spec))))

;; Shared Joda-Time parser obtained from ISODateTimeFormat/dateOptionalTimeParser.
;; The type hint lets callers invoke .parseDateTime on it without reflection.
(def ^DateTimeFormatter timeparser (ISODateTimeFormat/dateOptionalTimeParser))

(defn- parse-coordinate
  "Parses the text of a latitude/longitude property into a Double.

  Returns nil for nil input and the unchanged string when it is not a number.
  The text comes from a remote page, so it is deliberately NOT passed to
  `read-string`, which can evaluate code embedded in its input (#=) and
  return arbitrary forms."
  [content]
  (when content
    (try
      (Double/parseDouble (.trim ^String content))
      (catch NumberFormatException _
        content))))

;; Maps a tag keyword to the function that extracts the property value from a
;; node of that tag. Tags that are not listed here have no entry, so callers
;; need their own fallback.
(def propsrc
  {:meta (fn [tag]
           ;; Keyword lookups keep this nil-safe when the node has no :attrs.
           (let [attrs   (:attrs tag)
                 content (:content attrs)]
             (if (#{"longitude" "latitude"} (:itemprop attrs))
               (parse-coordinate content)
               content)))
   :audio (tagurl :src) :embed (tagurl :src) :iframe (tagurl :src)
   :img (tagurl :src) :source (tagurl :src)
   :video (tagurl :src) :a (tagurl :href)
   :area  (tagurl :href) :link (tagurl :href)
   :object  (tagurl :data)
   :time (fn [tag]
           ;; Per HTML, a <time> element without a datetime attribute carries
           ;; its value in its text content.
           (let [datetime (or (:datetime (:attrs tag))
                              (html/text tag))]
             (.parseDateTime ^DateTimeFormatter timeparser
                             (.trim ^String datetime))))})

;; node-data and extract-prop call each other, so node-data is declared ahead.
(declare node-data)

(defn extract-prop
  "Returns the value of a single property node.

  A node carrying an itemscope attribute is itself an item and is expanded with
  node-data. Otherwise the extractor registered in propsrc for the node's tag
  is used, and when there is none the node's text is returned."
  [node]
  (let [nested-item? (:itemscope (:attrs node))
        extractor    (propsrc (:tag node))]
    (cond
      nested-item? (node-data node)
      extractor    (extractor node)
      :else        (html/text node))))

(defn microdata-items
  "Selects every node in `page` that carries an itemscope attribute.

  NOTE(review): the selector does not look at itemprop, so items nested inside
  other items are presumably returned as well - confirm this is intended."
  [page]
  (let [itemscope-selector [(html/attr? :itemscope)]]
    (html/select page itemscope-selector)))

(defn node-data
  "Builds a map describing one microdata item.

  The result holds :itemtype (the item's itemtype attribute as a URL, or nil
  when the item has none) plus one entry per property node, keyed by the
  keyword form of its itemprop attribute and valued by extract-prop. When
  several nodes share an itemprop, the last one wins because the pairs are
  poured into a map."
  [item]
  (let [;; Untyped items are legal; do not build a URL from a missing attribute.
        type (when (:itemtype (:attrs item))
               ((tagurl :itemtype) item))
        ;; Two passes: tag every itemprop node with a marker attribute, then
        ;; strip the marker from itemprop nodes sitting below a nested
        ;; itemscope, since those belong to the nested item.
        marked (html/at item
                 [(html/attr? :itemprop)]
                 (html/set-attr :interesting-property "true")
                 [:* (html/attr? :itemscope) (html/attr? :itemprop)]
                 (html/remove-attr :interesting-property))
        propnodes (html/select marked [:* (html/attr? :interesting-property)])]
    (into {:itemtype type}
          (for [node propnodes]
            [(keyword (:itemprop (:attrs node))) (extract-prop node)]))))

(defn get-microdata-from-url
  "Loads the page at `url` (a string) with Enlive and returns the node-data map
  of every item found by microdata-items.

  *base* is bound to the page URL while the items are processed; the result is
  forced with doall so that no lazy work is left to run outside that binding."
  [url]
  (let [page-url (java.net.URL. url)]
    (binding [*base* page-url]
      (->> (html/html-resource page-url)
           microdata-items
           (map node-data)
           doall))))

(defn get-microdata-from-url-1
  "Parses the document at `url` (a string) with clojure.xml/parse and returns
  the node-data map of the first item found by microdata-items, or nil when the
  document contains no item.

  *base* is bound to the document URL while the item is processed."
  [url]
  (let [source (new java.net.URL url)]
    (binding [*base* source]
      ;; Guard against documents without any itemscope: node-data cannot
      ;; process nil.
      (when-let [item (first (microdata-items (clojure.xml/parse url)))]
        (node-data item)))))
