;;;; 使用jsoup解析点评商户HTML页面
(ns com.kahui.spiders.dianping.jsoup-parser
  (:require [clojure.string :as string]
            [com.kahui.spiders.tools.utils :as utils]
            [com.kahui.spiders.dianping.tools :as tools])
  (:import [org.jsoup Jsoup]))

;; Raw HTML text of the page being parsed. `parse` rebinds it around each
;; value function so patterns such as :brand-id can regex-match the source.
(def ^:dynamic *html-content* nil)

(defn parse-nodes-from-html
  "Parses HTML text with Jsoup/parse and returns the result.
  Returns nil when `html-data` is nil or empty."
  [^String html-data]
  (when-let [html (not-empty html-data)]
    (Jsoup/parse html)))

(defn- first-node-text
  "Returns the trimmed `.text` of the first item of `t`.
  Returns nil when `t` is nil/empty or its first item is nil."
  [t]
  (when-let [node (first (seq t))]
    (.trim (.text node))))

(defn- last-node-text
  "Returns the trimmed `.text` of the last item of `t`.
  Returns nil when `t` is nil/empty or its last item is nil."
  [t]
  (when-let [node (last (seq t))]
    (.trim (.text node))))

(defn- nodes-text
  "Joins the trimmed `.text` of every item of `t` with commas.
  A nil item contributes an empty string; returns nil when `t` is nil/empty."
  [t]
  (when (seq t)
    (->> (for [node t]
           (when node
             (-> node .text .trim)))
         (string/join ","))))

(defn- try-match
  "Tries each CSS selector in `match-list`, in order, against the first item
  of `t` and passes the first non-empty selection (or nil when none match)
  to `val-fn`. `val-fn` defaults to first-node-text.
  Returns nil when `t` is nil/empty."
  ([t match-list] (try-match t match-list first-node-text))
  ([t match-list val-fn]
    (when (seq t)
      (let [root (first t)
            ;; `seq` turns an empty selection into nil so `some` moves on.
            hits-for (fn [css] (seq (.select root css)))]
        (val-fn (some hits-for match-list))))))

;;; Version 1 :适用于2014-03之前爬取的页面

; Main Dianping category of the page: selector plus the function that turns
; the selection into a value (trimmed text of the first match).
(def category-pattern-v1 {:sel "li.this.mi a strong"
                          :val-fn first-node-text})
; Channel-specific recognition patterns for individual Dianping channels.
; Outer key: channel (category) name; inner key: field keyword;
; value: {:sel <CSS selector> :val-fn <fn applied to the selection>}.
(def category-patterns-v1
  {
    "结婚" {
           ; Card marker
           :card {:sel "li.i-promo-vip.ip-item div.info strong"
                  :val-fn first-node-text}



           ; Branch shops
           :branch-office {:sel "a.subbranch"
                           :val-fn first-node-text}
           ; Shop introduction
           :intruction {:sel "#shop-intro dd"
                        :val-fn first-node-text}
           ; Photo count: first selected node whose text starts with digits
           :photo-count {:sel "div.shop-gallery a[href$=photos]"
                         :val-fn (fn [t]
                                   (some #(if-let [ret (utils/re-match-group #"(\d+).+" (.text %) 1)]
                                            ret) t))}
           ; Telephone
           :telphone {:sel "dd.shop-info-content [itemprop=tel]"
                      :val-fn first-node-text}
           ; Rating grade, read from the title attribute of the first match
           :grade {:sel "div.comment-rst span.item-rank-rst"
                   :val-fn (fn [t]
                             (if-let [t (first t)] (.attr t "title")))}
           ; Last update time, extracted from the text of the selection
           :update-time {:sel "span.note"
                         :val-fn (fn [t] (if-let [t t]
                                           (let [content (.text t)]
                                             (utils/re-match-group #"系统在(.+)最后更新" content 1))))}
           ; Average spend per person
           :average {:sel "div.comment-rst dl dd"
                     :val-fn first-node-text}
           ; Comment count
           :comment-count {:sel "a[href=#user-review-info] span[itemprop=count]" :val-fn first-node-text}
           ; Opening hours
           :time {:sel "dt:contains(营业时间) ~ dd span" :val-fn first-node-text}
           ; Brand ID: the textarea's value is itself HTML; it is parsed again
           ; and the id is taken from the href of the "span.l-addSub a" link.
           :brand-id {:sel "textarea.J_auto-load"
                      :val-fn (fn [t] (if-let [t (first t)]
                                        (if-let [html-value (.val t)]
                                          (let [html-value-e (Jsoup/parse html-value)
                                                a-href (.select html-value-e "span.l-addSub a")
                                                href (if a-href (.attr a-href "href"))]
                                            (utils/re-match-group #"id=(\d+)" href 1)))))}
           }})

; Main recognition patterns.
; Key: field keyword; value: {:sel <CSS selector> :val-fn <fn applied to the selection>}.
(def patterns-v1 {
                   ; Main category
                   :category category-pattern-v1

                   ; Shop name
                   :name {:sel "body div.shop-name h1.shop-title[itemprop=name itemreviewed]"
                          :val-fn first-node-text}

                   ; Sub-category 1
                   :sub-category1 {:sel "div.breadcrumb a[itemprop=url][onclick~=pageTracker._trackPageview\\('dp_(test_)?(new)?shop_daohang_fenlei1] span.bread-name"
                                   :val-fn first-node-text}
                   ; Sub-category 2
                   :sub-category2 {:sel "div.breadcrumb a[itemprop=url][onclick~=pageTracker._trackPageview\\('dp_(test_)?(new)?shop_daohang_fenlei2] span.bread-name"
                                   :val-fn first-node-text}

                   ; Administrative district
                   :boroughs {:sel "span.region[itemprop=locality region]"
                              :val-fn first-node-text}

                   ; Address
                   :address {:sel "span[itemprop=street-address]"
                             :val-fn first-node-text}

                   ; Telephone (all matches, comma-joined)
                   :telphone {:sel "span.call[itemprop=tel]"
                              :val-fn nodes-text}
                   ; Business district
                   :business-district {:sel "a[itemprop=url][onclick~=^pageTracker._trackPageview\\('dp_(test_)?(new)?shop_daohang_shangqu] span.bread-name"
                                       :val-fn first-node-text}

                   ; Notice; absent on most pages
                   :notice {:sel "span.notice"
                            :val-fn first-node-text}

                   ; Introduction: take the text nodes of the li that contains the
                   ; "简介" em; the content is in the last text node
                   :intruction {:sel "div.desc-list ul li:has(em:contains(简介))"
                                :val-fn (fn [t] (if-let [t (first t)] (last-node-text (.textNodes t))))}

                   ; Recommended items (a nodes)
                   :recommendation {:sel "li a[data-price][data-rate]"
                                    :val-fn nodes-text}

                   ; Opening hours; the content is in the first text node. When parsing it,
                   ; the separator may be "-", "至", "～", "–", "--", "——", "到", "~"
                   :time {:sel "div.desc-list ul li:has(em:contains(营业时间)) span.J_full-cont"
                          :val-fn first-node-text}

                   ; Card marker
                   :card {:sel "a.promo[data-promotype=card]"
                          :val-fn first-node-text}

                   ; Rating grade, read from the title attribute of the first match
                   :grade {:sel "div.comment-rst[itemprop=rating] span.item-rank-rst"
                           :val-fn (fn [t]
                                     (if-let [t (first t)] (.attr t "title")))}

                   ; Average spend per person
                   :average {:sel "div.rst-taste span:contains(人均) strong"
                             :val-fn first-node-text}

                   ; Group buying
                   :team-buying {:sel "a.promo[data-promotype=deal]:has(i.group,span.price)"
                                 :val-fn first-node-text
                                 }

                   ; Online table reservation
                   :revise {:sel "div.promo-list a[data-promotype=booking]:has(i.irevise)"
                            :val-fn first-node-text}

                   ; Comment count: the text between parentheses of the first match
                   :comment-count {:sel "li[data-name=all] span em"
                                   :val-fn (fn [t]
                                             (if-let [t (first t)]
                                               (utils/re-match-group #"\((.+)\)" (.text t) 1)))}


                   ; Photo count
                   :photo-count {:sel "div.pic div.pic-name a"
                                 :val-fn (fn [t]
                                           (some #(if-let [ret (utils/re-match-group #"全部(\d+)张" (.text %) 1)]
                                                    ret) t))}

                   ; Features
                   :feature {:sel "a.feature.J_feature-btn"
                             :val-fn first-node-text}

                   ; Branch shops
                   :branch-office {:sel "a.ep-trigger[title=其它分店]"
                                   :val-fn first-node-text}

                   ; Brand ID: the selection is ignored; the id is regex-matched
                   ; against the raw page text bound to *html-content*
                   :brand-id {:sel "a:contains(添加分店)"
                              :val-fn (fn [t] (if-let [html-content *html-content*]
                                                (if-let [brand-id (utils/re-match-group #"id=(\d+)\".*>添加分店</a>" html-content 1)]
                                                  brand-id
                                                  )))}

                   ; Last update time: matched against the selected items joined
                   ; into one string
                   :update-time {:sel "div.desc-list"
                                 :val-fn (fn [t] (if-let [t t]
                                                   (let [content (string/join "" t)]
                                                     (utils/re-match-group #"系统在(.+)最后更新" content 1))))}

                   ; Whether the shop is closed (always true or false)
                   :close {:sel "div.suspend-receipts"
                           :val-fn (fn [t] (if (some #(utils/re-match-group #"(该商户已关闭)" (.text %) 1) t) true false))}

                   ; Whether Dianping has paused listing this shop (always true or false)
                   :pause {:sel "div.msg-pause"
                           :val-fn (fn [t] (if (some #(utils/re-match-group #"(暂停)" (.text %) 1) t) true false))}

                   ; Breadcrumb: a sequence of {:url .. :title ..} maps
                   :bread-crumb {:sel "div.breadcrumb b a"
                                 :val-fn (fn [t]
                                           (map (fn [e]
                                                  {:url (.attr e "href") :title (.text e)}) t))}

                   ; Shop verification
                   :authentication {:sel "a.v-shop-btn"
                   :val-fn first-node-text}

                   ; Script data: the inner HTML of every script node is handed
                   ; to tools/parse-script
                   :script-data {:sel "script"
                                 :val-fn (fn [t]
                                           (tools/parse-script (map #(.html %) t)))}
                   })

;;; Version 2: 适用于2013-03之后爬取的页面


; Channel-specific recognition patterns, version 2.
; Entries whose :sel is "body" select the whole body and let try-match pick
; the first of several alternative selectors that matches.
(def category-patterns-v2
  {
    "结婚" {
           ; Address: text of the street-address span, otherwise the title
           ; attribute of the shop-addr span
           :address {:sel "body"
                     :val-fn (fn [t]
                               (some identity [(try-match t ["span[itemprop=street-address]"])
                                               (try-match t ["div.shop-addr span"] (fn [t]
                                                                                     (if-let [t (first t)] (.attr t "title"))))]))}
           ; Average spend per person
           :average {:sel "body"
                     :val-fn (fn [t]
                               (try-match t ["div.comment-rst dl dd" "div.comment-rst span:has(em:contains(费用)) strong"]))}
           ; Card marker
           :card {:sel "li.i-promo-vip.ip-item div.info strong"
                  :val-fn first-node-text}

           ; Branch shops
           :branch-office {:sel "a.J_btnBranch"
                           :val-fn first-node-text}
           ; Shop introduction
           :intruction {:sel "#shop-intro dd"
                        :val-fn first-node-text}
           ; Photo count
           :photo-count {:sel "a.more[href$=/photos/album]"
                         :val-fn (fn [t]
                                   (some #(if-let [ret (utils/re-match-group #"(\d+).+" (.text %) 1)]
                                            ret) t))}
           ; Administrative district
           :boroughs {:sel "body"
                      :val-fn (fn [t]
                                (try-match t ["span.region[itemprop=locality region]" "div.shop-addr span a.region"]))
                      }

           ; Brand ID: the selection is ignored; the id is regex-matched
           ; against the raw page text bound to *html-content*
           :brand-id {:sel "a:contains(添加分店)"
                      :val-fn (fn [t] (if-let [html-content *html-content*]
                                        (if-let [brand-id (utils/re-match-group #"id=(\d+)\".*>添加分店</a>" html-content 1)]
                                          brand-id
                                          )))}

           ; Telephone
           :telphone {:sel "body"
                      :val-fn (fn [t]
                                (try-match t ["dd.shop-info-content [itemprop=tel]" "div.tel:has(i.icon-tel) div.t-con span"]))
                      }
           ; Rating grade, read from the title attribute of the first match
           :grade {:sel "div.comment-rst span.item-rank-rst"
                   :val-fn (fn [t]
                             (if-let [t (first t)] (.attr t "title")))}
           ; Last update time, extracted from the text of the selection
           :update-time {:sel "span.note"
                         :val-fn (fn [t] (if-let [t t]
                                           (let [content (.text t)]
                                             (utils/re-match-group #"系统在(.+)最后更新" content 1))))}
           ; Comment count
           :comment-count {:sel "a[href~=#(t-comment|user-review-info)] span[itemprop=count]" :val-fn first-node-text}

           ; Opening hours
           :time {:sel "body"
                  :val-fn (fn [t]
                            (try-match t ["dt:contains(营业时间) ~ dd span" "em:contains(营业时间) ~ span"]))
                  }
           }})

; Main recognition patterns, version 2: patterns-v1 with the three promo
; entries below overridden (they match on data-type instead of data-promotype).
(def patterns-v2 (merge patterns-v1 {
                                      ; Group buying
                                      :team-buying {:sel "a.promo[data-type=deal]:has(i.group,strong.price)"
                                                    :val-fn first-node-text
                                                    }

                                      ; Card marker
                                      :card {:sel "a.promo[data-type=card]"
                                             :val-fn first-node-text}

                                      ; Online table reservation
                                      :revise {:sel "div.promo-list a[data-type=booking]:has(i.irevise)"
                                               :val-fn first-node-text}
                                      }))


; Per-version pattern used to identify the Dianping channel of a page.
; Both versions currently map to category-pattern-v1.
(def version-categroy-pattern {"v1" category-pattern-v1
                               "v2" category-pattern-v1})
; Per-version main page-matching patterns.
(def version-main-patterns {"v1" patterns-v1
                            "v2" patterns-v2})
; Per-version channel-specific matching patterns.
(def version-categroy-patterns {"v1" category-patterns-v1
                                "v2" category-patterns-v2})
(defn parse
  "Extracts a single field from a parsed HTML page.

  Arguments:
    html-content  - raw HTML text; bound to *html-content* while the value
                    function runs.
    html-doc      - object whose `.select` is called with the pattern's :sel
                    (the result of parse-nodes-from-html in parse-all).
    key           - keyword naming the field to extract.
    category      - key used to look up the channel's patterns in c-patterns.
    main-patterns - map of field keyword -> {:sel .. :val-fn ..}; the fallback.
    c-patterns    - map of category -> pattern map, or nil.

  The channel-specific pattern for `key` takes precedence over the main one.
  Returns {:selected <selection> :val <:val-fn applied to the selection>}.
  The 3-argument arity uses patterns-v1 and no channel-specific patterns."
  ;; Delegate with an explicit nil category so the call matches the
  ;; six-argument arity (a five-argument call matches no arity and throws).
  ([html-content html-doc key] (parse html-content html-doc key nil patterns-v1 nil))
  ([html-content html-doc key category main-patterns c-patterns]
    (binding [*html-content* html-content]
      (let [channel-patterns (when c-patterns (get c-patterns category))
            ;; first pattern map that defines `key` wins
            pattern (some #(key %) [channel-patterns main-patterns])
            sel (:sel pattern)
            val-fn (:val-fn pattern)
            selected (.select html-doc sel)
            ;; computed inside `binding` so val-fns can read *html-content*
            val (val-fn selected)]
        {:selected selected :val val}))))

(defn parse-all
  "Parses `html-content` and extracts every field listed in `patterns`.

  Arguments:
    html-content      - raw HTML text of the page.
    patterns          - map of field keyword -> pattern (default patterns-v1).
    category-pattern  - {:sel .. :val-fn ..} used to determine the page's
                        category (default category-pattern-v1).
    category-patterns - map of category -> channel-specific patterns
                        (default category-patterns-v1).

  Returns a map of field keyword -> extracted value, or nil when the HTML is
  nil/empty or `patterns` is nil."
  ([html-content] (parse-all html-content patterns-v1))
  ([html-content patterns] (parse-all html-content patterns category-pattern-v1 category-patterns-v1))
  ([html-content patterns category-pattern category-patterns]
    (when-let [html-doc (parse-nodes-from-html html-content)]
      (let [{:keys [sel val-fn]} category-pattern
            category (val-fn (.select html-doc sel))]
        (when-not (nil? patterns)
          (into {}
                (for [[field _] patterns]
                  [field (:val (parse html-content html-doc field category patterns category-patterns))])))))))

(defn parse-all-by-version
  "Extracts every field from `html-content` using the pattern set registered
  for `version`.

  Arguments:
    html-content - raw HTML text of the page.
    version      - key of the version lookup maps (\"v1\" or \"v2\");
                   defaults to \"v1\".

  Returns what parse-all returns: a map of field keyword -> value, or nil
  when the HTML is nil/empty."
  ;; Default to the \"v1\" pattern set; the version must be a key of the
  ;; version lookup maps, and the call must go through this function rather
  ;; than hand the version to parse-all as its pattern map.
  ([html-content] (parse-all-by-version html-content "v1"))
  ([html-content version]
    ;; parse-all parses the HTML itself and returns nil for empty input, so
    ;; no separate parse is needed here.
    (parse-all html-content
               (version-main-patterns version)
               (version-categroy-pattern version)
               (version-categroy-patterns version))))
