(ns matteoredaelli.surf-http
  (:import (org.jsoup Jsoup))
  (:require [matteoredaelli.uri-ext :as uri-ext])
  )

(defn get-connection
  "Creates a Jsoup connection for `address`, sets the request header
  named \"a\" to the value \"b\" on it, and returns the connection.
  The request itself is not executed here."
  [address]
  ;; NOTE(review): the \"a\"/\"b\" header looks like a placeholder - confirm
  ;; which header (e.g. a user agent) was actually intended.
  (doto (Jsoup/connect address)
    (.header "a" "b")))

(defn get-page-at-address
  "Obtains a connection for `address` via `get-connection`, executes it,
  and returns the resulting response object."
  [address]
  (.execute (get-connection address)))

(defn extract-link-data
  "Returns the value of the \"abs:href\" attribute of `link`."
  [link]
  (.attr link "abs:href"))

(defn extract-head-meta-content
  "Selects `head > meta` elements of `soup` whose attribute `name` equals
  `value` and returns the \"content\" attribute of that selection.

  Example: (extract-head-meta-content soup \"name\" \"description\")
  queries the selector head > meta[name=\"description\"]."
  [soup name value]
  (let [selector (format "head > meta[%s=\"%s\"]" name value)]
    (-> soup
        (.select selector)
        (.attr "content"))))

(defn extract-element-text
  "Runs the selector string `element` against `soup` and returns the
  text of the resulting selection."
  [soup element]
  (-> soup
      (.select element)
      .text))

(defn extract-links
  "Selects every `a` element in `soup`, maps each one through
  `extract-link-data`, and returns the distinct results."
  [soup]
  (->> (.select soup "a")
       (mapv extract-link-data)
       distinct))

(defn surf
  "Fetches the page at `address`, parses it, and returns a map describing it.

  Keys of the returned map:
    :http-headers         headers of the HTTP response
    :base_url             the `address` argument, unchanged
    :body-headers         text of the h1/h2 selection
    :body-external-links  result of uri-ext/filter-external-links
    :body-internal-links  result of uri-ext/filter-internal-links
    :head-title           vector of [document title, og:title meta content]
    :head-description     vector of [description meta, og:description meta]
    :head-keywords        content of the keywords meta tag
    :emails               result of uri-ext/filter-email-links on all links

  The external/internal link lists are computed from the page links after
  they pass through uri-ext/remove-empty-links,
  uri-ext/remove-links-with-fragment and uri-ext/remove-links-with-mailto."
  [address]
  ; TODO: extract logos img > src *logo*
  (let [response    (get-page-at-address address)
        document    (.parse response)
        headings    (extract-element-text document "h1,h2")
        all-links   (extract-links document)
        email-links (uri-ext/filter-email-links all-links)
        ;; Every cleaning step takes a single argument, so the threading
        ;; position does not matter.
        web-links   (-> all-links
                        uri-ext/remove-empty-links
                        uri-ext/remove-links-with-fragment
                        uri-ext/remove-links-with-mailto)
        external    (uri-ext/filter-external-links web-links address)
        internal    (uri-ext/filter-internal-links web-links address)
        ;; Local shorthand for meta lookups against this document.
        meta-of     (fn [attr-name attr-value]
                      (extract-head-meta-content document attr-name attr-value))
        description [(meta-of "name" "description")
                     (meta-of "property" "og:description")]
        keywords    (meta-of "name" "keywords")
        title       [(.title document)
                     (meta-of "property" "og:title")]]
    {:http-headers        (.headers response)
     :base_url            address
     :body-headers        headings
     :body-external-links external
     :body-internal-links internal
     :head-title          title
     :head-description    description
     :head-keywords       keywords
     :emails              email-links}))
