Skip to content

Instantly share code, notes, and snippets.

@eklitzke
Created June 29, 2012 01:33
Show Gist options
  • Select an option

  • Save eklitzke/3015151 to your computer and use it in GitHub Desktop.

Select an option

Save eklitzke/3015151 to your computer and use it in GitHub Desktop.
(ns clojure-crawler.core
(:require [cheshire.core :as cheshire]
[clj-http.client :as client]
[clojure.tools.cli :as cli]))
(defn- read-forms
  "Read every top-level form from `file` and return them as a vector.

  `file` is anything clojure.java.io/reader accepts. Reading happens with
  *read-eval* bound to false, so `#=` forms in the file are never evaluated.

  Reading is best-effort: if the reader throws part-way through the file
  (for example on unbalanced parentheses), the forms read up to that point
  are returned instead of propagating the exception."
  [file]
  (with-open [r (java.io.PushbackReader. (clojure.java.io/reader file))]
    (binding [*read-eval* false]
      ;; A fresh object as the end-of-file value lets `read` signal EOF
      ;; without throwing, and it can never collide with a value in the file.
      (let [eof (Object.)]
        ;; loop/recur keeps the stack depth constant no matter how many forms
        ;; the file holds. The `try` wraps only the single `read` call so that
        ;; `recur` stays in tail position.
        (loop [forms []]
          (let [form (try
                       (read r false eof)
                       ;; Malformed input: stop and keep what we have so far.
                       (catch Exception _ eof))]
            (if (identical? form eof)
              forms
              (recur (conj forms form)))))))))
(defn- take-until
  "Return a vector holding the leading elements of seq, stopping just
  before the first element for which stop? returns logical true. The
  stopping element itself is not included. An empty or nil seq yields []."
  [stop? seq]
  ;; take-while with the negated predicate selects exactly the prefix;
  ;; pouring it into [] makes the result an eager vector.
  (into [] (take-while (complement stop?) seq)))
(defn- parse-form
  "Helper for parse-forms.

  `form` is a defn/defmacro form with the leading defn/defmacro symbol
  already removed, i.e. (name doc-string? attr-map? params-or-arities ...).
  `kind` is the value stored under :kind in every result.

  Returns a sequence of maps with the keys :kind, :name, :doc, :args and
  :attrs -- a single map when the definition has one parameter vector,
  otherwise one map per arity list. :doc defaults to \"\", :args to [] and
  :attrs to {}. Returns [] when `form` has no name at all."
  [form kind]
  (let [;; The declaration part (name, docstring, attr-map) ends at the first
        ;; vector (parameter list) or list (an arity body). vector?/list? are
        ;; used rather than exact class checks so the empty list () counts too.
        body-start? #(or (vector? %) (list? %))
        decl (take-until body-start? form)
        num-attrs (count decl)]
    (if (zero? num-attrs)
      ;; Nothing that could serve as a name, e.g. a bare (defn): skip the form
      ;; rather than failing the whole run.
      []
      (let [impl (drop num-attrs form)
            first-impl (first impl)
            parse-templ {:kind kind
                         :name (first decl)
                         :doc ""
                         :args []
                         :attrs {}}
            extras (case num-attrs
                     1 {}
                     2 (let [snd (second decl)]
                         (if (string? snd)
                           {:doc snd}
                           {:attrs snd}))
                     3 {:doc (second decl)
                        :attrs (nth decl 2)}
                     ;; More leading elements than defn allows: keep only the
                     ;; name instead of throwing "No matching clause".
                     {})
            noargs-parse (merge parse-templ extras)
            make-args-parse #(merge noargs-parse {:args %})]
        (if (vector? first-impl)
          ;; Single-arity definition: the parameter vector follows directly.
          [(make-args-parse first-impl)]
          ;; Multi-arity definition: each remaining element is (params body...).
          (map (comp make-args-parse first) impl))))))
(defn- get-last-definitions-helper
  "Worker for get-last-definitions. Walks `remaining` in order and conj'es
  onto `result` each form whose :name has not been encountered yet, using
  `seen` as the set of names encountered so far. The caller is expected to
  pass the forms already reversed, so that the first occurrence met here is
  the last definition in the original order."
  [seen result remaining]
  (first
   (reduce (fn [[kept known] entry]
             (let [id (entry :name)]
               ;; A name counts as known when looking it up in the set
               ;; yields anything other than nil.
               (if (some? (known id))
                 [kept known]
                 [(conj kept entry) (conj known id)])))
           [result seen]
           remaining)))
(defn- get-last-definitions
  "Keep only the last form for each :name in `forms`. Clojure permits a
  defn/defmacro to be redefined later in a file, and only the final
  definition is the one of interest.

  The input is reversed before filtering, which forces evaluation of the
  whole forms seq; the result is returned in that reversed order (last
  definition first)."
  [forms]
  (let [newest-first (reverse forms)
        no-names-seen #{}]
    (get-last-definitions-helper no-names-seen [] newest-first)))
(defn- not-private?
  "Return true unless the parse map is marked private, i.e. unless the
  value under :private in its :attrs is exactly true. A missing :attrs,
  or an :attrs value that is not a map, counts as not private."
  [parse]
  ;; `get` is used for both lookups because :attrs may hold whatever the
  ;; second element of the definition was; invoking such a value as a
  ;; function (or invoking nil) would throw.
  (not (true? (get (get parse :attrs) :private))))
(defn parse-forms
  "Turn a sequence of top-level forms into a map describing one file:
  :filename is the given filename, :lang is \"clojure\" and :parses is the
  sequence of parse maps for the forms that were recognised.

  Only list forms headed by the symbols defn or defmacro are parsed;
  everything else is ignored, as are parses whose attributes mark them
  private. Supporting further forms means adding an entry to the dispatch
  table below."
  [forms filename]
  (let [kinds {'defn :defn
               'defmacro :defmacro}
        parse-one (fn [form]
                    (when (= (type form) clojure.lang.PersistentList)
                      ;; Unrecognised heads yield nil, which contributes
                      ;; nothing to the concatenated result.
                      (when-let [kind (kinds (first form))]
                        (parse-form (rest form) kind))))]
    {:filename filename
     :lang "clojure"
     :parses (->> forms
                  (mapcat parse-one)
                  (remove nil?)
                  (filter not-private?))}))
(defn index-file
  "Read the top-level forms of `file` and parse the interesting ones.
  The result is the map built by parse-forms, with the path component of
  the file's URI as its filename."
  [file]
  (let [forms (read-forms file)
        path (-> file .toURI .getPath)]
    (parse-forms forms path)))
(defn- get-matching-files
  "Return a lazy seq of java.io.File objects for the regular files found
  anywhere under `directories` whose file name (not the full path) matches
  the regex `pattern` in its entirety.

  `directories` is a seq of anything clojure.java.io/file accepts."
  [directories pattern]
  (->> directories
       (map clojure.java.io/file)
       (mapcat file-seq)
       ;; file-seq yields directories as well as files. A directory whose
       ;; name happens to match the pattern must not be returned, because
       ;; callers go on to open every result for reading.
       (filter #(and (.isFile %)
                     (re-matches pattern (.getName %))))))
;; Disabled: the #_ reader macro in front of this form makes the reader
;; discard it, so index-directories is NOT defined when this file is loaded.
;; It is kept as a sketch of an agent-based variant that sends each matching
;; file to its own agent, waits for all of them, and flattens the results.
#_(defn index-directories
"Index the contents of the dirs listed in the seq 'directories'.
This function uses multiple threads, by sending each file to be indexed to
its own agent (by way of the index-file function)."
[directories pattern]
(let [agents (map #(send (agent %) index-file)
(get-matching-files directories pattern))]
(apply await agents)
(flatten (map deref agents))))
(defn index-directories-sequentially
  "Index every file under `directories` whose name matches `pattern`,
  without using agents. Returns a lazy seq with one index-file result per
  matching file; the files are indexed as the seq is consumed."
  [directories pattern]
  (->> (get-matching-files directories pattern)
       (map index-file)))
(defn upload-results
  "POST the parse results to `endpoint`, one request per file.

  `results` is a seq of per-file maps (as produced by index-file). For each
  one the form fields sent are the map's own keys, with :parses replaced by
  its JSON encoding, plus :api-key set to `api-key`."
  [results endpoint api-key]
  (doseq [file-results results]
    (let [fields (merge file-results
                        {:parses (cheshire/generate-string
                                  (file-results :parses))
                         :api-key api-key})]
      ;; The second argument to client/post is the request-options map, so
      ;; the payload has to be nested under :form-params. Passed directly,
      ;; its keys are not recognised options and the request has no body.
      (client/post endpoint {:form-params fields}))))
(defn -main
  "Command-line entry point. Parses the options, indexes every .clj file
  under the directories given as positional arguments and, unless --test
  is set, uploads the results to the configured endpoint.

  Exits with status 0 after printing the usage banner for --help, with
  status 1 when no directories are given, and with status 0 on completion."
  [& argv]
  (let [[opts dirs usage]
        (cli/cli argv
                 ["-h" "--help" "Show help" :default false :flag true]
                 ["--endpoint" :default "http://codesear.ch/api/upload"]
                 ["--api-key" :default nil]
                 ["--test" "Run in test mode (don't upload parses)"
                  :default false :flag true])]
    (when (:help opts)
      (println usage)
      (System/exit 0))
    (when (empty? dirs)
      (println "No directories specified for indexing!")
      (System/exit 1))
    (let [indexed (index-directories-sequentially dirs #"[^.].*\.clj$")
          ;; Counting the parses realizes the lazy results, so all parsing
          ;; happens before anything is printed or uploaded.
          total (apply + (map #(count (:parses %)) indexed))]
      (if-not (:test opts)
        (do
          (println (str "Parsed " total " forms, uploading..."))
          (upload-results indexed (:endpoint opts) (:api-key opts))
          (println "Successfully uploaded results"))
        (println (str "Parsed " total " forms, exiting...")))))
  (System/exit 0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment