-
-
Save bcambel/f26028f0e8b02c617e76 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (ns wikiparse.core | |
| (:require [clojure.java.io :as io] | |
| [clojure.data.xml :as xml] | |
| [clojure.zip :refer [xml-zip]] | |
| [clojure.data.zip.xml :refer [xml-> xml1-> text]]) | |
| (:import [ org.apache.commons.compress.compressors.bzip2 BZip2CompressorInputStream]) | |
| (:gen-class :main true)) | |
(defn bz2-reader
  "Opens `filename` as a BZip2-compressed file and returns a Reader over its
  decompressed content. The caller is responsible for closing the Reader;
  bind it in `with-open`."
  [filename]
  (let [raw-stream (io/input-stream (io/file filename))
        unzipped   (BZip2CompressorInputStream. raw-stream)]
    (io/reader unzipped)))
(defn process-music-artist-page
  "Inspects one parsed wikipedia :page element and prints its title when the
  revision text contains a `{{Infobox musical artist` template.

  `page` is the parsed XML element for a single page. Pages for which no
  :revision/:text content is found are skipped silently. Always returns nil."
  [page]
  (let [z         (xml-zip page)
        title     (xml1-> z :title text)
        page-text (xml1-> z :revision :text text)]
    ;; xml1-> yields nil when the path does not match; re-find throws a
    ;; NullPointerException on nil input, which would abort the whole run,
    ;; so check for text before matching.
    (when (and page-text
               (re-find #"\{\{Infobox musical artist" page-text))
      (println title))))
(defn wiki-music-artists
  "Parses the BZip2-compressed wikipedia XML dump at `filename` and passes at
  most `max` of its top-level :page elements to `process-music-artist-page`,
  which prints the titles of musical-artist pages. Returns nil."
  [filename max]
  (letfn [(page? [element]
            (= :page (:tag element)))]
    (with-open [rdr (bz2-reader filename)]
      ;; The sequence is built inline in the doseq binding so that no local
      ;; keeps a reference to the head of the page sequence.
      (doseq [page (take max (filter page? (:content (xml/parse rdr))))]
        (process-music-artist-page page)))))
(def wikifile
  "Path of the wikipedia dump file that `-main` reads."
  "enwiki-latest-pages-articles.xml.bz2")
(defn -main
  "Command-line entry point. Usage: [dump-file [max-pages]]

  dump-file  path to a BZip2-compressed wikipedia XML dump; defaults to
             `wikifile`.
  max-pages  maximum number of pages to examine, as a decimal integer;
             defaults to 100000000.

  Throws NumberFormatException if max-pages is not a valid integer."
  [& args]
  (let [[file-arg max-arg] args
        filename  (or file-arg wikifile)
        ;; Command-line arguments arrive as strings; `take` needs a number.
        max-pages (if max-arg (Long/parseLong max-arg) 100000000)]
    (wiki-music-artists filename max-pages)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment