Last active: January 2, 2021 18:41
Gist: kornysietsma/5939456
parsing wikipedia dumps in clojure
(ns wikiparse.core
  (:require [clojure.java.io :as io]
            [clojure.data.xml :as xml]
            [clojure.zip :refer [xml-zip]]
            [clojure.data.zip.xml :refer [xml-> xml1-> text]])
  (:import [org.apache.commons.compress.compressors.bzip2 BZip2CompressorInputStream])
  (:gen-class :main true))

(defn bz2-reader
  "Returns a streaming Reader for the given compressed BZip2
  file. Use within (with-open)."
  [filename]
  (-> filename io/file io/input-stream BZip2CompressorInputStream. io/reader))

(defn process-music-artist-page
  "Process a wikipedia page, printing the title if it's a musical artist"
  [page]
  (let [z (xml-zip page)
        title (xml1-> z :title text)
        page-text (xml1-> z :revision :text text)]
    ;; guard against pages with no revision text, which would otherwise
    ;; throw a NullPointerException inside re-find
    (when (and page-text
               (re-find #"\{\{Infobox musical artist" page-text))
      (println title))))

(defn wiki-music-artists
  "Parse up to [max] pages from a wikipedia dump, printing those that are musical artists"
  [filename max]
  (with-open [rdr (bz2-reader filename)]
    ;; xml/parse produces a lazy tree, so dorun walks the pages one at a
    ;; time for side effects without retaining them in memory
    (dorun (->> (xml/parse rdr)
                :content
                (filter #(= :page (:tag %)))
                (take max)
                (map process-music-artist-page)))))

(def wikifile "enwiki-latest-pages-articles.xml.bz2")

(defn -main
  [& args]
  (wiki-music-artists wikifile 100000000))
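The functions above can also be exercised from a REPL without invoking -main. A minimal sketch; the small test filename and the page count are assumptions for illustration, not part of the gist:

```clojure
(require '[wikiparse.core :refer [wiki-music-artists]])

;; Parse only the first 1000 pages of a (hypothetical) local dump,
;; printing the titles of any that contain the musical-artist infobox:
(wiki-music-artists "enwiki-sample-pages.xml.bz2" 1000)
```

Keeping `max` small is useful for checking the parsing logic before committing to a multi-hour run over the full dump.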
Note: you can get a torrent of the wikipedia dump at http://burnbit.com/torrent/246958/enwiki_latest_pages_articles_xml_bz2 - it's 9G bzipped, or 42G unzipped (which is why the code above works on the bzipped version!).
An earlier version of this, which only looked at page titles, took 94 minutes to parse the 42G XML file on my Macbook Pro.
This version takes 115 minutes, presumably due to the extra effort of running regular expressions over the text of every page. Peak memory use is around 800M.