Skip to content

Instantly share code, notes, and snippets.

@SethTisue
Last active December 7, 2016 01:42
Show Gist options
  • Save SethTisue/960016 to your computer and use it in GitHub Desktop.
Save SethTisue/960016 to your computer and use it in GitHub Desktop.
parse iTunes library XML, compile stats. shows how to use Scala's XML pull API, by putting some DSL-y goodness on top of it
/// nicer API/DSL for dealing with XMLEventReader
/** Small convenience layer over `XMLEventReader`.
  *
  * Events are consumed strictly in document order. The `start`/`end` methods
  * double as assertions: if the next event is not of the expected shape, the
  * underlying pattern match fails with a `MatchError`.
  *
  * @param source character source containing the XML document
  */
class Reader(source: io.Source) {
  import scala.xml.pull._

  // preserveWS is switched off on the underlying parser.
  private val reader =
    new XMLEventReader(source) { override val preserveWS = false }

  // Buffered so the next event can be inspected (`head`) without consuming it.
  private val it: BufferedIterator[XMLEvent] = reader.buffered

  /** Consumes an element-start event and returns its label. */
  def start(): String =
    it.next() match {
      case EvElemStart(_, label, _, _) => label
    }

  /** Consumes an element-start event, which must be labelled `s`. */
  def start(s: String): Unit =
    it.next() match {
      case EvElemStart(_, `s`, _, _) => ()
    }

  /** Consumes an element-end event and returns its label. */
  def end(): String =
    it.next() match {
      case EvElemEnd(_, label) => label
    }

  /** Consumes an element-end event, which must be labelled `s`. */
  def end(s: String): Unit =
    it.next() match {
      case EvElemEnd(_, `s`) => ()
    }

  /** Consumes the run of consecutive text events at the current position and
    * returns them concatenated (the empty string if the next event is not text).
    */
  // BufferedIterator.takeWhile would advance the iterator one event too far
  // (see https://issues.scala-lang.org/browse/SI-3581), so we peek at `head`
  // and only call `next()` once we know the event is text.
  def text(): String = {
    val collected = new StringBuilder
    @annotation.tailrec
    def gather(): Unit = it.head match {
      case EvText(chunk) =>
        it.next()
        collected ++= chunk
        gather()
      case _ => ()
    }
    gather()
    collected.toString
  }

  /** Consumes one complete `<x>text</x>` element and returns its text. */
  def slurp(): String = {
    start()
    val body = text()
    end()
    body
  }

  /** Consumes one complete `<s>text</s>` element and returns its text. */
  def slurp(s: String): String = {
    start(s)
    val body = text()
    end(s)
    body
  }

  /** True if the next (unconsumed) event is an element-end event. */
  def atEnd: Boolean = it.head match {
    case _: EvElemEnd => true
    case _            => false
  }

  /** True if the next (unconsumed) event is the end of an element labelled `s`. */
  def atEnd(s: String): Boolean = it.head match {
    case EvElemEnd(_, label) => s == label
    case _                   => false
  }

  /** Stops the underlying event reader. */
  def stop(): Unit = reader.stop()
}
/// read the XML file
/** One entry of the library, as assembled by `getTracks`.
  *
  * @param name       value of the "Name" key
  * @param artist     value of "Sort Artist" if present, otherwise "Artist"
  * @param album      value of "Album", or "" when absent
  * @param time       value of "Total Time" (presumably milliseconds: the
  *                   reports treat 1200000 as 20 minutes)
  * @param plays      value of "Play Count", or 0 when absent
  * @param lastPlayed value of "Play Date", or 0 when absent
  */
case class Track(
  name: String,
  artist: String,
  album: String,
  time: Long,
  plays: Int,
  lastPlayed: Long
)
/** Streams the tracks of an iTunes library XML file.
  *
  * The returned collection parses `source` while it is being traversed:
  * `plist` -> `dict`, skip key/value pairs until the key "Tracks", then one
  * `key` + `dict` pair per track. Every traversal builds a new `Reader` on the
  * same `source`, so the result is meant to be traversed once.
  *
  * Entries lacking a required key ("Name", "Artist"/"Sort Artist",
  * "Total Time") are dropped; they are printed unless `skippable` says they
  * are expected non-music entries.
  *
  * @param source character source positioned at the start of the XML document
  * @return a single-pass traversable of the tracks found
  */
def getTracks(source: io.Source): Traversable[Track] =
  new Traversable[Track] {
    override def foreach[T](fn: Track => T): Unit = {
      val reader = new Reader(source)
      import reader._
      // Stop the reader even if parsing or the callback throws.
      try {
        start("plist")
        start("dict")
        // Skip the top-level key/value pairs that precede the track table.
        while (slurp("key") != "Tracks")
          slurp()
        start("dict")
        while (!atEnd) {
          slurp("key") // the track id key; the same data is inside the dict
          start("dict")
          val entries = {
            val temp = collection.mutable.Map[String, String]()
            while (!atEnd("dict"))
              temp += slurp("key") -> slurp()
            end("dict")
            temp.toMap
          }
          // Only the construction of the Track is guarded, so that a
          // NoSuchElementException thrown by the caller's callback is not
          // mistaken for a malformed entry and silently dropped.
          val track: Option[Track] =
            try
              Some(Track(
                name = entries("Name"),
                artist = entries.getOrElse("Sort Artist", entries("Artist")),
                album = entries.getOrElse("Album", ""),
                time = entries("Total Time").toLong,
                plays = entries.getOrElse("Play Count", "0").toInt,
                lastPlayed = entries.getOrElse("Play Date", "0").toLong))
            catch {
              case _: java.util.NoSuchElementException =>
                if (!skippable(entries))
                  println(entries)
                None
            }
          track.foreach(fn)
        }
      } finally reader.stop()
    }
  }
/** Decides whether an entry that could not be turned into a `Track` is an
  * expected non-music item and so need not be reported.
  *
  * An entry is skippable when it has no "Kind", when its "Kind" ends with
  * " app" or " book" or equals "Book", or when its "Genre" is "Podcast".
  * A missing "Genre" simply means "not a podcast"; it no longer throws.
  *
  * @param entries the raw key/value pairs of one library entry
  * @return true if the entry can be ignored silently
  */
def skippable(entries: Map[String, String]): Boolean =
  entries.get("Kind") match {
    case None => true
    case Some(kind) =>
      kind.endsWith(" app") ||
      kind.endsWith(" book") ||
      kind == "Book" ||
      entries.get("Genre") == Some("Podcast")
  }
/** Reads every track from `source` and aggregates the statistics used by the
  * reports.
  *
  * Returns six immutable maps, in this order:
  *  - trackNames:    "artist - name"  -> listening time (time * plays)
  *  - artists:       artist           -> listening time
  *  - artistLengths: artist           -> summed track length
  *  - albums:        "artist - album" -> listening time
  *  - albumLengths:  "artist - album" -> summed track length
  *  - playDates:     artist           -> most recent `lastPlayed` value
  *
  * @param source character source containing the library XML
  */
def read(source: io.Source) = {
  def newMap = collection.mutable.Map[String, Long]()
  // `val a, b = e` evaluates `e` once per name, so these are six distinct maps.
  val trackNames, artists, artistLengths, albums, albumLengths, playDates = newMap

  // Adds `amount` to the running total stored under `key`.
  def add(totals: collection.mutable.Map[String, Long], key: String, amount: Long): Unit =
    totals(key) = amount + totals.getOrElse(key, 0L)

  for (track <- getTracks(source)) {
    import track._
    val listened = time * plays
    val artistAlbum = artist + " - " + album
    // Accumulate rather than overwrite: the same "artist - name" can occur
    // more than once in a library, and its listening time should be summed
    // just like the per-artist and per-album totals.
    add(trackNames, artist + " - " + name, listened)
    add(artists, artist, listened)
    add(artistLengths, artist, time)
    add(albums, artistAlbum, listened)
    add(albumLengths, artistAlbum, time)
    // Keep the latest play date seen for the artist.
    playDates(artist) = math.max(lastPlayed, playDates.getOrElse(artist, lastPlayed))
  }
  (trackNames.toMap, artists.toMap, artistLengths.toMap, albums.toMap, albumLengths.toMap, playDates.toMap)
}
/// reports
/** Prints artists ranked by total listening time, each with its share of the
  * overall listening time as a percentage (three decimals, truncated).
  *
  * When the overall total is 0 (nothing has been played) every share is
  * reported as 0.0% instead of failing with a division by zero.
  *
  * @param artists artist -> total listening time
  */
def artistReport1(artists: Map[String, Long]): Unit = {
  println("ARTISTS (TOTAL LISTENING TIME)")
  val totalTime = artists.values.sum
  artists.toList.sortBy(_._2).reverse.zipWithIndex.foreach {
    case ((artist, time), index) =>
      // Integer arithmetic first, then scale: yields a percentage truncated
      // to three decimal places.
      val percent =
        if (totalTime == 0L) 0.0
        else (100000 * time / totalTime) / 1000.0
      println((index + 1).toString + ". " + artist + " - " + percent + "%")
  }
}
/** Prints artists ranked by listening time relative to the total length of
  * their tracks. Only artists whose summed track length exceeds 1200000 are
  * listed.
  *
  * @param artists       artist -> total listening time
  * @param artistLengths artist -> summed track length
  */
def artistReport2(artists: Map[String, Long], artistLengths: Map[String, Long]): Unit = {
  println("ARTISTS (TOTAL LISTENING TIME VS. LENGTH, 20 MIN MIN)")
  val ratios =
    for ((who, length) <- artistLengths.toList if length > 1200000)
      yield (who, artists(who) / length.toDouble)
  // Ascending sort followed by reverse gives highest ratio first.
  val ranked = ratios.sortBy(_._2).reverse
  var position = 0
  for ((who, _) <- ranked) {
    position += 1
    println(position.toString + ". " + who)
  }
}
/** Prints the albums that have been listened to at all, longest total
  * listening time first.
  *
  * @param albums "artist - album" -> total listening time
  */
def albumReport1(albums: Map[String, Long]): Unit = {
  println("ALBUMS (TOTAL LISTENING TIME)")
  val played = albums.toList.collect { case entry @ (_, listened) if listened > 0 => entry }
  // Stable descending sort: ties stay in their original relative order.
  val ranked = played.sortBy(_._2)(Ordering[Long].reverse)
  for (((title, _), i) <- ranked.zipWithIndex)
    println((i + 1).toString + ". " + title)
}
/** Prints every album with a positive listening time, ranked by listening
  * time divided by the album's summed track length (highest first).
  *
  * @param albums       "artist - album" -> total listening time
  * @param albumLengths "artist - album" -> summed track length
  */
def albumReport2(albums: Map[String, Long], albumLengths: Map[String, Long]): Unit = {
  println("ALL ALBUMS (LISTENING TIME VS. LENGTH)")
  val scored =
    for ((title, listened) <- albums.toList if listened > 0)
      yield (title, listened.toDouble / albumLengths(title))
  val titles = scored.sortBy(_._2).reverse.map(_._1)
  // Pair each title with its 1-based rank.
  Iterator.from(1).zip(titles.iterator).foreach {
    case (rank, title) => println(rank.toString + ". " + title)
  }
}
/** Like `albumReport2`, but restricted to albums whose summed track length
  * exceeds 600000.
  *
  * @param albums       "artist - album" -> total listening time
  * @param albumLengths "artist - album" -> summed track length
  */
def albumReport3(albums: Map[String, Long], albumLengths: Map[String, Long]): Unit = {
  println("FULL ALBUMS (LISTENING TIME VS. LENGTH)")
  val scored = albums.toList.flatMap {
    case (title, listened) =>
      // The length is looked up only for albums with a positive listening time.
      if (listened > 0 && albumLengths(title) > 600000)
        List((title, listened.toDouble / albumLengths(title)))
      else
        Nil
  }
  val ranked = scored.sortBy(_._2).reverse
  for (((title, _), i) <- ranked.zipWithIndex)
    println((i + 1).toString + ". " + title)
}
/** Prints up to 1000 tracks with a positive listening time, longest total
  * listening time first.
  *
  * @param trackNames "artist - name" -> total listening time
  */
def trackReport(trackNames: Map[String, Long]): Unit = {
  println("TRACKS (TOTAL LISTENING TIME)")
  val played = trackNames.toList.filter { case (_, listened) => listened > 0 }
  val top = played.sortBy(_._2).reverse.take(1000)
  var rank = 0
  for ((title, _) <- top) {
    rank += 1
    println(rank.toString + ". " + title)
  }
}
/** Prints two lists: artists with a positive last-played value, most recent
  * first, followed by the artists whose value is exactly 0, alphabetically.
  *
  * @param playDates artist -> most recent `lastPlayed` value (0 = never)
  */
def lastPlayedReport(playDates: Map[String, Long]): Unit = {
  val byArtist = playDates.toList
  val recent = byArtist.filter { case (_, when) => when > 0 }.sortBy(_._2).reverse
  val never = byArtist.collect { case (who, 0L) => who }.sorted

  println("RECENTLY LISTENED ARTISTS")
  for (((who, _), i) <- recent.zipWithIndex)
    println((i + 1).toString + ". " + who)
  println()
  println("NEVER LISTENED ARTISTS")
  never.foreach(who => println(who))
}
/// do it!
// Location of the exported iTunes library; machine-specific.
val path = "/Users/tisue/Dropbox/Archive/iTunes/iTunes Library.xml"
val (trackNames, artists, artistLengths, albums, albumLengths, playDates) = {
  // The library file is decoded explicitly as UTF-8 rather than with the
  // platform default charset (NOTE(review): iTunes exports declare
  // encoding="UTF-8" in their XML header; confirm for your file).
  val source = io.Source.fromFile(path, "UTF-8")
  // Release the file handle whether or not parsing succeeds.
  try read(source)
  finally source.close()
}
artistReport1(artists)
println()
artistReport2(artists, artistLengths)
println()
albumReport1(albums)
println()
albumReport2(albums, albumLengths)
println()
albumReport3(albums, albumLengths)
println()
trackReport(trackNames)
println()
lastPlayedReport(playDates)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment