Skip to content

Instantly share code, notes, and snippets.

@oliverdaff
Created October 31, 2011 22:50
Show Gist options
  • Save oliverdaff/1329291 to your computer and use it in GitHub Desktop.
Save oliverdaff/1329291 to your computer and use it in GitHub Desktop.
Parse the Project Gutenberg catalog using the Scala pull parser
import java.io.{FileInputStream, BufferedInputStream}
import scala.io.Source
import scala.xml.pull.{XMLEventReader, EvElemStart, EvElemEnd, EvText}
/**
 * Command-line tool that streams an RDF/XML catalog through the Scala XML pull
 * parser, collects one [[Book]] per `etext` element and prints the result as
 * tab-separated lines.
 */
object GutenbergCrunch {
  // Local import so this object does not depend on the file-level import list.
  import scala.xml.pull.EvEntityRef

  /** Catalog read when no path is given on the command line. */
  private val DefaultCatalogPath = "src/main/resources/catalog.rdf"

  /**
   * Replacement text for the five entities predefined by XML. The pull parser
   * reports these as `EvEntityRef` events rather than as part of `EvText`.
   */
  private val PredefinedEntities = Map(
    "amp"  -> "&",
    "lt"   -> "<",
    "gt"   -> ">",
    "quot" -> "\"",
    "apos" -> "'"
  )

  /**
   * Consumes events up to and including the closing `pgterms:etext` tag (or the
   * end of the stream) and extracts the fields of interest.
   *
   * Elements are matched by local name only, whatever their namespace prefix.
   * If an element occurs more than once, the last occurrence wins.
   *
   * @param parser event stream positioned just after the opening `etext` tag
   * @return `(title, creator, friendlytitle)`; a field that was not found is `""`
   */
  private def parseBook(parser: XMLEventReader): (String, String, String) = {
    var title = ""
    var creator = ""
    var friendlyTitle = ""
    var done = false
    while (!done && parser.hasNext) {
      parser.next() match {
        case EvElemStart(_, "title", _, _) =>
          title = getText(parser, "title")
        case EvElemStart(_, "creator", _, _) =>
          creator = getText(parser, "creator")
        case EvElemStart(_, "friendlytitle", _, _) =>
          friendlyTitle = getText(parser, "friendlytitle")
        case EvElemEnd("pgterms", "etext") =>
          done = true
        case _ => // not a field we collect
      }
    }
    (title, creator, friendlyTitle)
  }

  /**
   * Concatenates all character data found before the next end tag whose local
   * name is `inTag`, consuming that end tag as well. Text inside nested child
   * elements is included; the child tags themselves are skipped.
   *
   * @param parser event stream positioned just after the opening tag
   * @param inTag  local name of the element whose text is being collected
   * @return the collected text, or `""` if the element has no character data
   */
  private def getText(parser: XMLEventReader, inTag: String): String = {
    val text = new StringBuilder
    var done = false
    while (!done && parser.hasNext) {
      parser.next() match {
        case EvElemEnd(_, `inTag`) =>
          done = true
        case EvText(chunk) =>
          text.append(chunk)
        case EvEntityRef(name) =>
          // Without this case "Pride &amp; Prejudice" would lose its ampersand.
          // An unrecognised name is kept verbatim instead of being dropped.
          text.append(PredefinedEntities.getOrElse(name, "&" + name + ";"))
        case _ => // nested tags, comments, processing instructions
      }
    }
    text.toString
  }

  /**
   * Parses the catalog, then prints the number of books found followed by one
   * tab-separated line per book. Books are prepended to the result list as they
   * are found, so the listing comes out in reverse document order.
   *
   * @param args optional first element: path of the catalog file; defaults to
   *             `src/main/resources/catalog.rdf`
   */
  def main(args: Array[String]): Unit = {
    val catalogPath = args.headOption.getOrElse(DefaultCatalogPath)
    val in = new BufferedInputStream(new FileInputStream(catalogPath))
    val books =
      try {
        // Decode explicitly instead of relying on the JVM's default charset.
        // NOTE(review): assumes the catalog is UTF-8 (the XML default) - confirm.
        val parser = new XMLEventReader(Source.fromInputStream(in, "UTF-8"))
        var found = List[Book]()
        while (parser.hasNext) {
          parser.next() match {
            case EvElemStart(_, "etext", _, _) =>
              val (title, creator, friendlyTitle) = parseBook(parser)
              found = Book(title, creator, friendlyTitle) :: found
            case _ => // everything outside an etext element is ignored
          }
        }
        found
      } finally {
        // Closing the outermost stream also closes the wrapped FileInputStream.
        in.close()
      }

    println("Finished found " + books.length)
    books.map(tabbedBook).foreach(println)
  }

  /**
   * Formats a book as `title<TAB>creator<TAB>friendlyTitle`.
   *
   * @param book the book to format
   * @return the three fields joined by tab characters
   */
  def tabbedBook(book: Book): String =
    Seq(book.title, book.creator, book.friendlyTitle).mkString("\t")
}
/**
 * A single catalog entry.
 *
 * @param title         the book's title
 * @param creator       the book's creator
 * @param friendlyTitle the book's friendly title
 */
case class Book(
  title: String,
  creator: String,
  friendlyTitle: String
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment