Created
October 31, 2011 22:50
-
-
Save oliverdaff/1329291 to your computer and use it in GitHub Desktop.
Parse the Project Gutenberg catalog using the Scala XML pull parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{BufferedInputStream, FileInputStream}
import scala.io.Source
import scala.xml.pull.{EvElemEnd, EvElemStart, EvEntityRef, EvText, XMLEventReader}
/** Reads `src/main/resources/catalog.rdf` with the XML pull parser, collects the
  * title, creator and friendly title of every `etext` element, and prints the
  * result as tab-separated lines.
  */
object GutenbergCrunch {

  /** Replacement text for the five predefined XML entities.
    *
    * The pull parser reports an entity reference such as `&amp;` as its own
    * `EvEntityRef` event instead of folding it into the neighbouring `EvText`,
    * so `getText` has to translate it back into a character itself.
    */
  private val predefinedEntities: Map[String, String] = Map(
    "lt"   -> "<",
    "gt"   -> ">",
    "amp"  -> "&",
    "quot" -> "\"",
    "apos" -> "'"
  )

  /** Consumes events up to the closing `pgterms:etext` tag (or the end of input)
    * and returns what was found along the way.
    *
    * @param parser event stream positioned just after an `etext` start tag
    * @return `(title, creator, friendlytitle)`; a field that never appears is `""`,
    *         and if a field appears more than once the last occurrence wins
    */
  private def parseBook(parser: XMLEventReader): (String, String, String) = {
    var title = ""
    var creator = ""
    var friendlytitle = ""
    var done = false
    while (parser.hasNext && !done) {
      parser.next() match {
        case EvElemStart(_, "title", _, _)         => title = getText(parser, "title")
        case EvElemStart(_, "creator", _, _)       => creator = getText(parser, "creator")
        case EvElemStart(_, "friendlytitle", _, _) => friendlytitle = getText(parser, "friendlytitle")
        case EvElemEnd("pgterms", "etext")         => done = true
        case _                                     => // not a field we collect
      }
    }
    (title, creator, friendlytitle)
  }

  /** Concatenates the character data found before the end tag named `inTag`.
    *
    * Text inside nested elements is included; the nested tags themselves are
    * skipped. Reading stops at the first end tag whose local name equals `inTag`,
    * or when the input is exhausted.
    *
    * @param parser event stream positioned just after the start tag of `inTag`
    * @param inTag  local name of the element whose text is wanted
    * @return the accumulated text, `""` if there was none
    */
  private def getText(parser: XMLEventReader, inTag: String): String = {
    val fullText = new StringBuilder
    var done = false
    while (parser.hasNext && !done) {
      parser.next() match {
        case EvElemEnd(_, tagName) if tagName == inTag =>
          done = true
        case EvText(text) =>
          fullText.append(text)
        case EvEntityRef(entity) =>
          // Without this case "Pride &amp; Prejudice" would lose its ampersand.
          // An entity we do not know is kept in its raw `&name;` form, not dropped.
          fullText.append(predefinedEntities.getOrElse(entity, "&" + entity + ";"))
        case _ => // start tags, other end tags, comments, processing instructions
      }
    }
    fullText.toString
  }

  /** Parses the catalog, then prints the number of books found followed by one
    * tab-separated line per book.
    *
    * Books are accumulated by prepending, so they are printed in the reverse of
    * the order in which they occur in the catalog.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val in = new BufferedInputStream(new FileInputStream("src/main/resources/catalog.rdf"))
    val books =
      try {
        // Decode as UTF-8, the XML default encoding, instead of whatever charset
        // the JVM happens to default to on this machine.
        val parser = new XMLEventReader(Source.fromInputStream(in, "UTF-8"))
        var found = List[Book]()
        while (parser.hasNext) {
          parser.next() match {
            case EvElemStart(_, "etext", _, _) =>
              val (title, creator, friendlytitle) = parseBook(parser)
              found = Book(title, creator, friendlytitle) :: found
            case _ => // anything outside an etext element is ignored
          }
        }
        found
      } finally {
        // Closing the outermost wrapper also closes the underlying file stream.
        in.close()
      }

    println("Finished found " + books.length)
    books.map(tabbedBook).foreach(println)
  }

  /** Formats a book as `title<TAB>creator<TAB>friendlyTitle`. */
  def tabbedBook(book: Book): String =
    book.title + "\t" + book.creator + "\t" + book.friendlyTitle
}
/** A single catalog entry.
  *
  * @param title         the book's title
  * @param creator       the book's creator
  * @param friendlyTitle the book's friendly title
  */
case class Book(
  title: String,
  creator: String,
  friendlyTitle: String
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment