Skip to content

Instantly share code, notes, and snippets.

@rsimon
Created December 4, 2013 09:38

Revisions

  1. rsimon created this gist Dec 4, 2013.
    43 changes: 43 additions & 0 deletions gistfile1.scala
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,43 @@
    import scala.xml.XML
    import java.io.FileWriter
    import scala.xml.transform.RewriteRule
    import scala.xml.Node
    import scala.xml.NodeSeq
    import scala.xml.Elem
    import scala.xml.transform.RuleTransformer
    import scala.xml.Text

    /**
     * Extracts plain-text chapters from a TEI XML document.
     *
     * For every `div1` element ("book") whose `n` attribute is a number between
     * 3 and 6 (the geographical books), each nested `div2` element ("chapter")
     * is converted to plain text and written to `book<N>_chapter<M>.txt` in the
     * current working directory, where `<M>` is the chapter's `n` attribute.
     *
     * Usage: `TEI [path-to-tei-xml]`. When no argument is given, the original
     * hard-coded input path is used.
     */
    object TEI extends App {

      // Imported locally so this object does not depend on edits to the file header.
      import java.io.{FileOutputStream, OutputStreamWriter}
      import java.nio.charset.StandardCharsets
      import scala.util.Try

      /** Input file used when no path is passed on the command line. */
      private val DefaultInput =
        "/home/simonr/Workspaces/pelagios/pelagios3-scripts/tei/Perseus_text_1999.02.0137.xml"

      // We're only interested in books 3 - 6 (the geographical ones)
      private val FirstBook = 3
      private val LastBook = 6

      /**
       * Rewrite rule that turns TEI markup into plain text:
       *  - `note` elements are dropped,
       *  - `head` elements become their text followed by a blank line,
       *  - line breaks inside text nodes become spaces.
       *
       * The rule is stateless, so a single instance is shared by all chapters
       * instead of being rebuilt inside the loop.
       */
      private val plainTextRule = new RewriteRule {
        override def transform(n: Node): NodeSeq = n match {
          case e: Elem if e.label == "note" => Text("")
          case e: Elem if e.label == "head" => Text(e.text + "\n\n")
          case t: Text                      => Text(t.text.replace("\n", " "))
          case other                        => other
        }
      }

      /** Returns the `n` attribute of a book as an Int, or None if it is missing or not numeric. */
      private def bookNumberOf(book: Node): Option[Int] =
        Try((book \ "@n").text.toInt).toOption

      /** Applies the plain-text rule to a chapter and normalises the resulting whitespace. */
      private def chapterText(chapter: Node): String =
        new RuleTransformer(plainTextRule)
          .transform(chapter)
          .text
          .replaceAll(" +", " ") // collapse runs of spaces left behind by removed markup
          .replace("\n ", "\n")  // no leading space after a line break
          .trim()

      /**
       * Writes `content` to `fileName` as UTF-8.
       *
       * The encoding is explicit because FileWriter would use the platform
       * default; the writer is closed even if the write fails.
       */
      private def writeUtf8(fileName: String, content: String): Unit = {
        val writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8)
        try writer.write(content)
        finally writer.close() // close() flushes, so no separate flush() is needed
      }

      val xml = XML.loadFile(args.headOption.getOrElse(DefaultInput))

      val books = xml \\ "div1"

      books.foreach { book =>
        // Divisions without a numeric `n` are skipped rather than aborting the run.
        bookNumberOf(book)
          .filter(number => number >= FirstBook && number <= LastBook)
          .foreach { bookNumber =>
            (book \\ "div2").foreach { chapter =>
              val chapterId = (chapter \ "@n").text
              writeUtf8(s"book${bookNumber}_chapter${chapterId}.txt", chapterText(chapter))
            }
          }
      }

    }