Skip to content

Instantly share code, notes, and snippets.

Created August 22, 2014 22:46
Show Gist options
  • Save dportabella/f6bee43ab543798813e0 to your computer and use it in GitHub Desktop.
Save dportabella/f6bee43ab543798813e0 to your computer and use it in GitHub Desktop.
This example Scala script applies a regex to every file under a directory, recursively. It uses Apache Tika's TextDetector to keep only text files and UniversalEncodingDetector to determine each file's encoding. The regex finds all lines containing the word "super", except where that word is part of the larger word "superstition" or "supernatural".
import java.io.File
import java.nio.charset.Charset

import org.apache.tika.detect._
import org.apache.tika.io.TikaInputStream
import org.apache.tika.metadata._
import org.apache.tika.mime._
import org.apache.tika.parser.txt._
import resource._
/**
 * Lists every non-directory entry found under `f`, descending into subdirectories.
 *
 * The entries directly inside `f` come first, followed by the entries of each
 * subdirectory in turn.
 *
 * @param f the directory to walk
 * @return all non-directory entries below `f`; an empty list when `f` is not a
 *         directory or its contents cannot be listed
 */
def recursiveListFiles(f: File): List[File] = {
  // File.listFiles returns null (not an empty array) when f is not a directory
  // or an I/O error occurs, so wrap it before converting.
  val these = Option(f.listFiles).map(_.toList).getOrElse(Nil)
  val (directories, files) = these.partition(_.isDirectory)
  files ++ directories.flatMap(recursiveListFiles)
}
/**
 * Reports whether Tika's `TextDetector` classifies the content of `file` as plain text.
 *
 * @param file the file to inspect
 * @return true when the detected media type equals `MediaType.TEXT_PLAIN`
 */
def isTextFile(file: File): Boolean =
  // acquireAndGet runs the body, closes the stream, and yields the plain Boolean
  // (a bare `map` would leave the result wrapped in a managed resource).
  managed(TikaInputStream.get(file)).acquireAndGet { input =>
    val mediaType = new TextDetector().detect(input, new Metadata())
    mediaType == MediaType.TEXT_PLAIN
  }
/**
 * Detects the character encoding of `file` with Tika's `UniversalEncodingDetector`.
 *
 * @param file the file to inspect
 * @return the detected charset, or the platform default charset when the
 *         detector cannot determine one
 */
def getEncoding(file: File): Charset =
  managed(TikaInputStream.get(file)).acquireAndGet { input =>
    // The detector returns null when it cannot decide; fall back to the
    // platform default so callers never receive a null Charset.
    Option(new UniversalEncodingDetector().detect(input, new Metadata()))
      .getOrElse(Charset.defaultCharset())
  }
/**
 * Opens `file` as a character source, decoded with the encoding reported by `getEncoding`.
 *
 * This function does not close the returned source; that is left to the caller.
 *
 * @param file the file to open
 * @return a source over the file's characters
 */
def read(file: File) = {
  val charsetName = getEncoding(file).name
  io.Source.fromFile(file, charsetName)
}
/**
 * Prints every line of `file` accepted by `matches`, preceded by the file name.
 *
 * Nothing is printed when no line matches. Each hit is printed as
 * `index: text`, where `index` is the zero-based position of the line in the file.
 *
 * @param file    the file to scan
 * @param matches predicate deciding whether a line is reported
 */
def processFileLines(file: File, matches: (String) => Boolean): Unit = {
  val source = read(file)
  // Materialise the hits before closing: getLines() is lazy, and the source
  // must be closed so that scanning many files does not leak file handles.
  val hits =
    try {
      source.getLines().zipWithIndex.filter { case (text, _) => matches(text) }.toList
    } finally {
      source.close()
    }
  if (hits.nonEmpty) {
    // RESET restores the terminal's own colours; forcing BLACK would make the
    // following output unreadable on dark backgrounds.
    println(Console.YELLOW + file + ":" + Console.RESET)
    hits.foreach { case (text, index) => println(s"$index: $text") }
  }
}
// Root directory to scan: the current user's Downloads folder, resolved from
// the `user.home` system property instead of one specific user's home path.
val dir = sys.props("user.home") + "/Downloads"
// Case-insensitive "super" that does not begin "superstition" or "supernatural".
// Compiled once here rather than on every call.
val superPattern = "(?i)(?!superstition)(?!supernatural)super".r

/**
 * Tells whether `line` contains "super" (any case) at a position where it is
 * not the start of "superstition" or "supernatural".
 *
 * @param line the text to test
 * @return true when at least one such occurrence exists in `line`
 */
def matches(line: String): Boolean =
  // Searching for the pattern avoids wrapping it in `.*`, which cannot span
  // the line-terminator characters that `.` does not match.
  superPattern.findFirstIn(line).isDefined
// Gather every file below `dir` that isTextFile accepts, then report the
// matching lines of each one.
val files = for {
  candidate <- recursiveListFiles(new File(dir))
  if isTextFile(candidate)
} yield candidate
for (textFile <- files) processFileLines(textFile, matches)
// sbt settings: the three library dependencies, declared in a single expression.
libraryDependencies ++= Seq(
  "org.apache.tika" % "tika-core" % "1.5",
  "org.apache.tika" % "tika-parsers" % "1.5",
  "com.jsuereth" %% "scala-arm" % "1.3"
)
Download and unpack the list of free books (the archive URL is missing from this text; substitute it for <ARCHIVE_URL>):
$ cd ~/Downloads
$ curl <ARCHIVE_URL> | tar -jxf -
$ scala ExampleScalaAck.scala
89: <dcterms:title>Man and Superman: A Comedy and a Philosophy</dcterms:title>
134: <pgterms:alias>Super, Ovando Byron</pgterms:alias>
136: <pgterms:name>Super, O. B. (Ovando Byron)</pgterms:name>
71: <dcterms:title>Astounding Stories of Super-Science January 1930</dcterms:title>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment