Created
August 22, 2014 22:46
-
-
Save dportabella/f6bee43ab543798813e0 to your computer and use it in GitHub Desktop.
this example Scala scripts executes a regex to all files recursively. it uses apache tika UniversalEncodingDetector to filter only text files. it uses a regex to find all lines containing the word "super", except if this word is part of the larger word "superstition" or "supernatural".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File | |
import org.apache.tika.detect._ | |
import org.apache.tika.metadata._ | |
import org.apache.tika.mime._ | |
import org.apache.tika.io._ | |
import org.apache.tika.parser.txt._ | |
import resource._ | |
def recursiveListFiles(f: File): List[File] = { | |
val these = f.listFiles.toList | |
these.filter(!_.isDirectory) ++ these.filter(_.isDirectory).flatMap(recursiveListFiles) | |
} | |
def isTextFile(file: File) = managed(TikaInputStream.get(file)).map(input => { | |
val mediaType = new TextDetector().detect(input, null) | |
mediaType.compareTo(MediaType.TEXT_PLAIN) == 0 | |
}).opt.get | |
def getEncoding(file: File): Charset = managed(TikaInputStream.get(file)).map(input => { | |
new UniversalEncodingDetector().detect(input, new Metadata()) | |
}).opt.get | |
def read(file: File) = { | |
io.Source.fromFile(file, getEncoding(file).name) | |
} | |
def processFileLines(file: File, matches: (String) => Boolean) { | |
val lines = read(file).getLines().zipWithIndex.filter {case (text, _) => matches(text)} | |
if (lines.nonEmpty) { | |
println(Console.YELLOW + file + ":" + Console.BLACK) | |
lines.foreach{ case (text, index) => println(index + ": " + text)} | |
println() | |
} | |
} | |
val dir = "/Users/david/Downloads" | |
def matches(line: String) = line.matches("(?i).*(?!superstition)(?!supernatural)super.*") | |
val files = recursiveListFiles(new File(dir)).filter(isTextFile) | |
files.foreach(file => processFileLines(file, matches)) | |
/* | |
sbt: | |
libraryDependencies += "org.apache.tika" % "tika-core" % "1.5" | |
libraryDependencies += "org.apache.tika" % "tika-parsers" % "1.5" | |
libraryDependencies += "com.jsuereth" %% "scala-arm" % "1.3" | |
download the list of free books from gutenberg.org | |
$ cd ~/Downloads | |
$ curl http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2 | tar -jxf - | |
$ scala ExampleScalaAck.scala | |
/Users/david/Downloads/cache/epub/3328/pg3328.rdf: | |
89: <dcterms:title>Man and Superman: A Comedy and a Philosophy</dcterms:title> | |
/Users/david/Downloads/cache/epub/36437/pg36437.rdf: | |
134: <pgterms:alias>Super, Ovando Byron</pgterms:alias> | |
136: <pgterms:name>Super, O. B. (Ovando Byron)</pgterms:name> | |
/Users/david/Downloads/cache/epub/41481/pg41481.rdf: | |
71: <dcterms:title>Astounding Stories of Super-Science January 1930</dcterms:title> | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment