Created
August 11, 2016 19:04
-
-
Save dirkgr/a1852a89b980cf21e973190fb414a02c to your computer and use it in GitHub Desktop.
Lists all files under a directory, recursively, and fast, in Scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Log which directory is about to be scanned for files to process.
logger.info(s"Finding out how many files to process in $inputDirectory")
// Work around Java's awkward file IO API by walking the tree with a FileVisitor.

/** Walks `inputDirectory` recursively and invokes `f` on every matching file.
  *
  * A file matches when its name is exactly 40 lowercase hex characters followed by
  * `.pdf`, and that 40-character id is not contained in `seenPaperIds`
  * (presumably the ids that were already handled -- confirm against the definition).
  *
  * Entries that cannot be visited are skipped silently; the walk always continues.
  *
  * @param f callback invoked once per matching file, with the file's path
  */
def forEachFileToProcess(f: Path => Unit): Unit = {
  // Explicit `.r` instead of the postfix form `"..."r`, which needs
  // `scala.language.postfixOps` and is rejected or warned about by newer compilers.
  val filenamePattern = """^([a-f0-9]{40})\.pdf$""".r

  val visitor = new FileVisitor[Path] {
    // SKIP_SUBTREE only has an effect when returned from preVisitDirectory; anywhere
    // else it is treated as CONTINUE, so say CONTINUE to make the intent explicit.
    override def visitFileFailed(file: Path, exc: IOException): FileVisitResult =
      FileVisitResult.CONTINUE

    override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
      file.getFileName.toString match {
        case filenamePattern(sha) if !seenPaperIds.contains(sha) => f(file)
        case _ => () // name does not match, or the id was already seen
      }
      FileVisitResult.CONTINUE
    }

    override def preVisitDirectory(dir: Path, attrs: BasicFileAttributes): FileVisitResult =
      FileVisitResult.CONTINUE

    // `exc` is deliberately ignored: a directory whose iteration failed is simply left behind.
    override def postVisitDirectory(dir: Path, exc: IOException): FileVisitResult =
      FileVisitResult.CONTINUE
  }

  Files.walkFileTree(Paths.get(inputDirectory), visitor)
}
// All files selected by forEachFileToProcess, gathered eagerly into a buffer and then
// exposed as a parallel collection.
val filesToProcess = {
  val collected = mutable.Buffer.empty[Path]
  forEachFileToProcess { p =>
    collected += p
  }
  collected.par
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Java's old IO has some functions for this too, which are easier to use, but they have problems with error handling, are super slow once you have more than a trivial number of files, or both.