Created
March 6, 2022 14:33
-
-
Save Sciss/a5be9243a666e57dad2c9ee03b718111 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Directory that holds the OCR text files to be searched.
val ocrDir = file("/data/texts/ocr")

// Children of `ocrDir` whose `extL` equals "txt". This is a `def`, so the
// directory is listed again on every use.
def all = ocrDir.children(f => f.extL == "txt")
// all.size
/** Searches every file returned by `all` for the given words and prints one
  * fixed-width context line per hit, followed by the file's base name.
  *
  * The work runs on a newly started thread, so this method returns immediately;
  * "Done." is printed once all files have been processed.
  *
  * Matching is done against the lower-cased file contents, and the printed
  * context is taken from that lower-cased text as well. Each output line places
  * the hit at column `contentsMaxChars`, trims the context to the sentence
  * around the hit (using "." as delimiter), and blanks out characters that are
  * neither printable ASCII nor in a small set of accented letters.
  *
  * @param words            words to look for; they are lower-cased before matching
  *                         and empty strings are ignored
  * @param contentsMaxChars number of context characters shown on either side of a hit
  * @param maxFileName      maximum number of characters of the file's base name to print
  */
def findOCR2(words: List[String], contentsMaxChars: Int = 72, maxFileName: Int = 72): Unit = {
  // The haystack is lower-cased, so the needles must be too. Empty needles are
  // dropped: `indexOf("", i)` never advances and would spin forever.
  val needles = words.map(_.toLowerCase).filter(_.nonEmpty)

  // Non-ASCII characters that are allowed through to the output unchanged.
  val keep = "äöüßÄÖÜáàéèíìóòúù'ÁÀÉÈÍÌÓÒÚÙ"

  // Builds the fixed-width context line for the occurrence of `word` at index `hit`.
  def contextLine(contentsL: String, word: String, hit: Int): String = {
    val from  = math.max(0, hit - contentsMaxChars)
    // Bound by the string that is actually sliced; lower-casing may change the length.
    val until = math.min(contentsL.length, hit + math.max(word.length, contentsMaxChars))
    // One-for-one replacement, so offsets within the window are preserved.
    val window = contentsL.substring(from, until).replaceAll("\\n", " ")
    val margin = " " * contentsMaxChars
    val padded = margin + window + margin
    // The position of this hit is known; searching again with `indexOf` would
    // pick up an earlier occurrence that happens to fall into the same window.
    val hitInPadded = contentsMaxChars + (hit - from)
    val centred = padded.substring(hitInPadded - contentsMaxChars, hitInPadded + word.length + contentsMaxChars)
    // In `centred` the hit always starts at index `contentsMaxChars`.

    // Drop everything up to and including the first "." if it precedes the hit.
    val dotBefore = centred.indexOf(".")
    val (tail, hitInTail) =
      if (dotBefore < 0 || dotBefore >= contentsMaxChars) (centred, contentsMaxChars)
      else (centred.substring(dotBefore + 1), contentsMaxChars - (dotBefore + 1))

    // Cut after the first "." that follows the start of the hit.
    val dotAfter = tail.indexOf(".", hitInTail + 1)
    val sentence = if (dotAfter < 0) tail else tail.substring(0, dotAfter + 1)

    // Re-align so the hit starts at column `contentsMaxChars`, then pad to a fixed width.
    val aligned    = (" " * (contentsMaxChars - hitInTail)) + sentence
    val fixedWidth = aligned + (" " * (contentsMaxChars * 2 + word.length + 1 - aligned.length))

    fixedWidth.map { ch =>
      if ((ch >= ' ' && ch < 128) || keep.contains(ch)) ch else ' '
    }
  }

  val worker = new Thread(new Runnable {
    def run(): Unit = {
      all.sorted(File.NameOrdering).foreach { f =>
        // println(f.name)
        // Reads the complete file; `available()` plus a single `read` may return fewer bytes.
        val contents  = new String(java.nio.file.Files.readAllBytes(f.toPath), "UTF-8")
        val contentsL = contents.toLowerCase
        for (word <- needles) {
          var i = contentsL.indexOf(word)
          while (i >= 0) {
            println(contextLine(contentsL, word, i) + f.base.take(maxFileName))
            i = contentsL.indexOf(word, i + word.length)
          }
        }
      }
      println("\nDone.")
    }
  })
  // Started only after construction has finished, not from within an initialiser.
  worker.start()
}
// Example run: search for "operationali" with 64 characters of context on each
// side of a hit and at most 70 characters of each file's base name.
findOCR2(words = List("operationali"), contentsMaxChars = 64, maxFileName = 70)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment