Last active
May 25, 2024 10:20
-
-
Save dacr/6292d44b720782353d0d6c2ab0bdf99c to your computer and use it in GitHub Desktop.
french hunspell from grammalecte / published by https://github.com/dacr/code-examples-manager #5c8a6863-c517-4c5f-bf7e-540b6d479c91/1f17f431556a113071ec58e1af7c37ef72ffd1e
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// summary : french hunspell from grammalecte
// keywords : scala, zio, sttp, nio, words, spell, parse, unzip, zip, files, hunspell, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 5c8a6863-c517-4c5f-bf7e-540b6d479c91
// created-on : 2022-01-23T10:00:00Z
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file
// ---------------------
//> using scala "3.4.2"
//> using dep "dev.zio::zio:2.0.0"
//> using dep "dev.zio::zio-json:0.3.0-RC10"
//> using dep "com.softwaremill.sttp.client3::zio:3.7.0"
//> using dep "dev.zio::zio-nio:2.0.0"
// ---------------------
import zio.* | |
import zio.json.* | |
import zio.nio.* | |
import zio.nio.file.* | |
import sttp.client3.*, sttp.client3.basicRequest.* | |
import sttp.client3.httpclient.zio.HttpClientZioBackend | |
import zio.nio.charset.Charset | |
import java.io.ByteArrayInputStream | |
import java.nio.file.StandardOpenOption.* | |
import java.util.zip.* | |
/** Downloads the Grammalecte French hunspell dictionaries archive (once), parses one of its
  * `.dic` entries, prints a few statistics and lists the words matching a Wordle-like pattern.
  *
  * Command-line arguments read by [[run]], all optional, in this order:
  *   1. the pattern to solve, `_` standing for an unknown letter
  *   1. the letters every candidate must contain
  *   1. the letters no candidate may contain
  */
object WordleDic extends ZIOAppDefault {

  /** Local file the dictionaries archive is downloaded to, and later read from. */
  val destDicoZipFile = Path("dico.zip")

  /** Base name (without extension) of the `.dic` / `.aff` archive entries that are read. */
  val dicEntryKey = "fr-classique"

  /** Downloads the dictionaries archive and writes it to `dest`, replacing any existing file.
    *
    * @param dest file that receives the downloaded bytes
    * @return an effect that fails when the request fails, when the response body is an error,
    *         or when the file cannot be written
    */
  def downloadLogic(dest: Path) = {
    val src     = uri"http://grammalecte.net/download/fr/hunspell-french-dictionaries-v7.0.zip"
    val request = basicRequest.get(src).response(asByteArray)
    // The backend is acquired and released around its use so that it is always closed,
    // whether the download succeeds or fails.
    ZIO.acquireReleaseWith(HttpClientZioBackend())(_.close().ignore) { backend =>
      for {
        response <- backend.send(request)
        body     <- ZIO.fromEither(response.body)
        _        <- Files.writeBytes(dest, Chunk.fromArray(body), TRUNCATE_EXISTING, CREATE)
        _        <- Console.printLine(s"wrote to $dest")
      } yield ()
    }
  }

  /** Advances `zipInputStream` entry by entry until an entry named exactly `entryName` is
    * reached, then reads that entry's content.
    *
    * The stream is consumed but not closed here; closing it is left to the caller.
    *
    * @param zipInputStream the zip stream to scan
    * @param entryName      exact entry name to look for
    * @return the entry content, or `None` when the stream has no entry with that name
    */
  def extractZipEntryContentAsBytes(zipInputStream: ZipInputStream, entryName: String) = {
    Iterator
      .continually(zipInputStream.getNextEntry)
      .takeWhile(_ != null) // getNextEntry returns null once there are no more entries
      .find(_.getName == entryName)
      .map(_ => Chunk.fromArray(zipInputStream.readAllBytes()))
  }

  /** Reads the zip file `src` and extracts the content of the entry named `entryName`.
    *
    * @param src       zip file to read
    * @param entryName exact name of the entry to extract
    * @return an effect producing the entry bytes; because of `.some`, it fails with `None` when
    *         the entry is missing and with `Some(error)` on an I/O error while unzipping
    */
  def zipExtractLogic(src: Path, entryName: String) = {
    for {
      zipContent <- Files.readAllBytes(src)
      _          <- Console.printLine(s"searching for $entryName")
      lookup = ZIO.attemptBlockingIO {
                 val zipInputStream = ZipInputStream(ByteArrayInputStream(zipContent.toArray))
                 // Close the stream in all cases, in the same blocking section that reads it.
                 try extractZipEntryContentAsBytes(zipInputStream, entryName)
                 finally zipInputStream.close()
               }
      entryBytes <- lookup.some
    } yield entryBytes
  }

  /** One parsed dictionary line.
    *
    * @param word       the text before the first `/` of the first token
    * @param flags      the text after that `/`, when present
    * @param attributes the `key:value` tokens following the first token
    */
  case class HunspellEntry(word: String, flags: Option[String], attributes: Map[String, String]) {
    // NOTE(review): "po" is presumably the hunspell part-of-speech field - confirm.
    private val po = attributes.get("po")

    val isDiv       = po.contains("div") // Separator
    val isCommun    = word.headOption.exists(_.isLower) // Common noun; false for an empty word
    val isCompound  = word.contains("-")
    val isPropre    = po.contains("npr")
    val isFirstName = po.contains("prn")
  }

  object HunspellEntry {

    /** Parses one dictionary line of the form `word[/flags] key:value key:value ...`.
      *
      * @param line raw line; surrounding whitespace is ignored
      * @return the parsed entry, or `None` for a blank line or a line whose word is empty
      */
    def fromLine(line: String): Option[HunspellEntry] = {
      line.trim.split("""\s+""").toList match {
        case first :: others if first.nonEmpty =>
          // Tokens without a ':' are not attributes and are ignored.
          val attributes =
            others
              .map(_.split(":", 2))
              .collect { case Array(key, value) => key -> value }
              .toMap
          first.split("/", 2) match {
            case Array(word, flags) if word.nonEmpty => Some(HunspellEntry(word, Some(flags), attributes))
            case Array(word) if word.nonEmpty        => Some(HunspellEntry(word, None, attributes))
            case _                                   => None
          }
        case _ => None
      }
    }
  }

  /** The parsed content of a dictionary file. */
  case class Hunspell(entries: List[HunspellEntry])

  /** Decodes `dicBytes` as UTF-8 and parses it: the first line is read as the announced entry
    * count, every following line as one entry.
    *
    * @param dicBytes   content of the `.dic` file
    * @param affixBytes content of the `.aff` file; currently not used
    * @return an effect producing the parsed dictionary; it fails when the bytes cannot be
    *         decoded or when the first line is missing or is not an integer
    */
  def parseHunspell(dicBytes: Chunk[Byte], affixBytes: Chunk[Byte]) = {
    val charset = Charset.Standard.utf8
    for {
      content <- charset.decodeString(dicBytes)
      lines    = content.split("\n").toList
      // trim + toIntOption: a stray '\r' or a non-numeric header becomes a failure, not an exception
      count   <- ZIO.fromOption(lines.headOption.flatMap(_.trim.toIntOption))
      _       <- Console.printLine(s"Expecting to find $count hunspell entries")
      entries  = lines.drop(1).flatMap(HunspellEntry.fromLine)
      _       <- Console.printLine(s"Found ${entries.size} hunspell entries")
      // The announced count is not checked against entries.size, as the count input data looks invalid.
    } yield Hunspell(entries)
  }

  /** Prints statistics about the common, non-compound words: how many there are, how many per
    * word length, and the longest ones; then the total number of dictionary entries.
    *
    * @param hunspell the parsed dictionary
    */
  def dumpStats(hunspell: Hunspell) = {
    val selectedWords = hunspell.entries.filter(entry => entry.isCommun && !entry.isCompound)
    val wordsBySize   = selectedWords.groupBy(_.word.size)
    val countBySize   = wordsBySize.view.mapValues(_.size).toMap
    // maxOption: no longest word to report when nothing was selected
    val longestWords =
      wordsBySize.keys.maxOption
        .flatMap(size => wordsBySize.get(size))
        .getOrElse(Nil)
        .map(_.word)
    for {
      _ <- Console.printLine(s"For common & not compound words")
      _ <- Console.printLine(s" Found ${selectedWords.size} words")
      _ <- Console.printLine(s" Number of common & not compound words By size")
      _ <- Console.printLine(countBySize.toList.sorted.mkString(" ", "\n ", "\n"))
      _ <- Console.printLine(s" Longest french words")
      _ <- Console.printLine(longestWords.sorted.mkString(" ", "\n ", "\n"))
      _ <- Console.printLine(s"Found ${hunspell.entries.size} words in the dictionary")
    } yield ()
  }

  /** Lists the common, non-compound words that fit `pattern`.
    *
    * Words are compared after accents are removed and letters upper-cased, so `pattern` and
    * both letter sets are expected in upper case.
    *
    * @param hunspell        the parsed dictionary
    * @param pattern         the word to find, `_` standing for any single character
    * @param excludedLetters letters that must not appear in a candidate
    * @param includedLetters letters that must all appear in a candidate
    * @return the matching words, normalized, each listed once, in dictionary order
    */
  def naiveSearch(hunspell: Hunspell, pattern: String, excludedLetters: String = "", includedLetters: String = ""): List[String] = {
    def normalize(word: String): String =
      word.toLowerCase
        .replaceAll("[áàäâ]", "a")
        .replaceAll("[éèëê]", "e")
        .replaceAll("[íìïî]", "i")
        .replaceAll("[óòöô]", "o")
        .replaceAll("[úùüû]", "u")
        .replaceAll("[ç]", "c")
        .toUpperCase

    val wordRE = pattern.replaceAll("_", ".").r

    hunspell.entries.view
      .filterNot(_.isCompound)
      .filter(_.isCommun)
      .map(_.word)
      .filter(_.size == pattern.size)
      .map(normalize)
      .filter(word => wordRE.matches(word))
      .filterNot(word => word.exists(letter => excludedLetters.contains(letter)))
      .filter(word => includedLetters.forall(letter => word.contains(letter)))
      .toList
      .distinct // words differing only by accents normalize to the same candidate
  }

  /** Program logic: download the archive when the local file is absent, parse the dictionary,
    * print the statistics, then print the candidates for the pattern given as first argument
    * (`S_______` by default), using the second argument as included letters and the third as
    * excluded letters.
    */
  override def run = for {
    alreadyDownloaded <- ZIO.attemptBlocking(destDicoZipFile.toFile.exists())
    _                 <- downloadLogic(destDicoZipFile).unless(alreadyDownloaded)
    affBytes          <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.aff")
    dicBytes          <- zipExtractLogic(destDicoZipFile, s"$dicEntryKey.dic")
    hunspell          <- parseHunspell(dicBytes, affBytes)
    _                 <- dumpStats(hunspell)
    args              <- getArgs
    // An empty pattern argument falls back to the default instead of failing on `.head`.
    toSolve            = args.headOption.filter(_.nonEmpty).map(_.toUpperCase).getOrElse("S_______")
    includedLetters    = args.drop(1).headOption.map(_.toUpperCase).getOrElse("")
    excludedLetters    = args.drop(2).headOption.map(_.toUpperCase).getOrElse("")
    // Same first letter and same length as the pattern, every other position unknown.
    solveOri           = toSolve.take(1) + "_" * (toSolve.length - 1)
    _                 <- Console.printLine(s"Possible solution count ${naiveSearch(hunspell, solveOri).size} for $solveOri")
    _                 <- Console.printLine(s"Candidate solutions for $toSolve while including '$includedLetters' and excluding '$excludedLetters'")
    _                 <- Console.printLine(naiveSearch(hunspell, toSolve, excludedLetters, includedLetters).mkString(" "))
  } yield ()
}
// Script entry point. The script's own arguments are forwarded so that the pattern and the
// included / excluded letters read through getArgs in WordleDic.run can be given on the
// command line; with no arguments the behaviour is the same as passing an empty array.
// NOTE(review): relies on the `args` value scala-cli provides to .sc scripts - confirm this
// file is run as a script.
WordleDic.main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment