Skip to content

Instantly share code, notes, and snippets.

Created April 24, 2019 19:54
Show Gist options
  • Save mayhewsw/31153a1718a2be92db10d6d7003e486f to your computer and use it in GitHub Desktop.
Save mayhewsw/31153a1718a2be92db10d6d7003e486f to your computer and use it in GitHub Desktop.
Reader for LLF dictionaries, probably.
import edu.illinois.cs.cogcomp.core.algorithms.LevensteinDistance;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotationUtilities;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;
import edu.illinois.cs.cogcomp.core.io.LineIO;
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
import edu.illinois.cs.cogcomp.transliteration.SPModel;
import edu.illinois.cs.cogcomp.utils.TopList;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.util.*;
public class Dictionary {
HashMap<String, List<String>> entries = new HashMap<>();
String[] suffixes = {};
HashMap<String, List<String>> index2 = new HashMap<>();
HashMap<String, List<String>> index3 = new HashMap<>();
HashMap<String, List<String>> index4 = new HashMap<>();
HashMap<String, String> confidentdict = new HashMap<>();
private HashSet<String> stopwords = new HashSet<>();
public Dictionary(String fname) throws IOException {
List<String> lines =;
String word = "";
for(String line : lines){
String text = line.replaceAll("\\<[^>]*>","").trim().toLowerCase();
if(line.contains("<LEMMA") || line.contains("<WORD")){
if(!entries.containsKey(text)) {
entries.put(text, new ArrayList<String>());
word = text;
}else if(line.contains("<GLOSS")){
List<String> wwlines = new ArrayList<>();
int wordword = 0;
for(String entry : entries.keySet()){
List<String> def = entries.get(entry);
if(entry.split(" ").length == 1 && def.get(0).split(" ").length == 1){
wwlines.add(entry + "\t" + def.get(0));
}else if(entry.split(" ").length == 2 &&
def.get(0).split(" ").length == 2 &&
!def.get(0).contains(",") &&
!def.get(0).startsWith("to ")){
for(int i = 0; i < 2; i ++) {
wwlines.add(entry.split(" ")[i] + "\t" + def.get(0).split(" ")[i]);
System.out.println("There are " + wordword + " entries out of " + entries.size());
private void addToIndices(String entry) {
List<String> bigrams = getNgrams(entry, 2);
List<String> trigrams = getNgrams(entry, 3);
List<String> quadrigrams = getNgrams(entry, 4);
for(String bi : bigrams){
index2.put(bi, new ArrayList<String>());
for(String tri : trigrams){
index3.put(tri, new ArrayList<String>());
for(String q : quadrigrams){
index4.put(q, new ArrayList<String>());
* This Writes out to masterlex format.
* @param outfname
* @throws IOException
public void write(String outfname) throws IOException {
List<String> outlines = new ArrayList<>();
for(String entry : entries.keySet()){
List<String> defs = entries.get(entry);
for(String def : defs){
def = def.trim().replaceAll("\\.$", "");
def = def.trim().replaceAll("1$", "");
def = def.replaceAll("\\(.*\\)", "");
def = def.replaceAll("\\[.*\\]", "");
def = def.replaceAll("«.*»", "").trim();
String[] commadefs = { def };
if(def.contains(",")) {
commadefs = def.split(",");
}else if(def.contains(";")){
commadefs = def.split(";");
for(String cd : commadefs) {
cd = cd.trim().replaceAll("\\.$", "");
cd = cd.replaceAll("\\(.*\\)", "");
cd = cd.replaceAll("\\[.*\\]", "");
cd = cd.replaceAll("«.*»", "");
if(cd.trim().length() > 0) {
outlines.add(entry + "\tN/A\tN/A\tN/A\tN/A\t" + cd.trim() + "\tN/A\tN/A\tN/A\tloreleidict\tN/A");
LineIO.write(outfname, outlines);
* Given a string, and an integer, this returns all character ngrams
* of that string.
* @param s
* @param n
* @return
private List<String> getNgrams(String s, int n){
List<String> ret = new ArrayList<>();
if(s.length() < n){
return ret;
for(int i = 0; i < s.length()-n+1; i++){
ret.add(s.substring(i, i+n));
return ret;
public String indexlookup(String w, double ratio){
return pairlookup(w, ratio).getSecond();
public String indexlookup(String w){
return pairlookup(w).getSecond();
public Pair<String,String> pairlookup(String w){
return pairlookup(w, 0.9);
public Pair<String,String> pairlookup(String w, double ratio){
return new Pair<>(w, entries.get(w).get(0));
for(String suffix : suffixes){
String ws = StringUtils.removeEnd(w, suffix);
return pairlookup(ws, ratio);
List<String> all = new ArrayList<>();
for(String b : getNgrams(w, 2)){
if(index2.containsKey(b)) {
List<String> trigrams = getNgrams(w, 3);
for(String b : trigrams){
if(index3.containsKey(b)) {
List<String> quadrigrams = getNgrams(w, 4);
for(String b : quadrigrams){
if(index4.containsKey(b)) {
// order all by size --> shorter at the beginning.
// this means that the first element in the list to
// beat themax wins. This will bias towards shorter elements.
Collections.sort(all, new Comparator<String>() {
public int compare(String o1, String o2) {
return o1.length() - o2.length();
HashMap<String, Integer> freqs = new HashMap<>();
for(String entry : all){
freqs.put(entry, 0);
freqs.put(entry, freqs.get(entry)+1);
double max = 0;
String best = null;
for(String entry : freqs.keySet()){
//double score = freqs.get(entry) / (float)entry.length();
if(freqs.get(entry) > 1) {
double score = freqs.get(entry) / LevensteinDistance.getLevensteinDistance(entry, w);
if (score > max) {
max = score;
best = entry;
//double threshold =ratio*(trigrams.size() + quadrigrams.size()) / w.length();
double threshold = ratio * w.length();
if(best != null && max > threshold) {
return new Pair<>(best, entries.get(best).get(0));
return new Pair<>(null, null);
* This is entirely in English. The goal is to return a single word.
* @param def
* @return
public String cleandefinition(String def){
if(def == null){
return null;
def = def.trim();
def = def.replaceAll("\\s*\\(.*\\)\\s*", "");
def = def.replaceAll("\\s*\\[.*\\]\\s*", "");
def = def.replaceAll("\\s*«.*»\\s*", "");
def = def.replaceAll("\\.", "");
def = def.split("[;,]")[0];
for(String stopword : stopwords){
def = def.replaceAll(" " + stopword + " ", " ");
def = def.replaceAll("^" + stopword + " ", " ");
def = def.replaceAll(" " + stopword + "$", " ");
def = def.trim();
return def;
public static void main(String[] args) throws Exception {
String dir = "/path/to/dictionaries/";
String fname = dir + "dictionary.llf.xml";
Dictionary d = new Dictionary(fname);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment