Skip to content

Instantly share code, notes, and snippets.

@dustalov
Last active March 21, 2021 19:33
Show Gist options
  • Save dustalov/5a5f97bc721e3d2b1749ad6b47a289ba to your computer and use it in GitHub Desktop.
Save dustalov/5a5f97bc721e3d2b1749ad6b47a289ba to your computer and use it in GitHub Desktop.
Extract semantic relations from Wiktionary using JWKTL.
import de.tudarmstadt.ukp.jwktl.JWKTL;
import de.tudarmstadt.ukp.jwktl.api.filter.WiktionaryEntryFilter;
import de.tudarmstadt.ukp.jwktl.api.util.Language;
import java.io.File;
import java.util.Locale;
/**
 * Prints every semantic relation found in a parsed JWKTL Wiktionary database
 * to standard output, one relation per line, as four tab-separated columns:
 * entry header, part of speech, relation target and relation type.
 *
 * <p>Usage: {@code java ExtractRelations.java database [filter]}
 * <ul>
 *   <li>{@code database} - path of the parsed JWKTL database.</li>
 *   <li>{@code filter} - optional language code restricting the word language
 *       of the entries; the special value {@code nofilter} (case-insensitive)
 *       disables the restriction. When omitted, the first two characters of
 *       the database file name are used as the language code.</li>
 * </ul>
 */
public class ExtractRelations {
    /**
     * Program entry point.
     *
     * @param args {@code database} followed by an optional {@code filter}
     */
    public static void main(String[] args) {
        // Accept one or two arguments: the filter is optional, as the usage
        // line says. (Requiring exactly one made the filter unreachable.)
        if (args.length < 1 || args.length > 2) {
            System.err.println("Usage: java ExtractRelations.java database [filter]");
            System.exit(1);
        }

        final var database = new File(args[0]);
        final var filter = new WiktionaryEntryFilter();

        if (args.length == 2) {
            final var code = args[1];
            if (!code.equalsIgnoreCase("nofilter")) {
                filter.setAllowedWordLanguages(Language.findByCode(code));
            }
        } else {
            // No explicit filter: derive the language code from the database
            // name, e.g. "enwiktionary-..." yields "en". Only computed here so
            // that a short database name cannot fail when a filter was given.
            final var prefix = database.getName().toLowerCase(Locale.ROOT).substring(0, 2);
            filter.setAllowedWordLanguages(Language.findByCode(prefix));
        }

        final var wkt = JWKTL.openEdition(database);
        try {
            for (final var entry : wkt.getAllEntries(filter)) {
                for (final var relation : entry.getRelations()) {
                    System.out.println(entry.getHeader()
                            + "\t" + entry.getPartOfSpeech()
                            + "\t" + relation.getTarget()
                            + "\t" + relation.getRelationType());
                }
            }
        } finally {
            // Release the database even if iteration or printing fails.
            wkt.close();
        }
    }
}
# Run every recipe with a fixed UTF-8 locale and C collation.
export LANG := en_US.UTF-8
export LC_COLLATE := C

# Maven invocation against the cloned JWKTL sources: batch mode, with the
# thread count taken from nproc.
MAVEN := mvn -B -T$(shell nproc) -f "jwktl/pom.xml"

# Build JWKTL (tests, javadoc and jacoco skipped) and store its runtime
# classpath, as echoed by exec:exec, in .classpath.env.
.classpath.env: jwktl
	$(MAVEN) package -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Djacoco.skip=true
	$(MAVEN) -q exec:exec -Dexec.executable=echo -Dexec.args="%classpath" > "$@"

# Fetch the JWKTL sources.
jwktl:
	git clone "https://github.com/dkpro/dkpro-jwktl.git" "$@"

# Dump mirror and dump date; both can be overridden on the command line.
MIRROR ?= http://dumps.wikimedia.your.org
DUMP ?= 20210320

# Download the dump for the wiki named by the stem, e.g. enwiktionary.
.PRECIOUS: %-$(DUMP)-pages-articles.xml.bz2
%-$(DUMP)-pages-articles.xml.bz2:
	curl -Lfo "$@" "$(MIRROR)/$*/$(DUMP)/$@"

# Recursively expanded on purpose: .classpath.env is read when a recipe runs,
# i.e. after the order-only prerequisite has produced it.
CLASSPATH = $(shell cat .classpath.env)

# Parse the downloaded dump into a JWKTL database named after the target.
# NOTE(review): the redirect target was garbled by e-mail obfuscation in the
# published copy; "$@.log" is the presumed original - confirm.
# NOTE(review): the third argument "true" is presumably the overwrite flag of
# the JWKTL example parser - confirm against Example1_ParseWiktionaryDump.
.PRECIOUS: %-$(DUMP)
%-$(DUMP): %-$(DUMP)-pages-articles.xml.bz2 | .classpath.env
	java -cp "$(CLASSPATH)" -Djdk.xml.totalEntitySizeLimit=100000000 de.tudarmstadt.ukp.jwktl.examples.Example1_ParseWiktionaryDump "$<" "$@" "true" > "$@.log"

# Extract the relations from the parsed database into a TSV file.
%-$(DUMP).tsv: %-$(DUMP) | .classpath.env
	java -cp "$(CLASSPATH)" ExtractRelations.java "$<" > "$@"

# Convenience targets, one per Wiktionary edition.
enwiktionary: enwiktionary-$(DUMP).tsv
ruwiktionary: ruwiktionary-$(DUMP).tsv
dewiktionary: dewiktionary-$(DUMP).tsv
@dustalov
Copy link
Author

$ ./extract-relations.groovy enwiktionary-20171103 en | head | column -ts $'\t'
dictionary  NOUN  dicktionary               DERIVED_TERM
dictionary  NOUN  dictionaric               DERIVED_TERM
dictionary  NOUN  dictionarily              DERIVED_TERM
dictionary  NOUN  encyclopedic dictionary   DERIVED_TERM
dictionary  NOUN  explanatory dictionary    DERIVED_TERM
dictionary  NOUN  fictionary                DERIVED_TERM
dictionary  NOUN  pedagogical dictionary    DERIVED_TERM
dictionary  NOUN  Pictionary                DERIVED_TERM
dictionary  NOUN  pronunciation dictionary  DERIVED_TERM
dictionary  NOUN  rhyming dictionary        DERIVED_TERM

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment