Last active
March 21, 2021 19:33
-
-
Save dustalov/5a5f97bc721e3d2b1749ad6b47a289ba to your computer and use it in GitHub Desktop.
Extract semantic relations from Wiktionary using JWKTL.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import de.tudarmstadt.ukp.jwktl.JWKTL; | |
import de.tudarmstadt.ukp.jwktl.api.filter.WiktionaryEntryFilter; | |
import de.tudarmstadt.ukp.jwktl.api.util.Language; | |
import java.io.File; | |
import java.util.Locale; | |
public class ExtractRelations { | |
public static void main(String[] args) { | |
if (args.length != 1) { | |
System.err.println("Usage: java ExtractRelations.java database [filter]"); | |
System.exit(1); | |
} | |
final var database = new File(args[0]); | |
final var prefix = database.getName().toLowerCase(Locale.ROOT).substring(0, 2); | |
final var filter = new WiktionaryEntryFilter(); | |
if (args.length == 2) { | |
final var code = args[1]; | |
if (!code.equalsIgnoreCase("nofilter")) { | |
filter.setAllowedWordLanguages(Language.findByCode(code)); | |
} | |
} else if (args.length == 1) { | |
filter.setAllowedWordLanguages(Language.findByCode(prefix)); | |
} | |
final var wkt = JWKTL.openEdition(database); | |
for (final var entry: wkt.getAllEntries(filter)) { | |
for (final var relation: entry.getRelations()) { | |
System.out.print(entry.getHeader()); | |
System.out.print('\t'); | |
System.out.print(entry.getPartOfSpeech()); | |
System.out.print('\t'); | |
System.out.print(relation.getTarget()); | |
System.out.print('\t'); | |
System.out.print(relation.getRelationType()); | |
System.out.println(); | |
} | |
} | |
wkt.close(); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build pipeline: clone and build JWKTL, download a Wiktionary dump, parse it into a
# JWKTL database, and export its semantic relations as TSV with ExtractRelations.java.
#
# Typical use:  make enwiktionary   (or ruwiktionary / dewiktionary)

export LANG := en_US.UTF-8
export LC_COLLATE := C

# Batch-mode Maven, one thread per CPU, operating on the cloned JWKTL checkout.
MAVEN := mvn -B -T$(shell nproc) -f "jwktl/pom.xml"

# Build JWKTL (tests, javadoc and coverage skipped) and record its runtime classpath.
.classpath.env: jwktl
	$(MAVEN) package -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Djacoco.skip=true
	$(MAVEN) -q exec:exec -Dexec.executable=echo -Dexec.args="%classpath" > "$@"

jwktl:
	git clone "https://github.com/dkpro/dkpro-jwktl.git" "$@"

# Dump mirror and dump date; both can be overridden on the command line.
MIRROR ?= http://dumps.wikimedia.your.org
DUMP ?= 20210320

# Download the dump for the wiki named by the pattern stem (e.g. "enwiktionary").
.PRECIOUS: %-$(DUMP)-pages-articles.xml.bz2
%-$(DUMP)-pages-articles.xml.bz2:
	curl -Lfo "$@" "$(MIRROR)/$*/$(DUMP)/$@"

# Recursively expanded on purpose: the file is read when a recipe runs, by which time
# the order-only prerequisite .classpath.env has been built.
CLASSPATH = $(shell cat .classpath.env)

# Parse the dump into a JWKTL database directory; parser output goes to "<target>.log".
.PRECIOUS: %-$(DUMP)
%-$(DUMP): %-$(DUMP)-pages-articles.xml.bz2 | .classpath.env
	java -cp "$(CLASSPATH)" -Djdk.xml.totalEntitySizeLimit=100000000 de.tudarmstadt.ukp.jwktl.examples.Example1_ParseWiktionaryDump "$<" "$@" "true" > "$@.log"

# Export the relations of a parsed database as TSV.
%-$(DUMP).tsv: %-$(DUMP) | .classpath.env
	java -cp "$(CLASSPATH)" ExtractRelations.java "$<" > "$@"

# Convenience aliases; they name no files, so mark them phony.
.PHONY: enwiktionary ruwiktionary dewiktionary
enwiktionary: enwiktionary-$(DUMP).tsv
ruwiktionary: ruwiktionary-$(DUMP).tsv
dewiktionary: dewiktionary-$(DUMP).tsv
Author
dustalov
commented
Nov 22, 2017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment