Last active
January 21, 2018 16:14
-
-
Save hans/6b9c3379c6b09783a31606f679fe4cbb to your computer and use it in GitHub Desktop.
Convert CHILDES XML files to CONLL-U style dependency treebank files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the lookup table c[adjective] = semantic class before any input
# is read. Entries are grouped by class; assignment order does not matter.
BEGIN {
    # size
    c["little"] = "size"
    c["big"] = "size"
    c["big-CP"] = "size"
    c["small"] = "size"
    c["tall"] = "size"
    c["short"] = "size"
    c["high"] = "size"
    c["large"] = "size"
    c["huge"] = "size"

    # color
    c["white"] = "color"
    c["green"] = "color"
    c["blue"] = "color"
    c["red"] = "color"
    c["yellow"] = "color"
    c["orange"] = "color"
    c["black"] = "color"
    c["brown"] = "color"
    c["blonde"] = "color"
    c["pink"] = "color"

    # char
    c["good"] = "char"
    c["good-CP"] = "char"
    c["bad"] = "char"
    c["mean"] = "char"
    c["terrible"] = "char"
    c["curious"] = "char"
    c["nice"] = "char"
    c["pretty"] = "char"
    c["violent"] = "char"

    # ord
    c["first"] = "ord"
    c["last"] = "ord"
    c["next"] = "ord"
    c["final"] = "ord"

    # age
    c["old"] = "age"
    c["new"] = "age"
    c["ole"] = "age"

    # sound
    c["loud"] = "sound"

    # shape
    c["wide"] = "shape"

    # other
    c["different"] = "other"
    c["wild"] = "other"
    c["own"] = "other"
}
# For every input record, echo the first two fields followed by the class
# that table c assigns to field 1, or "?" when field 1 is not in the table.
{
    class = ($1 in c) ? c[$1] : "?"
    print $1, $2, class
    # DEV: unclassified adjectives are the records whose class is "?".
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Output CoNLL-U parse file from CHILDES corpus. | |
""" | |
from argparse import ArgumentParser | |
import sys | |
import nltk | |
from nltk.corpus.reader import CHILDESCorpusReader | |
def print_conllu(sentence, out):
    """Write one sentence to ``out`` as tab-separated, 10-column rows.

    Each row is ``index, word, _, tag, _, _, head, deprel, _, _`` and the
    sentence is followed by one blank line.

    Args:
        sentence: sequence of ``(word, tag, relation)`` triples, where
            ``relation`` is an ``"index|head|deprel"`` string.
        out: writable text stream that receives the rows.

    Nothing is written for an empty sentence, a sentence whose first word
    is ``"xxx"``, or a sentence in which any word contains ``"~"``.

    Raises:
        ValueError: if a triple or relation string does not have the
            expected three parts, or an index is not an integer.
        KeyError: if a head index is not declared by any token.
    """
    if not sentence or sentence[0][0] == "xxx":
        # NOTE(review): "xxx" is presumably the corpus marker for
        # unintelligible speech -- confirm against the CHILDES conventions.
        return

    # Ignore sentences with MWEs. TODO does this create a bad sampling bias?
    if any("~" in word for word, _, _ in sentence):
        return

    # The dependency relation format uses its own indices. First resolve
    # these indices to be the same as basic (1-based) sentence positions.
    # Index 0 stays 0 so that a head of 0 is passed through unchanged.
    real_indices = {0: 0}
    for position, (_, _, relation) in enumerate(sentence, start=1):
        given_index = int(relation.split("|")[0])
        real_indices[given_index] = position

    # Build every row before writing anything, so that a malformed relation
    # raises without leaving a partially written sentence on the stream.
    rows = []
    for position, (word, tag, relation) in enumerate(sentence, start=1):
        _, given_head, head_reln = relation.split("|")
        fields = (position, word, "_", tag, "_", "_",
                  real_indices[int(given_head)], head_reln, "_", "_")
        rows.append("\t".join(str(field) for field in fields))

    # Write to the stream the caller supplied (previously the parameter was
    # shadowed and the rows always went to stdout).
    out.write("\n".join(rows) + "\n\n")
def main(args):
    """Print every sentence of the selected corpus files to stdout.

    Args:
        args: parsed command-line namespace; ``args.dir`` and ``args.glob``
            are passed to ``CHILDESCorpusReader`` as its root and file
            pattern.

    Sentences that cannot be converted are skipped; conversion is
    deliberately best-effort.
    """
    corpus = CHILDESCorpusReader(args.dir, args.glob)
    for fileid in corpus.fileids():
        for sentence in corpus.words(fileid, relation=True):
            try:
                print_conllu(sentence, sys.stdout)
            except (ValueError, KeyError, IndexError, TypeError,
                    AttributeError):
                # Some of the sentences bork because the parses aren't
                # complete. Oh well. Only parse-shaped failures are skipped,
                # so Ctrl-C and I/O errors still propagate.
                continue
# Command-line entry point: parse the corpus directory and an optional file
# glob, then run the conversion.
if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument("dir")
    # nargs="?" makes the positional optional; without it argparse still
    # requires the argument and the default is never used.
    p.add_argument("glob", nargs="?", default="*.xml")
    main(p.parse_args())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Print classified adjective-noun pairs from a dependency treebank file.
#
# Usage: pass the treebank file as the first argument.
#
# Pipeline stages:
#   1. dep_tregex grep  - keep the trees matching the adj/MOD/noun pattern.
#   2. dep_tregex shuf  - presumably shuffles the matched trees (confirm
#                         against the dep_tregex documentation).
#   3. awk              - for each token tagged "adj" that is immediately
#                         followed by a token tagged "n", print both word
#                         forms. The tag is compared as an exact field value
#                         (column 4) rather than as a regex over the whole
#                         line, so words or labels that merely contain "n"
#                         or "adj" no longer count as matches.
#   4. classify.awk     - append the adjective's class to each pair.
#
# NOTE(review): stage 3 assumes dep_tregex emits the tag in column 4 and the
# word form in column 2, as in the input treebank -- confirm.
python -mdep_tregex grep 'mod cpostag "adj" and deprel "MOD" and <--. (w2 cpostag "n")' < "$1" \
    | python -mdep_tregex shuf \
    | awk '$4 == "adj" {sawAdj=1; adj=$2; next} $4 == "n" {if (sawAdj) print adj, $2; sawAdj=0; next} {sawAdj=0}' \
    | awk -f classify.awk
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment