Skip to content

Instantly share code, notes, and snippets.

@hans
Last active January 21, 2018 16:14
Show Gist options
  • Save hans/6b9c3379c6b09783a31606f679fe4cbb to your computer and use it in GitHub Desktop.
Save hans/6b9c3379c6b09783a31606f679fe4cbb to your computer and use it in GitHub Desktop.
Convert CHILDES XML files to CoNLL-U-style dependency treebank files.
# Build the adjective -> semantic class lookup table `c` once, before any
# input is read. Words are grouped by class so the table is easy to scan.
BEGIN {
    assign_class("little big small tall short high large huge big-CP", "size")
    assign_class("white green blue red yellow orange black brown blonde pink", "color")
    assign_class("good bad good-CP mean terrible curious nice pretty violent", "char")
    assign_class("first last next final", "ord")
    assign_class("old new ole", "age")
    assign_class("different wild own", "other")
    assign_class("loud", "sound")
    assign_class("wide", "shape")
}

# Map every whitespace-separated word in `words` to `label` in the global
# table `c`. The extra parameters are awk's idiom for local variables.
function assign_class(words, label,    parts, count, k) {
    count = split(words, parts, " ")
    for (k = 1; k <= count; k++)
        c[parts[k]] = label
}
# For every input record, echo the first two fields followed by the class
# looked up for the first field in table `c`; words that are not in the
# table are labelled "?".
{
    label = ($1 in c) ? c[$1] : "?"
    print $1, $2, label
}
"""
Output CoNLL-U parse file from CHILDES corpus.
"""
from argparse import ArgumentParser
import sys
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
def print_conllu(sentence, out):
    """Write one sentence to ``out`` as a 10-column, tab-separated block.

    Args:
        sentence: Sequence of ``(word, tag, relation)`` triples, where
            ``relation`` is a string of the form ``"index|head|label"``.
        out: Writable text stream (anything with ``write``) that receives
            the rows followed by a blank separator line.

    Nothing is written when the sentence is empty, when its first word is
    ``"xxx"``, or when any word contains ``"~"``.

    Raises:
        ValueError: If a token is not a 3-tuple or its relation string does
            not have exactly three ``|``-separated parts with integer
            indices.
        KeyError: If a head index refers to a token that is not present.
    """
    # Skip empty sentences and those opening with the "xxx" placeholder
    # (presumably the transcription marker for unintelligible speech --
    # TODO confirm against the corpus conventions).
    if not sentence or sentence[0][0] == "xxx":
        return

    # Ignore sentences with MWEs. TODO does this create a bad sampling bias?
    if any("~" in word for word, _, _ in sentence):
        return

    # The dependency relation format uses its own indices. First resolve
    # these indices to plain 1-based sentence positions; 0 stays 0 so that
    # a head of 0 maps to itself.
    real_indices = {0: 0}
    for position, (_, _, relation) in enumerate(sentence, start=1):
        given_index = int(relation.split('|')[0])
        real_indices[given_index] = position

    # Build every row before writing anything, so that a malformed token
    # raises without leaving a partially written sentence in the output.
    rows = []
    for position, (word, tag, relation) in enumerate(sentence, start=1):
        _, given_head, head_reln = relation.split('|')
        fields = (position, word, "_", tag, "_", "_",
                  real_indices[int(given_head)], head_reln, "_", "_")
        rows.append("\t".join(str(field) for field in fields))

    # Rows, then an empty line that separates this sentence from the next.
    out.write("\n".join(rows) + "\n\n")
def main(args):
    """Convert every sentence of the selected corpus files to stdout.

    Args:
        args: Parsed command-line namespace with ``dir`` (corpus root
            directory) and ``glob`` (file selection pattern), both passed
            straight to ``CHILDESCorpusReader``.
    """
    corpus = CHILDESCorpusReader(args.dir, args.glob)
    for fileid in corpus.fileids():
        for sentence in corpus.words(fileid, relation=True):
            try:
                print_conllu(sentence, sys.stdout)
            except Exception:
                # Some of the sentences bork because the parses aren't
                # complete; skipping them is deliberate best-effort.
                # Catching Exception rather than using a bare except keeps
                # Ctrl-C (KeyboardInterrupt) and SystemExit working.
                pass
if __name__ == '__main__':
    # Command-line entry point: <dir> [<glob>]
    p = ArgumentParser()
    p.add_argument("dir", help="root directory of the CHILDES XML corpus")
    # A positional argument only honours `default` when nargs="?" makes it
    # optional; without it argparse always demands the value and the
    # default is dead. The value is handed to the NLTK corpus reader, which
    # interprets a string as a regular expression over file ids, so the
    # default is spelled as a regex (a bare "*.xml" is not a valid one).
    p.add_argument("glob", nargs="?", default=r".*\.xml",
                   help="regular expression selecting corpus files "
                        "(default: %(default)s)")
    main(p.parse_args())
#!/bin/bash
# Usage: <this script> TREEBANK
#
# Pipeline stages:
#   1. dep_tregex grep  - select trees from TREEBANK with the given pattern
#                         (an "adj" token with deprel "MOD" next to an "n"
#                         token, going by the pattern text).
#   2. dep_tregex shuf  - shuffle the selected trees.
#   3. inline awk       - remember field 2 of a line matching /adj/ and, if
#                         the next line matches /n/, print "<adj> <noun>".
#   4. classify.awk     - append a class label to each pair; the file is
#                         looked up relative to the current directory.
#
# "$1" is quoted so that treebank paths containing spaces or glob
# characters are passed to the redirection intact.
#
# NOTE(review): /adj/ and /n/ match anywhere on the line, so /n/ fires on
# any line containing the letter "n"; consider testing the tag field
# instead once the exact column layout is confirmed.
python -mdep_tregex grep 'mod cpostag "adj" and deprel "MOD" and <--. (w2 cpostag "n")' < "$1" \
| python -mdep_tregex shuf \
| awk '/adj/ {sawAdj=1; adj=$2; next} /n/ {if (sawAdj) print adj, $2; sawAdj=0; next} {sawAdj=0}' \
| awk -f classify.awk
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment