hans · January 21, 2018 16:14
diff --git a/classify.awk b/classify.awk
 # Tag each "adjective noun" record with the semantic class of its adjective.
 # Input: field 1 is looked up in the table below, field 2 is passed through.
 # Output: "<field 1> <field 2> <class>", with class "?" for unlisted words.
 BEGIN {
  # size
  c["little"] = "size";
  c["big"] = "size";
  c["small"] = "size";
  c["tall"] = "size";
  c["short"] = "size";
  c["high"] = "size";
  c["large"] = "size";
  c["huge"] = "size";
  c["big-CP"] = "size";

  # color
  c["white"] = "color";
  c["green"] = "color";
  c["blue"] = "color";
  c["red"] = "color";
  c["yellow"] = "color";
  c["orange"] = "color";
  c["black"] = "color";
  c["brown"] = "color";
  c["blonde"] = "color";
  c["pink"] = "color";

  # char
  c["good"] = "char";
  c["bad"] = "char";
  c["good-CP"] = "char";
  c["mean"] = "char";
  c["terrible"] = "char";
  c["curious"] = "char";
  c["nice"] = "char";
  c["pretty"] = "char";
  c["violent"] = "char";

  # ord
  c["first"] = "ord";
  c["last"] = "ord";
  c["next"] = "ord";
  c["final"] = "ord";

  # age
  c["old"] = "age";
  c["new"] = "age";
  c["ole"] = "age";

  # other
  c["different"] = "other";
  c["wild"] = "other";
  c["own"] = "other";

  # sound / shape
  c["loud"] = "sound";
  c["wide"] = "shape";
 }

 {
  # Words missing from the table are reported with the placeholder "?".
  class = ($1 in c) ? c[$1] : "?";
  print $1, $2, class;
 }
diff --git a/convert_conllu.py b/convert_conllu.py
 """
 Output CoNLL-U parse file from CHILDES corpus.
 """

 from argparse import ArgumentParser
 import sys

 import nltk
 from nltk.corpus.reader import CHILDESCorpusReader


 def print_conllu(sentence, out):
  """Write one parsed sentence to ``out`` as a CoNLL-U block.

  Args:
    sentence: sequence of ``(word, tag, relation)`` triples, where
      ``relation`` is a string of the form ``"index|head|deprel"``.
    out: writable text stream (anything with ``write``) receiving the
      block: one tab-separated 10-column line per token followed by a
      blank line.

  Nothing is written when the sentence is empty, when its first word is
  ``"xxx"``, or when any word contains ``"~"``.

  Raises:
    ValueError: a relation does not have exactly three ``|``-separated
      fields, or an index field is not an integer.
    KeyError: a head index does not refer to any token of the sentence.
  """
  if not sentence or sentence[0][0] == "xxx":
    # ???
    return

  # Ignore sentences with MWEs. TODO does this create a bad sampling bias?
  if any("~" in word for word, _, _ in sentence):
    return

  # The dependency relation format uses its own indices. First resolve these
  # indices to be the same as basic sentence indices (1-based; 0 maps to 0).
  real_indices = {0: 0}
  for position, (_, _, relation) in enumerate(sentence, start=1):
    real_indices[int(relation.split('|')[0])] = position

  # Build every row before writing so that a malformed relation raises
  # without leaving a partial sentence in the output.
  rows = []
  for position, (word, tag, relation) in enumerate(sentence, start=1):
    _, given_head, head_reln = relation.split('|')
    fields = (position, word, "_", tag, "_", "_",
              real_indices[int(given_head)], head_reln, "_", "_")
    rows.append("\t".join(str(field) for field in fields))

  # Write to the stream the caller supplied rather than always to stdout.
  out.write("\n".join(rows) + "\n\n")


 def main(args):
  """Print every sentence of the selected CHILDES files as CoNLL-U.

  Args:
    args: parsed arguments providing ``dir`` and ``glob``; both are passed
      to ``CHILDESCorpusReader`` to select the corpus files.

  Sentences for which ``print_conllu`` raises are skipped.
  """
  corpus = CHILDESCorpusReader(args.dir, args.glob)
  for fileid in corpus.fileids():
    for sentence in corpus.words(fileid, relation=True):
      try:
        print_conllu(sentence, sys.stdout)
      except Exception:
        # Some of the sentences bork because the parses aren't complete. Oh well.
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit still stop the run.
        pass


 if __name__ == '__main__':
  # Command-line entry point: convert_conllu.py DIR [GLOB]
  p = ArgumentParser()

  p.add_argument("dir")
  # argparse only applies ``default`` to a positional when it is optional,
  # so nargs="?" is needed for "*.xml" to be used when GLOB is omitted.
  p.add_argument("glob", nargs="?", default="*.xml")

  main(p.parse_args())
diff --git a/filter.sh b/filter.sh
 #!/bin/bash
 # Usage: filter.sh FILE
 # Feeds FILE to `dep_tregex grep` with the pattern below, shuffles the
 # result, reduces it to "adjective noun" pairs (an /adj/ line directly
 # followed by an /n/ line; field 2 of each), and classifies each pair with
 # classify.awk. "$1" is quoted so paths containing spaces work.

 python -mdep_tregex grep 'mod cpostag "adj" and deprel "MOD" and <--. (w2 cpostag "n")' < "$1" \
  | python -mdep_tregex shuf \
  | awk '/adj/ {sawAdj=1; adj=$2; next} /n/ {if (sawAdj) print adj, $2; sawAdj=0; next} {sawAdj=0}' \
  | awk -f classify.awk
	# Tag each "adjective noun" record with the semantic class of its adjective.
	# Input: field 1 is looked up in the table below, field 2 is passed through.
	# Output: "<field 1> <field 2> <class>", with class "?" for unlisted words.
	BEGIN {
	  # size
	  c["little"] = "size";
	  c["big"] = "size";
	  c["small"] = "size";
	  c["tall"] = "size";
	  c["short"] = "size";
	  c["high"] = "size";
	  c["large"] = "size";
	  c["huge"] = "size";
	  c["big-CP"] = "size";

	  # color
	  c["white"] = "color";
	  c["green"] = "color";
	  c["blue"] = "color";
	  c["red"] = "color";
	  c["yellow"] = "color";
	  c["orange"] = "color";
	  c["black"] = "color";
	  c["brown"] = "color";
	  c["blonde"] = "color";
	  c["pink"] = "color";

	  # char
	  c["good"] = "char";
	  c["bad"] = "char";
	  c["good-CP"] = "char";
	  c["mean"] = "char";
	  c["terrible"] = "char";
	  c["curious"] = "char";
	  c["nice"] = "char";
	  c["pretty"] = "char";
	  c["violent"] = "char";

	  # ord
	  c["first"] = "ord";
	  c["last"] = "ord";
	  c["next"] = "ord";
	  c["final"] = "ord";

	  # age
	  c["old"] = "age";
	  c["new"] = "age";
	  c["ole"] = "age";

	  # other
	  c["different"] = "other";
	  c["wild"] = "other";
	  c["own"] = "other";

	  # sound / shape
	  c["loud"] = "sound";
	  c["wide"] = "shape";
	}

	{
	  # Words missing from the table are reported with the placeholder "?".
	  class = ($1 in c) ? c[$1] : "?";
	  print $1, $2, class;
	}
	"""
	Output CoNLL-U parse file from CHILDES corpus.
	"""

	from argparse import ArgumentParser
	import sys

	import nltk
	from nltk.corpus.reader import CHILDESCorpusReader


	def print_conllu(sentence, out):
	  """Write one parsed sentence to ``out`` as a CoNLL-U block.

	  Args:
	    sentence: sequence of ``(word, tag, relation)`` triples, where
	      ``relation`` is a string of the form ``"index|head|deprel"``.
	    out: writable text stream (anything with ``write``) receiving the
	      block: one tab-separated 10-column line per token followed by a
	      blank line.

	  Nothing is written when the sentence is empty, when its first word is
	  ``"xxx"``, or when any word contains ``"~"``.

	  Raises:
	    ValueError: a relation does not have exactly three ``|``-separated
	      fields, or an index field is not an integer.
	    KeyError: a head index does not refer to any token of the sentence.
	  """
	  if not sentence or sentence[0][0] == "xxx":
	    # ???
	    return

	  # Ignore sentences with MWEs. TODO does this create a bad sampling bias?
	  if any("~" in word for word, _, _ in sentence):
	    return

	  # The dependency relation format uses its own indices. First resolve these
	  # indices to be the same as basic sentence indices (1-based; 0 maps to 0).
	  # The separator is a bare "|"; an escaped pipe would never match.
	  real_indices = {0: 0}
	  for position, (_, _, relation) in enumerate(sentence, start=1):
	    real_indices[int(relation.split('|')[0])] = position

	  # Build every row before writing so that a malformed relation raises
	  # without leaving a partial sentence in the output.
	  rows = []
	  for position, (word, tag, relation) in enumerate(sentence, start=1):
	    _, given_head, head_reln = relation.split('|')
	    fields = (position, word, "_", tag, "_", "_",
	              real_indices[int(given_head)], head_reln, "_", "_")
	    rows.append("\t".join(str(field) for field in fields))

	  # Write to the stream the caller supplied rather than always to stdout.
	  out.write("\n".join(rows) + "\n\n")


	def main(args):
	  """Print every sentence of the selected CHILDES files as CoNLL-U.

	  Args:
	    args: parsed arguments providing ``dir`` and ``glob``; both are passed
	      to ``CHILDESCorpusReader`` to select the corpus files.

	  Sentences for which ``print_conllu`` raises are skipped.
	  """
	  corpus = CHILDESCorpusReader(args.dir, args.glob)
	  for fileid in corpus.fileids():
	    for sentence in corpus.words(fileid, relation=True):
	      try:
	        print_conllu(sentence, sys.stdout)
	      except Exception:
	        # Some of the sentences bork because the parses aren't complete. Oh well.
	        # Catch Exception rather than everything so that Ctrl-C and
	        # SystemExit still stop the run.
	        pass


	if __name__ == '__main__':
	  # Command-line entry point: convert_conllu.py DIR [GLOB]
	  p = ArgumentParser()

	  p.add_argument("dir")
	  # argparse only applies ``default`` to a positional when it is optional,
	  # so nargs="?" is needed for "*.xml" to be used when GLOB is omitted.
	  p.add_argument("glob", nargs="?", default="*.xml")

	  main(p.parse_args())
	#!/bin/bash
	# Usage: filter.sh FILE
	# Feeds FILE to `dep_tregex grep` with the pattern below, shuffles the
	# result, reduces it to "adjective noun" pairs (an /adj/ line directly
	# followed by an /n/ line; field 2 of each), and classifies each pair with
	# classify.awk. The stages are joined with real pipes (an escaped pipe would
	# be passed as a literal argument) and "$1" is quoted so paths with spaces work.

	python -mdep_tregex grep 'mod cpostag "adj" and deprel "MOD" and <--. (w2 cpostag "n")' < "$1" \
	  | python -mdep_tregex shuf \
	  | awk '/adj/ {sawAdj=1; adj=$2; next} /n/ {if (sawAdj) print adj, $2; sawAdj=0; next} {sawAdj=0}' \
	  | awk -f classify.awk