Skip to content

Instantly share code, notes, and snippets.

@hans
Last active August 13, 2018 18:06
Show Gist options
  • Save hans/96b706dcdf62689120ebd2e899767b28 to your computer and use it in GitHub Desktop.
MEG decoding study: Corpus search for candidate pairs of attested verb-noun combinations
"""
Calculate statistics on verb-argument pairings given a parsed corpus
of CoNLL-U-formatted files. Part-of-speech tags and dependency heads+labels
are required.
"""
from collections import Counter
from pathlib import Path
import re
import sys
from tqdm import tqdm
# Direct objects that are too generic / pronominal to be informative.
object_stopwords = frozenset([
    "what",
    "thing",
    "something",
    "anything",
    "nothing",
    "everything",
    "anyone",
    "everyone",
    "someone",
    "nobody",
    "nowhere",
    "anywhere",
    "everywhere",
    "anytime",
    "other",
])

# Exclude arguments which are likely to be non-countable / unlikely to accept
# an indefinite (e.g. "happiness", "permission").
exclude_re = re.compile(r"ness$|ssion$")


def consume_sentence(vocab, sentence):
    """
    Update the verb-object vocabulary with counts from a new sentence.

    Args:
        vocab: Counter keyed by (verb, direct_object) lowercase string pairs.
        sentence: list of (word, pos, head, deprel) tuples, where `head` is a
            0-based index into `sentence` (the root row's head is -1).
    """
    # Find the root verb of the sentence.
    try:
        root_idx, root_word = next(
            (i, word) for i, (word, _, _, deprel) in enumerate(sentence)
            if deprel == "ROOT")
    except StopIteration:
        # No root in sentence. Do nothing.
        return

    # Find a direct object of the root which is a content noun: not a generic
    # stopword and not shaped like a mass noun.
    try:
        dobj_word = next(word for word, pos, head, deprel in sentence
                         if pos == "NOUN" and head == root_idx
                         and deprel == "dobj"
                         and word not in object_stopwords
                         and not exclude_re.search(word))
    except StopIteration:
        # No acceptable direct object. Do nothing.
        return

    root_word = root_word.lower()
    dobj_word = dobj_word.lower()
    vocab[root_word, dobj_word] += 1
def main(corpus_dir):
    """
    Walk all .txt files under `corpus_dir`, parse tab-separated CoNLL-style
    token lines, and print (verb, direct object, frequency) triples to stdout,
    tab-separated, most frequent first.
    """
    vocab = Counter()

    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it never mixes with the stdout results.
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)

        with doc.open("r") as doc_f:
            sentence = []
            for line in doc_f:
                line = line.strip()
                if line:
                    fields = line.split("\t")
                    try:
                        lem = fields[2].lower()
                        pos = fields[3]
                        # Convert the 1-based CoNLL head column to a 0-based
                        # index into `sentence` (root rows become -1).
                        head = int(fields[6]) - 1
                        deprel = fields[7]
                    except (IndexError, ValueError):
                        # Malformed token line -- skip it.
                        continue
                    else:
                        sentence.append((lem, pos, head, deprel))
                else:
                    # Blank line terminates a sentence.
                    consume_sentence(vocab, sentence)
                    sentence = []

            # Flush the final sentence when the file lacks a trailing blank
            # line; otherwise it would be silently dropped.
            if sentence:
                consume_sentence(vocab, sentence)

    for (verb, obj), count in vocab.most_common():
        print("%s\t%s\t%i" % (verb, obj, count))


if __name__ == '__main__':
    main(sys.argv[1])
from collections import Counter
from pathlib import Path
import sys
from tqdm import tqdm
def main(corpus_dir):
    """
    Count token frequencies over all .txt files under `corpus_dir` and print
    (token, frequency) pairs to stdout, tab-separated, most frequent first.
    """
    vocab = Counter()

    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it does not pollute the stdout results
        # (consistent with the pair-counting script).
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)

        with doc.open("r") as doc_f:
            for line in doc_f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split("\t")
                try:
                    # Column 1 holds the surface word form.
                    word = fields[1].lower()
                except IndexError:
                    # Malformed token line -- skip it.
                    continue
                vocab[word] += 1

    for token, count in vocab.most_common():
        print("%s\t%i" % (token, count))


if __name__ == '__main__':
    main(sys.argv[1])
"""
Search for tuples of nouns and verbs (n1, n2, v1, v2) satisfying the following constraints:
1. n1 begins with a vowel
2. n2 begins with a consonant
3. v1-n1 is attested and v2-n2 is attested
3a. Relative frequencies are similar -- 1/2 < freq(v1-n1) / freq(v2-n2) < 2
4. [requires human] v1-n2 and v2-n1 yield a lexical-semantic violation (==> v1 and v2 have nontrivial selectional preferences)
5. [requires human] n1 and n2 must have different singular and plural wordforms
6. [requires human] n1 and n2 should be plausible under different determiners ("a" vs. "two")
Example set:
ate-apple
drove-car
"ate an apple" and "drove a car" are acceptable;
"ate a car" and "drove an apple" yield semantic violations
"""
from collections import Counter
import itertools
import sys
# Letters treated as vowels for the "takes 'an'" test; input words are
# lowercased upstream, so lowercase-only checks suffice.
VOWELS = ("a", "e", "i", "o", "u")


def iter_candidates(verb_arg_list, verbs, args):
    """
    Yield pairs of verb-noun pairs satisfying the search conditions.

    Args:
        verb_arg_list: Counter mapping (verb, noun) pairs to frequencies.
        verbs: marginal verb frequencies (currently unused).
        args: marginal argument frequencies (currently unused).

    Yields:
        ((v1, n1, f1), (v2, n2, f2)) where n1 is vowel-initial, n2 is
        consonant-initial, v1 != v2, 1/2 < f1 / f2 < 2, and neither crossed
        pair (v1, n2) nor (v2, n1) is attested.
    """
    candidate_v1_n1, candidate_v2_n2 = [], []
    for (verb, arg), freq in verb_arg_list.most_common():
        if arg.startswith(VOWELS):
            candidate_v1_n1.append((verb, arg, freq))
        else:
            candidate_v2_n2.append((verb, arg, freq))

    for (v1, n1, f1), (v2, n2, f2) in itertools.product(candidate_v1_n1,
                                                        candidate_v2_n2):
        if v1 == v2:
            continue

        # Bound relative frequency strictly -- 1/2 < f1/f2 < 2, per
        # constraint 3a in the module docstring (a ratio of exactly 2
        # is rejected).
        if f1 / f2 >= 2 or f2 / f1 >= 2:
            continue

        # TODO control for n1 vs. n2 frequency?

        # Ensure that the crossed pairs (v1, n2) and (v2, n1) are unattested.
        if verb_arg_list[v1, n2] > 0 or verb_arg_list[v2, n1] > 0:
            continue

        # It's a candidate!
        yield (v1, n1, f1), (v2, n2, f2)
def main(verb_arg_list):
    """
    Print every candidate quadruple as one tab-separated line, including the
    crossed (violation) combinations for human inspection.
    """
    # Marginal frequencies for every verb and every argument. These are
    # handed to iter_candidates, which does not consume them yet.
    verbs = Counter()
    args = Counter()
    for (verb, arg), freq in verb_arg_list.items():
        verbs[verb] += freq
        args[arg] += freq

    for candidate in iter_candidates(verb_arg_list, verbs, args):
        (v1, n1, f1), (v2, n2, f2) = candidate
        row = "%s\t%s\t%i\t%s\t%s\t%i\t%s %s\t%s %s" % (
            v1, n1, f1, v2, n2, f2, v1, n2, v2, n1)
        print(row)
if __name__ == '__main__':
    verb_arg_file = sys.argv[1]

    # Load the tab-separated (verb, argument, frequency) list produced by
    # the pair-counting script.
    verb_arg_list = Counter()
    with open(verb_arg_file, "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            verb, arg, freq = stripped.split("\t")
            verb_arg_list[verb, arg] = int(freq)

    main(verb_arg_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment