Last active
August 13, 2018 18:06
-
-
Save hans/96b706dcdf62689120ebd2e899767b28 to your computer and use it in GitHub Desktop.
MEG decoding study: Corpus search for candidate pairs of attested verb-noun combinations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Calculate statistics on verb-argument pairings given a parsed corpus | |
of CoNLL-U-formatted files. Part-of-speech tags and dependency heads+labels | |
are required. | |
""" | |
from collections import Counter | |
from pathlib import Path | |
import re | |
import sys | |
from tqdm import tqdm | |
# Arguments with little lexical content (wh-words, indefinite pronouns, etc.)
# which would yield uninformative verb-object pairs.
object_stopwords = frozenset([
    "what",
    "thing",
    "something",
    "anything",
    "nothing",
    "everything",
    "anyone",
    "everyone",
    "someone",
    "nobody",
    "nowhere",
    "anywhere",
    "everywhere",
    "anytime",
    "other",
])

# Exclude arguments which are likely to be non-countable / unlikely to accept
# an indefinite (e.g. "happiness", "permission").
exclude_re = re.compile(r"ness$|ssion$")


def consume_sentence(vocab, sentence):
    """
    Update vocabulary given the new sentence.

    Finds the sentence's root token and a direct object of it that is a
    contentful, countable noun, and increments the (verb, object) lemma
    pair in ``vocab``. Does nothing if either is missing.

    Args:
        vocab: Counter mapping (verb, object) lemma pairs to frequencies;
            mutated in place.
        sentence: list of (lemma, pos, head, deprel) tuples, one per token,
            where ``head`` is a 0-based index into ``sentence``.
    """
    # Find root verb. `next` with a default replaces the original bare
    # `except:`, which also swallowed unrelated errors (even KeyboardInterrupt).
    root = next(((i, word) for i, (word, _, _, deprel) in enumerate(sentence)
                 if deprel == "ROOT"), None)
    if root is None:
        # No root in sentence. Do nothing.
        return
    root_idx, root_word = root
    # Find a direct object of the root, filtering stopwords and likely
    # non-countable nouns.
    dobj_word = next((word for word, pos, head, deprel in sentence
                      if pos == "NOUN" and head == root_idx and deprel == "dobj"
                      and word not in object_stopwords
                      and not exclude_re.search(word)), None)
    if dobj_word is None:
        # Root has no usable direct object.
        return
    vocab[root_word.lower(), dobj_word.lower()] += 1
def main(corpus_dir):
    """
    Tally (root verb, direct object) lemma pairs over every ``*.txt``
    CoNLL-U file under *corpus_dir*, then print one
    ``verb<TAB>object<TAB>count`` line per pair, most frequent first.
    """
    vocab = Counter()
    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it does not pollute the TSV on stdout.
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)
        with doc.open("r", encoding="utf-8") as doc_f:
            sentence = []
            for line in doc_f:
                line = line.strip()
                if line:
                    fields = line.split("\t")
                    try:
                        lem = fields[2].lower()
                        pos = fields[3]
                        head = int(fields[6]) - 1
                        deprel = fields[7]
                    except (IndexError, ValueError):
                        # Malformed token line (too few columns or a
                        # non-integer head): skip just this token.
                        continue
                    else:
                        sentence.append((lem, pos, head, deprel))
                else:
                    # Blank line terminates a sentence.
                    consume_sentence(vocab, sentence)
                    sentence = []
            # Flush the final sentence of files lacking a trailing blank
            # line; the original version silently dropped it.
            if sentence:
                consume_sentence(vocab, sentence)
    for (verb, obj), count in vocab.most_common():
        print("%s\t%s\t%i" % (verb, obj, count))


if __name__ == '__main__':
    main(sys.argv[1])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from pathlib import Path | |
import sys | |
from tqdm import tqdm | |
def main(corpus_dir):
    """
    Count wordform frequencies over every ``*.txt`` CoNLL-U file under
    *corpus_dir*, then print one ``token<TAB>count`` line per type, most
    frequent first.
    """
    vocab = Counter()
    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it does not pollute the TSV on stdout
        # (the original wrote it to stdout, interleaving it with results;
        # the companion verb-object script already used file=sys.stderr).
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)
        with doc.open("r", encoding="utf-8") as doc_f:
            for line in doc_f:
                line = line.strip()
                if line:
                    fields = line.split("\t")
                    try:
                        # Column 1 of CoNLL-U is the surface wordform.
                        word = fields[1].lower()
                    except IndexError:
                        # Malformed token line (too few columns); skip it.
                        continue
                    else:
                        vocab[word] += 1
    for token, count in vocab.most_common():
        print("%s\t%i" % (token, count))


if __name__ == '__main__':
    main(sys.argv[1])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Search for tuples of nouns and verbs (n1, n2, v1, v2) satisfying the following constraints: | |
1. n1 begins with a vowel | |
2. n2 begins with a consonant | |
3. v1-n1 is attested and v2-n2 is attested | |
3a. Relative frequencies are similar -- 1/2 < freq(v1-n1) / freq(v2-n2) < 2 | |
4. [requires human] v1-n2 and v2-n1 yield a lexical-semantic violation (==> v1 and v2 have nontrivial selectional preferences) | |
5. [requires human] n1 and n2 must have different singular and plural wordforms | |
6. [requires human] n1 and n2 should be plausible under different determiners ("a" vs. "two") | |
Example set: | |
ate-apple | |
drove-car | |
"ate an apple" and "drove a car" are acceptable; | |
"ate a car" and "drove an apple" yield semantic violations | |
""" | |
from collections import Counter | |
import itertools | |
import sys | |
# Wordforms beginning with one of these letters take the indefinite "an".
VOWELS = ("a", "e", "i", "o", "u")


def iter_candidates(verb_arg_list, verbs, args):
    """
    Yield a pair of verb-noun pairs satisfying the conditions.
    """
    # Partition the attested pairs by the noun's initial letter: the
    # vowel-initial group supplies (v1, n1), the rest supply (v2, n2).
    vowel_initial, consonant_initial = [], []
    for (verb, noun), count in verb_arg_list.most_common():
        bucket = vowel_initial if noun.startswith(VOWELS) else consonant_initial
        bucket.append((verb, noun, count))

    for (v1, n1, f1), (v2, n2, f2) in itertools.product(vowel_initial,
                                                        consonant_initial):
        # The two verbs must differ for the cross-pairing to be meaningful.
        if v1 == v2:
            continue
        # Bound relative frequency: neither pair may be more than twice as
        # frequent as the other.
        if (f1 / f2 > 2) or (f2 / f1 > 2):
            continue
        # TODO control for n1 vs. n2 frequency?
        # The crossed combinations (v1, n2) and (v2, n1) must be unattested.
        if verb_arg_list[v1, n2] > 0 or verb_arg_list[v2, n1] > 0:
            continue
        # It's a candidate!
        yield (v1, n1, f1), (v2, n2, f2)
def main(verb_arg_list):
    """
    Print every candidate tuple, one per line: both attested verb-noun
    pairs with their frequencies, then the two crossed (unattested)
    combinations for human review.
    """
    # Marginal frequencies for all verbs and all arguments. (Currently
    # unused inside iter_candidates, but part of its call contract.)
    verbs = Counter()
    args = Counter()
    for (verb, arg), freq in verb_arg_list.items():
        verbs[verb] += freq
        args[arg] += freq

    for (v1, n1, f1), (v2, n2, f2) in iter_candidates(verb_arg_list, verbs, args):
        row = "%s\t%s\t%i\t%s\t%s\t%i\t%s %s\t%s %s" % (
            v1, n1, f1, v2, n2, f2, v1, n2, v2, n1)
        print(row)
if __name__ == '__main__':
    # Load the tab-separated "verb<TAB>arg<TAB>freq" table produced by the
    # counting script into a Counter keyed on (verb, arg) tuples.
    verb_arg_list = Counter()
    with open(sys.argv[1], "r") as table_f:
        for raw_line in table_f:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            verb, arg, freq_str = raw_line.split("\t")
            verb_arg_list[verb, arg] = int(freq_str)
    main(verb_arg_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment