Skip to content

Instantly share code, notes, and snippets.

@hans
Last active August 13, 2018 18:06
Show Gist options
  • Save hans/96b706dcdf62689120ebd2e899767b28 to your computer and use it in GitHub Desktop.
MEG decoding study: Corpus search for candidate pairs of attested verb-noun combinations
"""
Calculate statistics on verb-argument pairings given a parsed corpus
of CoNLL-U-formatted files. Part-of-speech tags and dependency heads+labels
are required.
"""
from collections import Counter
from pathlib import Path
import re
import sys
from tqdm import tqdm
# Direct objects that are too generic / pronominal to be informative.
object_stopwords = frozenset([
    "what",
    "thing",
    "something",
    "anything",
    "nothing",
    "everything",
    "anyone",
    "everyone",
    "someone",
    "nobody",
    "nowhere",
    "anywhere",
    "everywhere",
    "anytime",
    "other",
])

# Exclude arguments which are likely to be non-countable / unlikely to accept
# an indefinite (e.g. "happiness", "permission").
exclude_re = re.compile(r"ness$|ssion$")


def consume_sentence(vocab, sentence):
    """
    Update the verb-object vocabulary with counts from a new sentence.

    Args:
        vocab: Counter keyed by (verb, direct_object) lowercase string pairs.
        sentence: list of (word, pos, head, deprel) tuples, where `head` is a
            0-based index into `sentence` (the root row's head is -1).
    """
    # Find the root verb of the sentence.
    try:
        root_idx, root_word = next(
            (i, word) for i, (word, _, _, deprel) in enumerate(sentence)
            if deprel == "ROOT")
    except StopIteration:
        # No root in sentence. Do nothing.
        return

    # Find a direct object of the root which is a content noun: not a generic
    # stopword and not shaped like a mass noun.
    try:
        dobj_word = next(word for word, pos, head, deprel in sentence
                         if pos == "NOUN" and head == root_idx
                         and deprel == "dobj"
                         and word not in object_stopwords
                         and not exclude_re.search(word))
    except StopIteration:
        # No acceptable direct object. Do nothing.
        return

    root_word = root_word.lower()
    dobj_word = dobj_word.lower()
    vocab[root_word, dobj_word] += 1
def main(corpus_dir):
    """
    Walk all .txt files under `corpus_dir`, parse tab-separated CoNLL-style
    token lines, and print (verb, direct object, frequency) triples to stdout,
    tab-separated, most frequent first.
    """
    vocab = Counter()

    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it never mixes with the stdout results.
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)

        with doc.open("r") as doc_f:
            sentence = []
            for line in doc_f:
                line = line.strip()
                if line:
                    fields = line.split("\t")
                    try:
                        lem = fields[2].lower()
                        pos = fields[3]
                        # Convert the 1-based CoNLL head column to a 0-based
                        # index into `sentence` (root rows become -1).
                        head = int(fields[6]) - 1
                        deprel = fields[7]
                    except (IndexError, ValueError):
                        # Malformed token line -- skip it.
                        continue
                    else:
                        sentence.append((lem, pos, head, deprel))
                else:
                    # Blank line terminates a sentence.
                    consume_sentence(vocab, sentence)
                    sentence = []

            # Flush the final sentence when the file lacks a trailing blank
            # line; otherwise it would be silently dropped.
            if sentence:
                consume_sentence(vocab, sentence)

    for (verb, obj), count in vocab.most_common():
        print("%s\t%s\t%i" % (verb, obj, count))


if __name__ == '__main__':
    main(sys.argv[1])
from collections import Counter
from pathlib import Path
import sys
from tqdm import tqdm
def main(corpus_dir):
    """
    Count token frequencies over all .txt files under `corpus_dir` and print
    (token, frequency) pairs to stdout, tab-separated, most frequent first.
    """
    vocab = Counter()

    files = list(Path(corpus_dir).glob("**/*.txt"))
    for doc in tqdm(files, desc="Reading files"):
        # Progress goes to stderr so it does not pollute the stdout results
        # (consistent with the pair-counting script).
        tqdm.write("Vocab size so far: %i" % len(vocab), file=sys.stderr)

        with doc.open("r") as doc_f:
            for line in doc_f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split("\t")
                try:
                    # Column 1 holds the surface word form.
                    word = fields[1].lower()
                except IndexError:
                    # Malformed token line -- skip it.
                    continue
                vocab[word] += 1

    for token, count in vocab.most_common():
        print("%s\t%i" % (token, count))


if __name__ == '__main__':
    main(sys.argv[1])
"""
Search for tuples of nouns and verbs (n1, n2, v1, v2) satisfying the following constraints:
1. n1 begins with a vowel
2. n2 begins with a consonant
3. v1-n1 is attested and v2-n2 is attested
3a. Relative frequencies are similar -- 1/2 < freq(v1-n1) / freq(v2-n2) < 2
4. [requires human] v1-n2 and v2-n1 yield a lexical-semantic violation (==> v1 and v2 have nontrivial selectional preferences)
5. [requires human] n1 and n2 must have different singular and plural wordforms
6. [requires human] n1 and n2 should be plausible under different determiners ("a" vs. "two")
Example set:
ate-apple
drove-car
"ate an apple" and "drove a car" are acceptable;
"ate a car" and "drove an apple" yield semantic violations
"""
from collections import Counter
import itertools
import sys
# Letters treated as vowels for the "takes 'an'" test; input words are
# lowercased upstream, so lowercase-only checks suffice.
VOWELS = ("a", "e", "i", "o", "u")


def iter_candidates(verb_arg_list, verbs, args):
    """
    Yield pairs of verb-noun pairs satisfying the search conditions.

    Args:
        verb_arg_list: Counter mapping (verb, noun) pairs to frequencies.
        verbs: marginal verb frequencies (currently unused).
        args: marginal argument frequencies (currently unused).

    Yields:
        ((v1, n1, f1), (v2, n2, f2)) where n1 is vowel-initial, n2 is
        consonant-initial, v1 != v2, 1/2 < f1 / f2 < 2, and neither crossed
        pair (v1, n2) nor (v2, n1) is attested.
    """
    candidate_v1_n1, candidate_v2_n2 = [], []
    for (verb, arg), freq in verb_arg_list.most_common():
        if arg.startswith(VOWELS):
            candidate_v1_n1.append((verb, arg, freq))
        else:
            candidate_v2_n2.append((verb, arg, freq))

    for (v1, n1, f1), (v2, n2, f2) in itertools.product(candidate_v1_n1,
                                                        candidate_v2_n2):
        if v1 == v2:
            continue

        # Bound relative frequency strictly -- 1/2 < f1/f2 < 2, per
        # constraint 3a in the module docstring (a ratio of exactly 2
        # is rejected).
        if f1 / f2 >= 2 or f2 / f1 >= 2:
            continue

        # TODO control for n1 vs. n2 frequency?

        # Ensure that the crossed pairs (v1, n2) and (v2, n1) are unattested.
        if verb_arg_list[v1, n2] > 0 or verb_arg_list[v2, n1] > 0:
            continue

        # It's a candidate!
        yield (v1, n1, f1), (v2, n2, f2)
def main(verb_arg_list):
    """
    Print every candidate quadruple as one tab-separated line, including the
    crossed (violation) combinations for human inspection.
    """
    # Marginal frequencies for every verb and every argument. These are
    # handed to iter_candidates, which does not consume them yet.
    verbs = Counter()
    args = Counter()
    for (verb, arg), freq in verb_arg_list.items():
        verbs[verb] += freq
        args[arg] += freq

    for candidate in iter_candidates(verb_arg_list, verbs, args):
        (v1, n1, f1), (v2, n2, f2) = candidate
        row = "%s\t%s\t%i\t%s\t%s\t%i\t%s %s\t%s %s" % (
            v1, n1, f1, v2, n2, f2, v1, n2, v2, n1)
        print(row)
if __name__ == '__main__':
    verb_arg_file = sys.argv[1]

    # Load the tab-separated (verb, argument, frequency) list produced by
    # the pair-counting script.
    verb_arg_list = Counter()
    with open(verb_arg_file, "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            verb, arg, freq = stripped.split("\t")
            verb_arg_list[verb, arg] = int(freq)

    main(verb_arg_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment