Created
October 16, 2018 22:15
-
-
Save hans/1b8ee7b5cb044dba71793e1b93c1ac47 to your computer and use it in GitHub Desktop.
Merge Hartshorne event annotations with CHILDES transcripts and do some initial analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from argparse import ArgumentParser | |
from collections import namedtuple | |
from pathlib import Path | |
import re | |
import pandas as pd | |
# Main-tier line: "*SPK:  utterance text ␕start_end␕".
# Groups: speaker code, raw token string, start offset, end offset.
# \x15 (NAK) is the control character CHILDES uses to delimit the
# media time bullet; offsets appear to be milliseconds — TODO confirm.
UTT_RE = re.compile(r"\*([A-Z]+):\s*(.+)\s*\x15(\d+)_(\d+)\x15$")
# One "pos|lemma" pair on a %mor dependent tier (e.g. "v|go", "pro:sub|he").
# Group 1 is the part-of-speech tag, group 2 the lemma.
TAG_RE = re.compile(r"([a-z:]+)\|(\w+)")
# start/end: media offsets from the time bullet; lemmas/tags are None
# when the utterance had no %mor tier.
Utterance = namedtuple("Utterance", ["start", "end", "speaker", "tokens", "lemmas", "tags"])
def process_utterance(utt):
    """
    Extract utterance data from a CHILDES transcript.

    `utt` is the list of raw lines belonging to one utterance: the main
    "*SPK:" tier plus any dependent tiers (only "%mor" is used here).
    Returns an Utterance, or None when no main-tier line with a time
    bullet was found.
    """
    who = text = start_ms = end_ms = morphology = None
    for raw in utt:
        main_tier = re.findall(UTT_RE, raw)
        if main_tier:
            who, text, start_ms, end_ms = main_tier[0]
            start_ms, end_ms = int(start_ms), int(end_ms)
        elif raw.startswith("%mor"):
            # List of (pos_tag, lemma) pairs from the morphology tier.
            morphology = re.findall(TAG_RE, raw)
    if text is None:
        # No parsable main tier in this group of lines.
        return
    # Drop terminal punctuation before tokenizing on whitespace.
    words = text.strip(" .!?").split()
    if morphology is None:
        return Utterance(start_ms, end_ms, who, words, None, None)
    lemma_list = [pair[1] for pair in morphology]
    tag_list = [pair[0] for pair in morphology]
    return Utterance(start_ms, end_ms, who, words, lemma_list, tag_list)
def parse_cha_file(stream):
    """
    Parse a CHILDES .cha transcript into a list of Utterance tuples.

    Parameters
    ----------
    stream : pathlib.Path
        Path to the .cha transcript file.

    Returns
    -------
    list of Utterance, sorted by media start time. Line groups that
    cannot be parsed (process_utterance returns None) are skipped.
    """
    utterances, utt = [], []

    def flush(buffered):
        # Parse a buffered line group and keep it if it was a real utterance.
        if buffered:
            parsed = process_utterance(buffered)
            if parsed is not None:
                utterances.append(parsed)

    # `with` ensures the file handle is closed (the original leaked it).
    with stream.open("r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and "@" header/metadata lines.
            if not line or line.startswith("@"):
                continue
            if line.startswith("*"):
                # A new main tier starts a new utterance; process the
                # previously buffered one first.
                flush(utt)
                utt = []
            utt.append(line)
    # Bug fix: the original never processed the final buffered utterance,
    # silently dropping the last utterance of every file.
    flush(utt)

    # Sort by start time so downstream range scans can early-exit on `break`.
    utterances.sort(key=lambda u: u.start)
    return utterances
if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument("cha_file", type=Path, help="Path to CHILDES .cha transcript file")
    p.add_argument("annot_file", type=Path, help="Path to Hartshorne annotation file")
    args = p.parse_args()

    utterances = parse_cha_file(args.cha_file)
    events = pd.read_csv(args.annot_file)

    # Compiled once, outside the per-row mapping; raw strings fix the
    # original's invalid "\d" escapes in a non-raw literal.
    FULL_VIDEO_RE = re.compile(r"^Full Video - (\d+)-(\d+)$")
    RANGE_RE = re.compile(r"(\d+):(\d+)-(\d+):(\d+)")

    def extract_ts(t_string):
        """
        Compute millisecond bounds from Hartshorne annotations so that we can
        compare directly with the bounds in the CHILDES transcripts.
        """
        if t_string.startswith("Full Video"):
            # Inconsistent annotators: "Full Video - M-M" gives minutes only,
            # so seconds are taken as zero.
            t_start_m, t_end_m = FULL_VIDEO_RE.match(t_string).groups()
            t_start_s, t_end_s = "0", "0"
        else:
            t_start_m, t_start_s, t_end_m, t_end_s = \
                RANGE_RE.search(t_string).groups()
        return (int(t_start_m, 10) * 60 * 1000 + int(t_start_s, 10) * 1000,
                int(t_end_m, 10) * 60 * 1000 + int(t_end_s, 10) * 1000)

    events["t_start"], events["t_end"] = \
        zip(*events["Time"].map(extract_ts))

    # Extract all observed time ranges; for each range, attach to every event
    # row sharing it the noun and verb lemmas of all utterances whose start
    # falls inside the range.
    for (t_start, t_end), group in events.groupby(["t_start", "t_end"]):
        nouns, verbs = [], []
        for utt in utterances:
            if utt.start < t_start:
                continue
            elif utt.start > t_end:
                # Utterances are sorted by start, so no later one can match.
                break
            # => t_start <= utt.start <= t_end
            if utt.lemmas is not None and utt.tags is not None:
                nouns.extend(lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                             if tag == "n")
                verbs.extend(lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                             if tag == "v")
        events.loc[group.index, "utt_nouns"] = " ".join(sorted(nouns))
        events.loc[group.index, "utt_verbs"] = " ".join(sorted(verbs))

    # Exclude events with no utterances in their time range.
    events = events[events.utt_verbs != ""]
    # Exclude events with null labels.
    events = events[events.Verb.notnull()]

    # How often does the glossed verb of an event appear within the utterances
    # of that event's time range?  Bug fix: test whole-lemma membership via
    # split() — the original substring test counted e.g. "go" inside "ago".
    print("Proportion of event rows whose gloss appears among the relevant utterances:")
    print(events.apply(lambda r: r.Verb.lower() in r.utt_verbs.split(), axis=1).mean())

    events.to_csv("test.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment