Skip to content

Instantly share code, notes, and snippets.

@hans
Created October 16, 2018 22:15
Show Gist options
  • Save hans/1b8ee7b5cb044dba71793e1b93c1ac47 to your computer and use it in GitHub Desktop.
Merge Hartshorne event annotations with CHILDES transcripts and do some initial analysis
from argparse import ArgumentParser
from collections import namedtuple
from pathlib import Path
import re
import pandas as pd
# Main tier: "*SPK: some words . <0x15>START_END<0x15>" — captures speaker,
# the spoken text, and the millisecond start/end offsets framed by \x15.
UTT_RE = re.compile(r"\*([A-Z]+):\s*(.+)\s*\x15(\d+)_(\d+)\x15$")
# One "pos|lemma" pair on a %mor dependent tier.
TAG_RE = re.compile(r"([a-z:]+)\|(\w+)")

Utterance = namedtuple("Utterance", ["start", "end", "speaker", "tokens", "lemmas", "tags"])


def process_utterance(utt):
    """
    Parse one built-up CHILDES utterance (a main `*SPK:` tier plus any
    dependent tiers) into an `Utterance` record.

    Parameters
    ----------
    utt : list[str]
        Stripped transcript lines belonging to a single utterance.

    Returns
    -------
    Utterance or None
        None when no main tier with time bounds was found. `lemmas`/`tags`
        are None when the utterance has no `%mor` tier.
    """
    speaker = words = begin_ms = end_ms = morphology = None
    for line in utt:
        main_hits = UTT_RE.findall(line)
        if main_hits:
            # Last matching main tier wins, mirroring a plain overwrite.
            speaker, words, begin_ms, end_ms = main_hits[0]
            begin_ms, end_ms = int(begin_ms), int(end_ms)
        elif line.startswith("%mor"):
            morphology = TAG_RE.findall(line)

    if words is None:
        # No recognizable main tier — nothing to build.
        return None

    word_list = words.strip(" .!?").split()
    if morphology is None:
        return Utterance(begin_ms, end_ms, speaker, word_list, None, None)

    pos_tags = [pos for pos, _ in morphology]
    lemma_list = [lemma for _, lemma in morphology]
    return Utterance(begin_ms, end_ms, speaker, word_list, lemma_list, pos_tags)
def parse_cha_file(stream):
    """
    Parse a CHILDES .cha transcript into a list of Utterance records,
    sorted by utterance start time (milliseconds).

    Parameters
    ----------
    stream : pathlib.Path
        Path to the .cha transcript file.

    Returns
    -------
    list[Utterance]
        All parseable utterances, ordered by `start`.
    """
    utterances = []
    pending = []

    def _flush():
        # Convert the accumulated lines of one utterance, if any.
        if pending:
            parsed = process_utterance(pending)
            if parsed is not None:
                utterances.append(parsed)
            pending.clear()

    # BUG FIX: the original never closed the file handle; `with` guarantees it.
    with stream.open("r") as fh:
        for line in fh:
            line = line.strip()
            # Skip blanks and @-prefixed header/metadata lines.
            if not line or line.startswith("@"):
                continue
            if line.startswith("*"):
                # A new main tier starts: finish the previous utterance.
                _flush()
            pending.append(line)

    # BUG FIX: the original dropped the final utterance because it only
    # flushed when the *next* main tier appeared.
    _flush()

    # Ensure sorting by utterance start.
    return sorted(utterances, key=lambda u: u.start)
if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument("cha_file", type=Path, help="Path to CHILDES .cha transcript file")
    p.add_argument("annot_file", type=Path, help="Path to Hartshorne annotation file")
    args = p.parse_args()

    utterances = parse_cha_file(args.cha_file)
    events = pd.read_csv(args.annot_file)

    def extract_ts(t_string):
        """
        Compute millisecond bounds from Hartshorne annotations so that we can
        compare directly with the bounds in the CHILDES transcripts.

        Accepts either "M:SS-M:SS" or the annotator variant
        "Full Video - M-M" (minutes only, seconds taken as 0).
        """
        if t_string.startswith("Full Video"):
            # inconsistent annotators..
            t_start_m, t_end_m = re.match(r"^Full Video - (\d+)-(\d+)$", t_string).groups()
            t_start_s, t_end_s = "0", "0"
        else:
            t_start_m, t_start_s, t_end_m, t_end_s = \
                re.search(r"(\d+):(\d+)-(\d+):(\d+)", t_string).groups()
        return int(t_start_m, 10) * 60 * 1000 + int(t_start_s, 10) * 1000, \
            int(t_end_m, 10) * 60 * 1000 + int(t_end_s, 10) * 1000

    events["t_start"], events["t_end"] = \
        zip(*events["Time"].map(extract_ts))

    # Extract all observed time ranges; for each time range, update the
    # relevant rows with the nouns/verbs spoken in that time range.
    range_groups = events.groupby(["t_start", "t_end"])
    for (t_start, t_end), group in range_groups:
        nouns, verbs = [], []
        for utt in utterances:
            if utt.start < t_start:
                continue
            elif utt.start > t_end:
                # utterances are sorted by start, so nothing later can match.
                break
            # => t_start <= utt.start <= t_end
            if utt.lemmas is not None and utt.tags is not None:
                nouns.extend([lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                              if tag == "n"])
                verbs.extend([lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                              if tag == "v"])
        events.loc[group.index, "utt_nouns"] = " ".join(sorted(nouns))
        events.loc[group.index, "utt_verbs"] = " ".join(sorted(verbs))

    # Exclude events with no verb utterances in their time range.
    events = events[events.utt_verbs != ""]
    # Exclude events with null labels.
    events = events[events.Verb.notnull()]

    # How often does the glossed verb of an event appear within the utterances
    # of that event's time range?
    # BUG FIX: the original used a substring test on the space-joined verb
    # string, so e.g. "go" spuriously matched "gone"; compare whole lemmas.
    print("Proportion of event rows whose gloss appears among the relevant utterances:")
    print(events.apply(lambda r: r.Verb.lower() in r.utt_verbs.split(), axis=1).mean())

    events.to_csv("test.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment