Created
October 16, 2018 22:15
-
-
Save hans/1b8ee7b5cb044dba71793e1b93c1ac47 to your computer and use it in GitHub Desktop.
Merge Hartshorne event annotations with CHILDES transcripts and do some initial analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from argparse import ArgumentParser | |
from collections import namedtuple | |
from pathlib import Path | |
import re | |
import pandas as pd | |
# Main-tier line: "*SPK:  utterance text ␕start_end␕".
# Groups: speaker code, raw token string, start offset, end offset.
# \x15 (NAK) is the control character CHILDES uses to delimit the
# media time bullet; offsets appear to be milliseconds — TODO confirm.
UTT_RE = re.compile(r"\*([A-Z]+):\s*(.+)\s*\x15(\d+)_(\d+)\x15$")
# One "pos|lemma" pair on a %mor dependent tier (e.g. "v|go", "pro:sub|he").
# Group 1 is the part-of-speech tag, group 2 the lemma.
TAG_RE = re.compile(r"([a-z:]+)\|(\w+)")
# start/end: media offsets from the time bullet; lemmas/tags are None
# when the utterance had no %mor tier.
Utterance = namedtuple("Utterance", ["start", "end", "speaker", "tokens", "lemmas", "tags"])
def process_utterance(utt):
    """
    Extract utterance data from a CHILDES transcript.

    `utt` is the list of raw lines belonging to one utterance: the main
    "*SPK:" tier plus any dependent tiers (only "%mor" is used here).
    Returns an Utterance, or None when no main-tier line with a time
    bullet was found.
    """
    who = text = start_ms = end_ms = morphology = None
    for raw in utt:
        main_tier = re.findall(UTT_RE, raw)
        if main_tier:
            who, text, start_ms, end_ms = main_tier[0]
            start_ms, end_ms = int(start_ms), int(end_ms)
        elif raw.startswith("%mor"):
            # List of (pos_tag, lemma) pairs from the morphology tier.
            morphology = re.findall(TAG_RE, raw)
    if text is None:
        # No parsable main tier in this group of lines.
        return
    # Drop terminal punctuation before tokenizing on whitespace.
    words = text.strip(" .!?").split()
    if morphology is None:
        return Utterance(start_ms, end_ms, who, words, None, None)
    lemma_list = [pair[1] for pair in morphology]
    tag_list = [pair[0] for pair in morphology]
    return Utterance(start_ms, end_ms, who, words, lemma_list, tag_list)
def parse_cha_file(stream):
    """
    Parse a CHILDES .cha transcript into a list of Utterance tuples.

    Parameters
    ----------
    stream : pathlib.Path
        Path to the .cha transcript file.

    Returns
    -------
    list of Utterance, sorted by media start time. Line groups that
    cannot be parsed (process_utterance returns None) are skipped.
    """
    utterances, utt = [], []

    def flush(buffered):
        # Parse a buffered line group and keep it if it was a real utterance.
        if buffered:
            parsed = process_utterance(buffered)
            if parsed is not None:
                utterances.append(parsed)

    # `with` ensures the file handle is closed (the original leaked it).
    with stream.open("r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and "@" header/metadata lines.
            if not line or line.startswith("@"):
                continue
            if line.startswith("*"):
                # A new main tier starts a new utterance; process the
                # previously buffered one first.
                flush(utt)
                utt = []
            utt.append(line)
    # Bug fix: the original never processed the final buffered utterance,
    # silently dropping the last utterance of every file.
    flush(utt)

    # Sort by start time so downstream range scans can early-exit on `break`.
    utterances.sort(key=lambda u: u.start)
    return utterances
if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument("cha_file", type=Path, help="Path to CHILDES .cha transcript file")
    p.add_argument("annot_file", type=Path, help="Path to Hartshorne annotation file")
    args = p.parse_args()

    utterances = parse_cha_file(args.cha_file)
    events = pd.read_csv(args.annot_file)

    # Compiled once, outside the per-row mapping; raw strings fix the
    # original's invalid "\d" escapes in a non-raw literal.
    FULL_VIDEO_RE = re.compile(r"^Full Video - (\d+)-(\d+)$")
    RANGE_RE = re.compile(r"(\d+):(\d+)-(\d+):(\d+)")

    def extract_ts(t_string):
        """
        Compute millisecond bounds from Hartshorne annotations so that we can
        compare directly with the bounds in the CHILDES transcripts.
        """
        if t_string.startswith("Full Video"):
            # Inconsistent annotators: "Full Video - M-M" gives minutes only,
            # so seconds are taken as zero.
            t_start_m, t_end_m = FULL_VIDEO_RE.match(t_string).groups()
            t_start_s, t_end_s = "0", "0"
        else:
            t_start_m, t_start_s, t_end_m, t_end_s = \
                RANGE_RE.search(t_string).groups()
        return (int(t_start_m, 10) * 60 * 1000 + int(t_start_s, 10) * 1000,
                int(t_end_m, 10) * 60 * 1000 + int(t_end_s, 10) * 1000)

    events["t_start"], events["t_end"] = \
        zip(*events["Time"].map(extract_ts))

    # Extract all observed time ranges; for each range, attach to every event
    # row sharing it the noun and verb lemmas of all utterances whose start
    # falls inside the range.
    for (t_start, t_end), group in events.groupby(["t_start", "t_end"]):
        nouns, verbs = [], []
        for utt in utterances:
            if utt.start < t_start:
                continue
            elif utt.start > t_end:
                # Utterances are sorted by start, so no later one can match.
                break
            # => t_start <= utt.start <= t_end
            if utt.lemmas is not None and utt.tags is not None:
                nouns.extend(lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                             if tag == "n")
                verbs.extend(lemma for lemma, tag in zip(utt.lemmas, utt.tags)
                             if tag == "v")
        events.loc[group.index, "utt_nouns"] = " ".join(sorted(nouns))
        events.loc[group.index, "utt_verbs"] = " ".join(sorted(verbs))

    # Exclude events with no utterances in their time range.
    events = events[events.utt_verbs != ""]
    # Exclude events with null labels.
    events = events[events.Verb.notnull()]

    # How often does the glossed verb of an event appear within the utterances
    # of that event's time range?  Bug fix: test whole-lemma membership via
    # split() — the original substring test counted e.g. "go" inside "ago".
    print("Proportion of event rows whose gloss appears among the relevant utterances:")
    print(events.apply(lambda r: r.Verb.lower() in r.utt_verbs.split(), axis=1).mean())

    events.to_csv("test.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment