Last active
June 17, 2021 00:16
-
-
Save stuhlmueller/de54e8d3f4d8522d1095dc5f05aa36f3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Converts a directory with Rev.com transcripts into the dataset format required by Elicit.

Format for transcripts (the header and every speaker turn are separated by a
blank line):

    https://www.rev.com/transcript-editor/Edit?token=12312

    Jane Doe:
    Good morning. Thank you everyone for joining our webinar. My name is Jon Doe.

    Jon Doe:
    Hi. My name is Jon Doe.

This is the usual Rev.com export format except the first line contains the URL of the transcript.
"""

import glob
import os

transcripts_path = "./transcripts/*.txt"


def parse_transcript(text):
    """Split the text of one transcript into its source URL and messages.

    The text is cut into chunks at blank lines. The first chunk is the header
    (its first line is the transcript URL) and is not treated as a message.
    Every later chunk must start with a ``Speaker Name:`` line; each non-empty
    line after it becomes one message attributed to that speaker.

    Args:
        text: Full contents of a transcript file.

    Returns:
        A ``(source_url, entries)`` tuple where ``entries`` is a list of
        ``(participant, content)`` tuples in document order.

    Raises:
        ValueError: If a chunk's first line does not end with ``:``.
    """
    source_url = text.split("\n", 1)[0].strip()
    entries = []
    for chunk in text.split("\n\n")[1:]:
        # Three or more consecutive newlines leave a stray leading "\n" on the
        # next chunk (or produce an empty chunk). Strip first so that the
        # speaker turn is parsed instead of being silently dropped.
        chunk = chunk.strip()
        if not chunk:
            continue
        lines = chunk.split("\n")
        speaker_line = lines[0].strip()
        # Raise explicitly: an `assert` would be removed under `python -O`.
        if not speaker_line.endswith(":"):
            raise ValueError(
                f"expected a speaker line ending with ':', got {speaker_line!r}"
            )
        participant = speaker_line[:-1]
        for line in lines[1:]:
            content = line.strip()
            if content:
                entries.append((participant, content))
    return source_url, entries


def format_entry(content, participant, title, source_url):
    """Return one dataset record; the leading newline separates records."""
    return (
        f"\nMessage: {content}\n"
        f"Participant: {participant}\n"
        f"Source: [{title}]({source_url})"
    )


def convert_file(filename):
    """Print the dataset records for the transcript stored at ``filename``.

    The record title is the file name without its extension.

    Raises:
        ValueError: If the transcript is malformed; the message is prefixed
            with ``filename`` so the offending file can be found.
    """
    # Pin the encoding so decoding does not depend on the platform locale.
    with open(filename, encoding="utf-8") as file:
        text = file.read()
    title = os.path.splitext(os.path.basename(filename))[0]
    try:
        source_url, entries = parse_transcript(text)
    except ValueError as err:
        raise ValueError(f"{filename}: {err}") from err
    for participant, content in entries:
        print(format_entry(content, participant, title, source_url))


def main():
    """Convert every transcript matching ``transcripts_path``."""
    # glob returns files in arbitrary order; sort for reproducible output.
    for filename in sorted(glob.glob(transcripts_path)):
        convert_file(filename)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment