Last active
September 15, 2021 20:24
-
-
Save th0rgall/d627140fc072725d098363604d239923 to your computer and use it in GitHub Desktop.
Convert transcripts exported by Otter.ai to a format importable by Atlas TI v8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
author: Thor Galle <[email protected]> | |
original version: April 9, 2020 | |
last update: April 9, 2020 | |
A script to convert a .txt transcript exported from https://otter.ai/ to a format importable in Atlas TI v8. | |
Based on a similar script for oTranscribe I made on April 3, 2020. | |
USAGE | |
===== | |
A. Export your transcript from Otter.ai using these settings:
- Export format: .txt | |
- Include speaker names: yes | |
- Include timestamps: yes | |
- Merge same-speaker segments: no | |
- Export as monologue: no | |
B. Convert the file downloaded from Otter.ai with this script using the CLI:
python convert-otter.py <input_file> [output_file] | |
[output_file] defaults to output.txt
C. The output file should be importable in Atlas TI | |
EXAMPLE | |
======= | |
Sample text before (input) | |
-------------------------- | |
Thor Galle 0:49 | |
Okay, that's a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That's all | |
Unknown Speaker 1:02 | |
So | |
Sample text after (output) | |
-------------------------- | |
[00:00:49]Thor Galle: Okay, that’s a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That’s all | |
[00:01:02]Unknown Speaker: So | |
HOW DOES THIS WORK? | |
==================== | |
See this page to understand why I made this script, and why the output looks the way it does: | |
https://www.notion.so/thorgalle/Importing-plain-text-transcripts-into-Atlas-TI-3ee3c89ce0f94c2c9199e2e1711e57b4 | |
TODO | |
==== | |
- support for transcript of an hour or longer (00:00:00 format?) | |
- better support for double quotes (they're now converted to single quotes) | |
''' | |
import sys, re | |
# Typographic (curly) quote characters. Only the single quote is used by the
# replacement rules at the moment; the double-quote pair is kept for the
# planned double-quote support.
TYPOGRAPHIC_SINGLE_QUOTE = '’'
TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN = '“'
# This constant was previously (mis)named TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN as
# well, which silently overwrote the opening quote with the closing one.
TYPOGRAPHIC_DOUBLE_QUOTE_END = '”'
# Output path used when no [output_file] argument is given on the command line.
OUTPUT_DEFAULT = 'output.txt'
def getFileNames():
    """Read the input and output file names from the command line.

    Returns:
        A tuple ``(inputFile, outputFile)``. ``inputFile`` is ``None`` when no
        argument was passed; ``outputFile`` falls back to ``OUTPUT_DEFAULT``
        when no second argument was passed.
    """
    # Drop the script name so positions 0 and 1 are the two user arguments.
    cliArgs = sys.argv[1:]
    source = cliArgs[0] if len(cliArgs) >= 1 else None
    target = cliArgs[1] if len(cliArgs) >= 2 else OUTPUT_DEFAULT
    return (source, target)
def getReplaceTuples():
    """Build the rules for replacing text parts.

    Returns:
        A list of ``(compiled pattern, replacement)`` tuples, in the order in
        which they should be applied.
    """
    # TODO: straight double quotes should be converted to typographic double
    # quotes, but that requires more complex logic to detect the start/end of
    # a double quotation and to deal with edge cases. Until then they are
    # mapped to the typographic single quote.
    rules = (
        (r"'", TYPOGRAPHIC_SINGLE_QUOTE),
        (r'"', TYPOGRAPHIC_SINGLE_QUOTE),
        (r"\s?&\s?", r' and '),
    )
    return [(re.compile(pattern), replacement) for pattern, replacement in rules]
def flattenOtterTranscript(lines):
    """Collapse an Otter.ai transcript into one output line per segment.

    The input is expected to repeat in groups of three lines:
    ``"<speaker> <timestamp>"``, the spoken text, and an empty separator line.
    Each group becomes one string of the form ``"[HH:MM:SS]<speaker>: <text>"``.

    Args:
        lines: The transcript lines, already stripped of surrounding
            whitespace, without the trailing Otter footer.

    Returns:
        A list with one formatted string per transcript segment.

    Raises:
        ValueError: If a header line cannot be parsed into speaker + timestamp.
    """
    entries = []
    for index, line in enumerate(lines):
        lineType = index % 3
        if lineType == 0:  # name + timestamp
            parsed = getTimeStampAndPrefix(line)
            if parsed is None:
                # Fail with the offending line instead of an opaque
                # "not enough arguments for format string" TypeError.
                raise ValueError(
                    "couldn't parse header on transcript line %d: %r" % (index + 1, line)
                )
            timeStamp, speaker = parsed
            # Timestamps below one hour arrive as MM:SS and need an hour field;
            # those of an hour or longer already arrive as HH:MM:SS.
            if timeStamp.count(":") == 1:
                timeStamp = "00:" + timeStamp
            entries.append("[%s]%s: " % (timeStamp, speaker))
        elif lineType == 1:  # subtitle line: belongs to the header just added
            entries[-1] += replaceLine(line)
        # lineType == 2: empty separator line, left out
    return entries
def padTime(timeStr):
    """Zero-pad every single-character field of a colon-separated time.

    For example ``"2:01"`` becomes ``"02:01"``. Fields that are not exactly
    one character long are left untouched.
    """
    padded = []
    for piece in timeStr.split(":"):
        if len(piece) == 1:
            piece = "0" + piece
        padded.append(piece)
    return ":".join(padded)
def getTimeStampAndPrefix(line):
    """Split a transcript header line into its time stamp and speaker name.

    Args:
        line: A header such as ``"Thor Galle 0:49"``: a speaker name followed
            by whitespace and a ``M:SS``, ``MM:SS`` or ``H:MM:SS`` time stamp.

    Returns:
        A tuple ``(timestamp, speaker)`` with the time stamp zero-padded by
        ``padTime``, e.g. ``('00:49', 'Thor Galle')``.

    Raises:
        ValueError: If the line does not look like a header.
    """
    # The speaker name is matched lazily with `.` rather than `[\w\s]` so that
    # names containing punctuation ("Dr. Smith", "Mary-Jane", "O'Neil") parse.
    headerR = re.compile(r'^(.+?)\s+((?:\d{1,2}:)?\d{1,2}:\d{2})')
    match = headerR.match(line)
    if match is None:
        # Name the offending line; returning None here would only surface
        # later as an unrelated formatting error in the caller.
        raise ValueError("Error: couldn't parse line: %r" % (line,))
    return (padTime(match.group(2)), match.group(1))
def replaceLine(line):
    """Replace characters that cause problems for the Atlas TI importer.

    Every rule from ``getReplaceTuples()`` is applied to ``line`` in order, each
    one working on the result of the previous one.

    Returns:
        The line with all replacements applied.
    """
    cleaned = line
    for pattern, replacement in getReplaceTuples():
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
def main():
    """Convert the transcript named on the command line and write the result.

    Reads the input file, drops the trailing Otter footer, flattens the
    transcript with ``flattenOtterTranscript`` and writes one segment per line
    to the output file. Exits with a usage message when no input file is given.
    """
    (inputFile, outputFile) = getFileNames()
    if inputFile is None:
        # Without this, open(None) fails with an unhelpful TypeError.
        sys.exit("usage: python %s <input_file> [output_file]" % sys.argv[0])

    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding; "utf-8-sig" additionally tolerates a leading byte-order mark.
    with open(inputFile, encoding="utf-8-sig") as f:
        content = [line.strip() for line in f]

    ## prune last two lines with the irregular "Transcribed by https://otter.ai"
    content = flattenOtterTranscript(content[:-2])

    # The output contains typographic quotes, so fix its encoding as well.
    with open(outputFile, 'w', encoding="utf-8") as fo:
        fo.writelines("%s\n" % entry for entry in content)


# Only run the conversion when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment