Skip to content

Instantly share code, notes, and snippets.

@SandNerd
Created February 4, 2022 01:53
Show Gist options
  • Save SandNerd/a9de1f96fb8e047332fbfe23535f460f to your computer and use it in GitHub Desktop.
Save SandNerd/a9de1f96fb8e047332fbfe23535f460f to your computer and use it in GitHub Desktop.
Google Voice Takeout Parser
# Python script to parse Google Voice Takeout messages into a CSV file.
# Command-line arguments: -i/--input (required), -o/--output (optional).
from bs4 import BeautifulSoup
import pandas as pd
import os
import argparse
# Command-line interface: where the takeout export lives and where the CSV goes.
ap = argparse.ArgumentParser()
ap.add_argument(
    "-i",
    "--input",
    required=True,
    help="path to exported Google Voice directory (has 'Voice' folder inside it)",
)
ap.add_argument(
    "-o",
    "--output",
    required=False,
    help="path to folder where CSV file will be saved. Google Voice directory is used if not supplied",
)
args = vars(ap.parse_args())

takeout_dir = args["input"]
# Fall back to the takeout directory when no (or an empty) output folder is given.
exp_dir = args["output"] if args["output"] else takeout_dir
tko_calls_dir = os.path.join(takeout_dir, 'Voice', 'Calls')

# Module-level accumulator; the parsing code appends one row per parsed entry.
message_df = pd.DataFrame(
    columns=['Date', 'Time', 'From', 'Content', 'Type', 'Date (UTC)'],
)
# Function to get all text log files
def get_html_files(calls_dir):
    """Return the paths of all ``.html`` files directly inside *calls_dir*.

    Args:
        calls_dir: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(calls_dir, name)`` paths, sorted so that the
        processing order (and therefore the CSV row order) is the same on
        every run; ``os.listdir`` itself returns entries in arbitrary order.

    Raises:
        OSError: If *calls_dir* cannot be listed (propagated from
            ``os.listdir``).
    """
    return sorted(
        os.path.join(calls_dir, name)
        for name in os.listdir(calls_dir)
        if name.endswith(".html")
    )
def _split_timestamp(stamp_text):
    """Split the text of a timestamp ``<abbr>`` into ``(date, time)`` strings.

    The text is split on commas: the first two pieces are concatenated to
    form the date, and the third piece (up to the first newline, minus its
    leading character) is the time.

    Raises:
        IndexError: If the text has fewer than three comma-separated pieces.
    """
    pieces = stamp_text.split(',')
    date_part = pieces[0] + pieces[1]
    time_part = pieces[2].split('\n')[0][1:]
    return date_part, time_part


# Function to process file
def parse_file(file_path):
    """Parse one takeout HTML file and append its entries to ``message_df``.

    The file name (without extension) is expected to look like
    ``<number> - <type> - <UTC date>``. Files whose type is ``Text`` are
    parsed as SMS logs (``div.message`` elements); every other type is parsed
    as a call/voicemail log (``div.haudio`` elements).

    Files whose name does not have that three-part shape (for example
    ``Group Conversation - <date>.html``) are reported and skipped instead of
    aborting the run.

    Args:
        file_path: Path of the HTML file to parse.

    Returns:
        None. Rows are appended to the module-level ``message_df``.
    """
    basename = os.path.splitext(os.path.basename(file_path))[0]
    # Split from the right so a ' - ' inside the leading <number>/name part
    # does not break the unpacking.
    name_parts = basename.rsplit(' - ', 2)
    if len(name_parts) != 3:
        print('Skipping {}: file name is not "<number> - <type> - <date>"'.format(
            file_path))
        return
    number, msg_type, utc_date = name_parts

    # Read as bytes and let BeautifulSoup detect the encoding, rather than
    # decoding with the platform's default locale; the handle is closed as
    # soon as the document has been parsed.
    with open(file_path, 'rb') as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    def add_row(date, time, content):
        # Column order matches message_df:
        # Date, Time, From, Content, Type, Date (UTC).
        message_df.loc[len(message_df)] = [
            date,
            time,
            number,
            content,
            msg_type,
            utc_date,
        ]

    def parse_sms_file(soup):
        for message in soup.find_all('div', {'class': 'message'}):
            try:
                iter_date, iter_time = _split_timestamp(
                    message.find_all('abbr')[0].text)
                iter_message = message.find_all('q')[0].text
                add_row(iter_date, iter_time, iter_message)
            except Exception as e:
                # Best effort: report the malformed entry and keep going.
                print('File {} encountered: {}'.format(file_path, e))

    def parse_voice_file(soup):
        for entry in soup.find_all('div', {'class': 'haudio'}):
            try:
                iter_date, iter_time = _split_timestamp(
                    entry.find_all('abbr')[0].text)
                if msg_type == 'Voicemail':
                    try:
                        iter_message = "Transcription: " + entry.find_all(
                            "span", class_="full-text")[0].text
                    except Exception as e:
                        iter_message = "NO TRANSCRIPTION. SEE AUDIO FILE"
                        print("Proabably no transcription:", file_path, e)
                else:
                    try:
                        # Second <abbr>: drop the first character and keep at
                        # most the next eight (presumably an hh:mm:ss
                        # duration after an opening bracket -- TODO confirm).
                        iter_message = "Call Duration: " + entry.find_all(
                            'abbr')[1].text[1:][:8]
                    except Exception:
                        # No second <abbr>: record the entry with empty content.
                        iter_message = ''
                add_row(iter_date, iter_time, iter_message)
            except Exception as e:
                # Best effort: report the malformed entry and keep going.
                print('File {} encountered: {}'.format(file_path, e))

    if msg_type == 'Text':
        parse_sms_file(soup)
    else:
        parse_voice_file(soup)
# Function to save dataframe
def save_df(df):
    """Write *df* to ``Message_Export.csv`` inside the export directory.

    The export directory is the module-level ``exp_dir``. It is created if it
    does not exist yet, so a missing output folder does not discard the
    already-parsed data at the very last step.

    Args:
        df: DataFrame to write; the index is not written to the file.

    Returns:
        None. The path of the saved file is printed.
    """
    if exp_dir:
        # An empty exp_dir means "current directory"; makedirs('') would fail.
        os.makedirs(exp_dir, exist_ok=True)
    save_path = os.path.join(exp_dir, 'Message_Export.csv')
    df.to_csv(save_path, index=False)
    print('Message DF saved: {}'.format(save_path))
# Main function to process all files
def main():
    """Parse every HTML file in the takeout calls directory and save the CSV.

    Files are discovered with ``get_html_files(tko_calls_dir)`` and parsed
    one by one with ``parse_file``; the accumulated ``message_df`` is then
    written with ``save_df``. A file that raises while being parsed is
    reported and skipped, so a single bad file cannot abort the export
    before anything has been saved.
    """
    html_files = get_html_files(tko_calls_dir)
    for file in html_files:
        try:
            parse_file(file)
        except Exception as e:
            # Top-level boundary: log the failing file and continue with the rest.
            print('Skipping {}: {}'.format(file, e))
    save_df(message_df)


if __name__ == '__main__':
    main()
@tomdavidson
Copy link

Did you have takeout files such as `Group Conversation - 2020-09-07T16_25_43Z.html`?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment