Created
February 4, 2022 01:53
-
-
Save SandNerd/a9de1f96fb8e047332fbfe23535f460f to your computer and use it in GitHub Desktop.
Google Voice Takeout Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python script to parse google voice takeout messages into csv
# Reads every HTML call log under <input>/Voice/Calls and accumulates rows
# into message_df, which main() finally writes out as Message_Export.csv.
from bs4 import BeautifulSoup
import pandas as pd
import os
import argparse
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True,
                help="path to exported Google Voice directory (has 'Voice' folder inside it)")
ap.add_argument("-o", "--output", required=False,
                help="path to folder where CSV file will be saved. Google Voice directory is used if not supplied")
args = vars(ap.parse_args())
takeout_dir = args["input"]
# Fall back to the takeout directory when no output folder was supplied.
exp_dir = args["output"] or takeout_dir
# Google Takeout places all per-conversation HTML files under Voice/Calls.
tko_calls_dir = os.path.join(takeout_dir, 'Voice', 'Calls')
# Global accumulator: every parsed message becomes one row appended here.
message_df = pd.DataFrame(
    columns=['Date', 'Time', 'From', 'Content', 'Type', 'Date (UTC)'])
# Function to get all text log files | |
def get_html_files(calls_dir):
    """Return the full paths of all ``.html`` call-log files in *calls_dir*.

    Order follows ``os.listdir`` (arbitrary but stable for a given run),
    matching the original implementation. Non-HTML entries are skipped.
    """
    # Comprehension replaces the manual append loop with a dead `else: continue`.
    return [
        os.path.join(calls_dir, name)
        for name in os.listdir(calls_dir)
        if name.endswith(".html")
    ]
# Function to process file | |
def parse_file(file_path):
    """Parse one Google Voice HTML log and append its messages to message_df.

    File names normally follow ``<number> - <type> - <utc date>.html``;
    group conversations are named ``Group Conversation - <utc date>.html``
    (no phone number segment), which crashed the original unpacking.
    Files with any other name shape are skipped with a warning.
    """
    basename, _extension = os.path.splitext(os.path.basename(file_path))
    parts = basename.split(' - ')
    if len(parts) == 3:
        number, msg_type, utc_date = parts
    elif len(parts) == 2:
        # e.g. "Group Conversation - 2020-09-07T16_25_43Z": treat the first
        # segment as the sender label and assume these are text threads.
        # TODO(review): confirm against real group-conversation takeout files.
        number, utc_date = parts
        msg_type = 'Text'
    else:
        print('Skipping unrecognized file name: {}'.format(file_path))
        return
    # Use a context manager so the file handle is closed (original leaked it).
    with open(file_path) as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    def parse_sms_file(soup):
        # Each 'message' div holds one SMS: timestamp in the first <abbr>,
        # message body in the first <q>.
        for msg in soup.find_all('div', {'class': 'message'}):
            try:
                stamp = msg.find_all('abbr')[0].text
                stamp_parts = stamp.split(',')
                iter_date = stamp_parts[0] + stamp_parts[1]
                # Strip the leading space after the second comma, drop trailing newline text.
                iter_time = stamp_parts[2].split('\n')[0][1:]
                iter_message = msg.find_all('q')[0].text
                message_df.loc[len(message_df)] = [
                    iter_date,
                    iter_time,
                    number,
                    iter_message,
                    msg_type,
                    utc_date,
                ]
            except Exception as e:
                # Malformed entry: report and keep processing the rest.
                print(e)
                continue

    def parse_voice_file(soup):
        # Each 'haudio' div is one call/voicemail: timestamp in abbr[0],
        # duration in abbr[1], optional transcription in span.full-text.
        for msg in soup.find_all('div', {'class': 'haudio'}):
            try:
                stamp = msg.find_all('abbr')[0].text
                stamp_parts = stamp.split(',')
                iter_date = stamp_parts[0] + stamp_parts[1]
                iter_time = stamp_parts[2].split('\n')[0][1:]
                if msg_type == 'Voicemail':
                    try:
                        iter_message = "Transcription: " + msg.find_all(
                            "span", class_="full-text")[0].text
                    except Exception as e:
                        iter_message = "NO TRANSCRIPTION. SEE AUDIO FILE"
                        # Typo fixed ("Proabably" -> "Probably").
                        print("Probably no transcription:", file_path, e)
                else:
                    try:
                        # abbr[1] text looks like "(HH:MM:SS)"; take the 8 chars
                        # after the opening parenthesis.
                        iter_message = "Call Duration: " + msg.find_all('abbr')[1].text[1:][:8]
                    except Exception:  # was a bare except, which also swallowed SystemExit
                        iter_message = ''
                message_df.loc[len(message_df)] = [
                    iter_date,
                    iter_time,
                    number,
                    iter_message,
                    msg_type,
                    utc_date,
                ]
            except Exception as e:
                # Bug fix: the original passed the template and args to print()
                # without calling .format, printing literal "{}" placeholders.
                print('File {} encountered: {}'.format(file_path, e))
                continue

    if msg_type == 'Text':
        parse_sms_file(soup)
    else:
        parse_voice_file(soup)
# Function to save dataframe | |
def save_df(df):
    """Write *df* to ``Message_Export.csv`` in the export directory and report the path."""
    destination = os.path.join(exp_dir, 'Message_Export.csv')
    df.to_csv(destination, index=False)
    print('Message DF saved: {}'.format(destination))
# Main function to process all files | |
def main():
    """Parse every call-log HTML file in the takeout, then export the combined CSV."""
    for html_file in get_html_files(tko_calls_dir):
        parse_file(html_file)
    save_df(message_df)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Did you have takeout files such as
Group Conversation - 2020-09-07T16_25_43Z.html