-
-
Save tomdavidson/f988be9cfab248ef09804552d35ec2bb to your computer and use it in GitHub Desktop.
Google Voice Takeout Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python script to parse Google Voice Takeout messages into a CSV file.
# Input and output locations are given as command-line arguments.
import argparse
import os

import pandas as pd
from bs4 import BeautifulSoup
# Build the command-line interface and read the caller's arguments.
ap = argparse.ArgumentParser()
ap.add_argument(
    "-i",
    "--input",
    required=True,
    help="path to exported Google Voice directory (has 'Voice' folder inside it)",
)
ap.add_argument(
    "-o",
    "--output",
    required=False,
    help="path to folder where CSV file will be saved. Google Voice directory is used if not supplied",
)
args = vars(ap.parse_args())

# Resolve the working directories. The export folder falls back to the
# Takeout directory whenever --output is missing or empty.
takeout_dir = args["input"]
exp_dir = args["output"] if args["output"] else takeout_dir
tko_calls_dir = os.path.join(takeout_dir, 'Voice', 'Calls')

# Module-level accumulator that the parsing code appends rows to.
message_df = pd.DataFrame(
    columns=['Date', 'Time', 'From', 'Content', 'Type', 'Date (UTC)'],
)
def get_html_files(calls_dir):
    """Return the paths of all ``.html`` files directly inside *calls_dir*.

    Args:
        calls_dir: Directory to scan (not recursive).

    Returns:
        A list of paths, each formed by joining *calls_dir* with an entry
        whose name ends in ``.html``. Entries are sorted by name so that the
        processing order is reproducible; ``os.listdir`` on its own returns
        entries in arbitrary order.

    Raises:
        OSError: If *calls_dir* cannot be listed (propagated from
            ``os.listdir``).
    """
    return [
        os.path.join(calls_dir, entry)
        for entry in sorted(os.listdir(calls_dir))
        if entry.endswith(".html")
    ]
def parse_file(file_path):
    """Parse one Takeout HTML log and append its entries to ``message_df``.

    The file name (without extension) is split on ``' - '`` to obtain the
    contact, the record type and the UTC timestamp:

    * ``Group Conversation - <utc>``: type ``'Text'``, empty contact.
    * ``<contact> - <utc>``: type recorded as ``'Unknown'``.
    * ``<contact> - <type> - <utc>``: the usual form. When the contact itself
      contains ``' - '`` there are more parts; the last two are always the
      type and the timestamp, everything before them is the contact.

    Files typed ``'Text'`` are read as message threads (``div.message``);
    every other type is read as a call/voicemail record (``div.haudio``).
    Entries that cannot be parsed are reported and skipped.

    Args:
        file_path: Path to an ``.html`` log file.

    Returns:
        None. Rows are appended in place to the module-level ``message_df``.
    """
    print(f"Processing file: {file_path}")
    basename, _ = os.path.splitext(os.path.basename(file_path))
    parts = basename.split(' - ')

    if len(parts) < 2:
        # Not a "<contact> - [<type> - ]<utc>" name. Unpacking such a name
        # used to raise and abort the whole run, so skip the file instead.
        print(f"Skipping file with unrecognised name: {file_path}")
        return

    if parts[0] == 'Group Conversation':
        msg_type = 'Text'
        utc_date = parts[1]
        number = ''  # the file name carries no single contact for a group
    elif len(parts) == 2:  # No message type in the name
        number, utc_date = parts
        msg_type = 'Unknown'
    else:
        # Three or more parts: anything before the last two belongs to the
        # contact, which may itself contain ' - '.
        *contact_parts, msg_type, utc_date = parts
        number = ' - '.join(contact_parts)

    # Read as bytes inside a context manager: the handle is always closed and
    # BeautifulSoup works out the document encoding itself rather than
    # depending on the platform's default text encoding.
    with open(file_path, 'rb') as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    def split_timestamp(entry):
        """Return ``(date, time)`` taken from the first ``<abbr>`` of *entry*."""
        fields = entry.find_all('abbr')[0].text.split(',')
        # The first two comma-separated fields form the date; the third holds
        # the time, minus its leading character and anything after a newline.
        return fields[0] + fields[1], fields[2].split('\n')[0][1:]

    def add_row(date_part, time_part, content):
        """Append one row to ``message_df`` for the current file."""
        message_df.loc[len(message_df)] = [
            date_part,
            time_part,
            number,
            content,
            msg_type,
            utc_date,
        ]

    def parse_sms_file(document):
        """Append one row per ``div.message`` element in *document*."""
        for message in document.find_all('div', {'class': 'message'}):
            try:
                date_part, time_part = split_timestamp(message)
                add_row(date_part, time_part, message.find_all('q')[0].text)
            except Exception as e:  # best effort: report and skip the entry
                print(f'File {file_path} encountered: {e}')

    def parse_voice_file(document):
        """Append one row per ``div.haudio`` element in *document*."""
        for record in document.find_all('div', {'class': 'haudio'}):
            try:
                date_part, time_part = split_timestamp(record)
                if msg_type == 'Voicemail':
                    try:
                        content = "Transcription: " + record.find_all(
                            "span", class_="full-text")[0].text
                    except Exception as e:
                        content = "NO TRANSCRIPTION. SEE AUDIO FILE"
                        print("Proabably no transcription:", file_path, e)
                else:
                    try:
                        # The second <abbr> supplies the duration text.
                        content = ("Call Duration: "
                                   + record.find_all('abbr')[1].text[1:][:8])
                    except (IndexError, AttributeError):
                        content = ''
                add_row(date_part, time_part, content)
            except Exception as e:  # best effort: report and skip the entry
                print(f'File {file_path} encountered: {e}')

    if msg_type == 'Text':
        parse_sms_file(soup)
    else:
        parse_voice_file(soup)
def save_df(df):
    """Write *df* to ``Message_Export.csv`` inside the export directory.

    The destination folder (module-level ``exp_dir``) is created first when
    it does not exist, so an output path that has not been created yet no
    longer makes the write fail after all files have been parsed.

    Args:
        df: DataFrame to serialise; it is written without its index column.

    Returns:
        None. The CSV is written to disk and its path is printed.
    """
    os.makedirs(exp_dir, exist_ok=True)
    save_path = os.path.join(exp_dir, 'Message_Export.csv')
    df.to_csv(save_path, index=False)
    print(f'Message DF saved: {save_path}')
def main():
    """Parse every HTML log in the Calls directory, then write the CSV."""
    for html_path in get_html_files(tko_calls_dir):
        parse_file(html_path)
    save_df(message_df)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment