Last active
January 12, 2022 00:54
-
-
Save naptar/7c67bd755632f7a0e5a0990424875d5b to your computer and use it in GitHub Desktop.
Convert/Parse Google Takeout/Export Data Hangouts/Chat into individual conversations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run this in the same directory as the Hangouts.json file generated by Google Takeout / Data Export tool.
# python3 hangouts.py
import json | |
import datetime | |
import os | |
import shutil | |
import re | |
# Directory (relative to the current working directory) that receives one
# text file per conversation.
chat_dir = "hangouts"

# Never silently clobber output from an earlier run: ask first, and stop if the
# user declines.
if os.path.exists(chat_dir):
    print("Hangouts directory already exists. Should I remove it before proceeding? y/n")
    # strip() so an answer such as "y " (trailing space) is still accepted.
    if input().strip().lower() == "y":
        print("Removing the hangouts directory.")
        shutil.rmtree(chat_dir)
    else:
        print("Exiting.")
        # quit() is a convenience injected by the site module and is not
        # guaranteed to exist (e.g. under "python -S"); raising SystemExit is
        # always available and exits with the same status.
        raise SystemExit
class User:
    """A conversation participant: an identifier plus a display name."""

    def __init__(self, id, name):
        self.id, self.name = id, name

    def getId(self):
        """Return the identifier given at construction time."""
        return self.id

    def getName(self):
        """Return the current display name."""
        return self.name

    def setName(self, name):
        """Replace the display name."""
        self.name = name
class Message:
    """A single chat message with a display-ready local timestamp.

    Parameters:
        id: event id of the message.
        sender_id: id of the sending participant.
        sender_name: display name of the sender.
        timestamp: microseconds since 1970-01-01 (anything int() accepts).
            The arithmetic below is relative to the Unix epoch.
        text: message body.
        utc_offset_hours: hours added to the timestamp for display. Defaults
            to 2, the offset that used to be hard-coded here.

    Raises:
        ValueError / TypeError: if ``timestamp`` cannot be converted by int().
    """

    def __init__(self, id, sender_id, sender_name, timestamp, text, utc_offset_hours=2):
        self.id = id
        self.sender_id = sender_id
        self.sender_name = sender_name
        epoch = datetime.datetime(1970, 1, 1)
        # Naive datetime: epoch + elapsed microseconds, shifted to display time.
        self.timestamp = (
            epoch
            + datetime.timedelta(microseconds=int(timestamp))
            + datetime.timedelta(hours=utc_offset_hours)
        )
        self.text = text

    def display(self):
        """Return the message as ``[YYYY-MM-DD HH:MM:SS] sender: text``."""
        return "[{}] {}: {}".format(
            self.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            self.sender_name,
            self.text,
        )
class ConversationSet:
    """All conversations found in an export, keyed by conversation id."""

    def __init__(self):
        # conversation id -> Conversation
        self.conversations = dict()

    def getParticipantNameById(self, id):
        """Return a known, non-empty name for participant ``id``.

        Every conversation is searched. Entries whose name is empty are
        skipped, because participants registered without a "fallback_name" are
        stored with "" and would otherwise hide a real name recorded in
        another conversation. Returns "Unknown" when no name is found.
        """
        for conversation in self.conversations.values():
            participant = conversation.getParticipantById(id)
            if participant is not None and participant.name:
                return participant.name
        return "Unknown"

    def addConversationParticipants(self, id, json_participant_data):
        """Create conversation ``id`` and register its participants.

        Each entry of ``json_participant_data`` must provide
        ``["id"]["gaia_id"]``; "fallback_name" is optional and defaults to "".
        An existing conversation stored under the same id is replaced.
        """
        conversation = Conversation(id)
        self.conversations[id] = conversation
        for participant in json_participant_data:
            conversation.addParticipant(
                participant["id"]["gaia_id"],
                participant.get("fallback_name", ""),
            )

    def addConversationEvents(self, id, json_participant_data, json_event_data):
        """Fill in participant names and append the chat messages of ``id``.

        Participants lacking a "fallback_name" get whatever name is known for
        them in any conversation. Only events of type "REGULAR_CHAT_MESSAGE"
        are stored; their text segments are concatenated.

        Raises:
            KeyError: if conversation ``id`` was never registered, or an event
                lacks one of the required keys.
        """
        conversation = self.conversations[id]

        for participant in json_participant_data:
            p_id = participant["id"]["gaia_id"]
            if "fallback_name" in participant:
                name = participant["fallback_name"]
            else:
                name = self.getParticipantNameById(p_id)
            conversation.setParticipantName(p_id, name)

        for event in json_event_data:
            if event["event_type"] != "REGULAR_CHAT_MESSAGE":
                continue
            content = event["chat_message"]["message_content"]
            # A message may have no segments at all, and a segment may carry no
            # "text" key; both contribute nothing to the message body.
            text = "".join(
                segment.get("text", "") for segment in content.get("segment", [])
            )
            sender_id = event["sender_id"]["gaia_id"]
            sender = conversation.getParticipantById(sender_id)
            if sender is not None:
                sender_name = sender.name
            else:
                # The sender is not listed among this conversation's
                # participants; look the name up elsewhere instead of failing
                # on None.name.
                sender_name = self.getParticipantNameById(sender_id)
            conversation.addMessage(
                event["event_id"],
                sender_id,
                sender_name,
                event["timestamp"],
                text,
            )

    def getConversations(self):
        """Return a new list holding every stored conversation."""
        return list(self.conversations.values())

    def getConversationById(self, id):
        """Return the conversation whose ``id`` attribute equals ``id``, or None."""
        conversation = self.conversations.get(id)
        if conversation is not None and conversation.id == id:
            return conversation
        # Fallback scan keeps lookups working if an entry was stored under a
        # key that differs from its own id.
        for conversation in self.conversations.values():
            if conversation.id == id:
                return conversation
        return None
class Conversation:
    """One chat thread: its id, its participants and its messages in insertion order."""

    def __init__(self, id):
        self.id = id
        self.participants = dict()  # participant id -> User
        self.messages = []

    def getId(self):
        """Return the conversation id."""
        return self.id

    def participantCount(self):
        """Return how many participants are registered."""
        return len(self.participants)

    def getParticipants(self):
        """Return a new list of the registered User objects."""
        return [self.participants[key] for key in self.participants]

    def getParticipantById(self, id):
        """Return the User registered under ``id``, or None when there is none."""
        return self.participants.get(id)

    def addParticipant(self, id, name):
        """Register a participant; an id that is already present is left untouched."""
        if id in self.participants:
            return
        self.participants[id] = User(id, name)

    def setParticipantName(self, id, name):
        """Rename the participant stored under ``id``; unknown ids are ignored."""
        try:
            user = self.participants[id]
        except KeyError:
            return
        user.setName(name)

    def getMessages(self):
        """Return the internal message list (not a copy)."""
        return self.messages

    def addMessage(self, id, sender_id, sender_name, timestamp, text):
        """Build a Message from the arguments and append it to the thread."""
        message = Message(id, sender_id, sender_name, timestamp, text)
        self.messages.append(message)
def get_valid_filename(s):
    """Turn ``s`` into a string that is safe to use as a file name.

    ``s`` is converted with str(), surrounding whitespace is trimmed, each
    space becomes an underscore, and every character that is not a word
    character, a hyphen or a dot is removed. The result may be empty.
    """
    # Adapted from Django:
    # https://github.com/django/django/blob/master/django/utils/text.py#L218
    trimmed = str(s).strip()
    underscored = "_".join(trimmed.split(" "))
    return re.sub(r'(?u)[^-\w.]', '', underscored)
# Load the export. The encoding is given explicitly: without it, open() uses
# the platform default, which fails with UnicodeDecodeError on non-ASCII
# message text when that default is ASCII.
print("Processing Hangouts.json ..")
with open('Hangouts.json', 'r', encoding='utf-8') as f:
    hangouts_dict = json.load(f)

conversations = ConversationSet()

# Pass 1: register every conversation with its participants, so that names are
# known across all conversations before any message is processed.
for hangout in hangouts_dict["conversations"]:
    if "conversation" in hangout:
        conversations.addConversationParticipants(
            hangout["conversation"]["conversation_id"]["id"],
            hangout["conversation"]["conversation"]["participant_data"]
        )

# Pass 2: resolve missing participant names and collect the chat messages.
for hangout in hangouts_dict["conversations"]:
    if "conversation" in hangout:
        conversations.addConversationEvents(
            hangout["conversation"]["conversation_id"]["id"],
            hangout["conversation"]["conversation"]["participant_data"],
            hangout["events"]
        )

os.makedirs(chat_dir)

# Write one text file per conversation, named after its participants.
for c in conversations.getConversations():
    participants = [p.name for p in c.getParticipants()]
    if len(participants) > 1:
        f_name = " and ".join(participants)
    elif participants:
        f_name = participants[0]
    else:
        f_name = "-"

    # Several conversations can involve the same people. Append _2, _3, ...
    # until the name is free so no earlier file is overwritten.
    base_name = f_name
    suffix = 2
    out_path = os.path.join(chat_dir, get_valid_filename(f_name) + ".txt")
    while os.path.isfile(out_path):
        f_name = base_name + "_" + str(suffix)
        out_path = os.path.join(chat_dir, get_valid_filename(f_name) + ".txt")
        suffix += 1

    # UTF-8 so non-ASCII messages can be written on any platform; the context
    # manager closes the file even if a write fails.
    with open(out_path, "w", encoding="utf-8") as c_file:
        for m in c.getMessages():
            c_file.write(m.display() + "\n")

print("Done. Check the hangouts directory for chat output files.")
Hi there @kroy1200. I think I got this script from another place and put it here for safekeeping, though I don't think I wrote it myself. It could be that the format of the JSON file has changed, and this script no longer works, since this was uploaded in 2018.
I don't know what I am doing wrong. A few months back it worked and now it doesn't. Anyway, thanks for the help. Much appreciated.
Hmm, I'm not sure. Sorry that I can't be of much help - I am out of touch with regards to hangouts/takeout and that sort of thing. From your error it seems to be that there is an invalid data field in the JSON file. You could open it manually and then navigate to position 7578 to see what it is.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Traceback (most recent call last):
File "/Users/Downloads/Takeout-2/Hangouts/hangouts.py", line 148, in <module>
hangouts_dict = json.load(f)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/__init__.py", line 265, in load
return loads(fp.read(),
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 7578: ordinal not in range(128)
having this error @fallenby