Last active
February 2, 2021 17:11
-
-
Save jsharkey13/d60b7b421e08c98d426d03c39f8b4a12 to your computer and use it in GitHub Desktop.
Facebook Parser (Python 3) (2018-09-09)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2018 James Sharkey (https://github.com/jsharkey13/facebook_message_parser) | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import datetime | |
import matplotlib.pyplot as plt | |
from matplotlib.dates import date2num, num2date | |
from matplotlib import ticker | |
import matplotlib | |
import re | |
# ============================================================================= | |
# Top N Most Messaged People # | |
# # | |
# Public Functions: # | |
# - top_n_people(Chat, N, count_type, groups) # | |
# # | |
# ============================================================================= | |
# Every 'count_type' value that top_n_people() has a dedicated branch for:
_COUNT_TYPES = [
    "total",
    "to",
    "from",
    "allfrom",
    "words",
    "wordsfrom",
    "wordsto",
    "chars",
    "charsfrom",
    "charsto",
]
def _update_thread_dict(thread_dict, thread_name, num):
    """Accumulate 'num' under 'thread_name' in the count dictionary.

    A name seen for the first time is stored with 'num' as its count; a name
    that is already present has 'num' added to its running total, so duplicate
    thread names do not overwrite earlier entries."""
    try:
        thread_dict[thread_name] += num
    except KeyError:
        # First time this name has been seen: start its count.
        thread_dict[thread_name] = num
def top_n_people(Chat, N=-1, count_type="total", groups=False):
    """Return a list of the top N most messaged people.

    The list contains (name, count) tuples sorted by count, largest first.
    A negative or zero value for N returns the full list; this is the default.
    The optional argument 'groups' allows group conversations (threads whose
    name contains more than one ", "-separated person) to be included.

    The 'count_type' argument selects what is counted:
    - "total" - the default. The total number of messages in each thread.
    - "to" - the number of messages in each thread sent by the current
      user: '_owner'.
    - "from" - the number of messages in each thread not sent by '_owner'.
    - "allfrom" - the number of messages from each individual person across
      all threads, as reported by Chat.all_from().
    - "words", "wordsfrom", "wordsto" - the number of whitespace-separated
      words in each thread: in all messages, in messages not sent by
      '_owner', or in messages sent by '_owner' respectively.
    - "chars", "charsfrom", "charsto" - as for the word counts, but summing
      len(message) instead.
    Any other value is treated as "total".

    Threads sharing the same 'people_str' have their counts added together."""
    owner = Chat._owner

    def word_count(message):
        # A message may have no text at all (text is None); count it as zero words.
        return len(re.findall(r'\S+', message.text or ""))  # Matches any non-whitespace sub-string

    # How to score a single thread for each per-thread count type. String keys
    # are looked up by equality, so any equal string selects the right counter
    # (the previous identity test with 'is' only worked for interned strings).
    thread_counters = {
        "to": lambda t: len(t.by(owner)),
        "from": lambda t: len(t) - len(t.by(owner)),
        "words": lambda t: sum(word_count(m) for m in t.messages),
        "wordsfrom": lambda t: sum(word_count(m) for m in t.messages if not m.sent_by(owner)),
        "wordsto": lambda t: sum(word_count(m) for m in t.messages if m.sent_by(owner)),
        "chars": lambda t: sum(len(m) for m in t.messages),
        "charsfrom": lambda t: sum(len(m) for m in t.messages if not m.sent_by(owner)),
        "charsto": lambda t: sum(len(m) for m in t.messages if m.sent_by(owner)),
    }

    counts = {}
    if count_type == "allfrom":
        # Count all messages in all threads received from each person.
        # Work on a copy so '_owner' is not removed from the original collection:
        people = Chat._all_people.copy()
        people.remove(owner)
        for person in people:
            counts[person] = len(Chat.all_from(person))
    else:
        # Unknown count types fall back to the default: the length of the thread.
        count_thread = thread_counters.get(count_type, len)
        for thread in Chat.threads:
            name = thread.people_str
            # Add to any existing entry so duplicate names are not overwritten:
            counts[name] = counts.get(name, 0) + count_thread(thread)

    ranked = sorted(counts.items(), key=lambda tup: tup[1], reverse=True)
    top_n = []
    for name, count in ranked:
        if N > 0 and len(top_n) >= N:
            break
        # A name without ", " belongs to a single person; groups are opt-in.
        if groups or len(name.split(", ")) == 1:
            top_n.append((name, count))
    return top_n
# ============================================================================= | |
# Graphing Message Counts # | |
# # | |
# Public Functions: # | |
# - use_facebook_colours() # | |
# - use_ios_colours() # | |
# - messages_time_graph(Chat, name, filename, no_gui) # | |
# - messages_date_graph(Chat, name, filename, start_date, end_date, no_gui) # | |
# - messages_pie_chart(Chat, N, filename, count_type, groups, # | |
# no_gui, percentages) # | |
# # | |
# ============================================================================= | |
# Named colours, each a tuple of three floats in the form matplotlib accepts:
_FB_BLUE = (0.2314, 0.3490, 0.5961)
_FB_GREY = (0.9294, 0.9294, 0.9294)
_IOS_GREEN = (0.5451, 0.8235, 0.2824)
_IOS_GREY = (0.8980, 0.8980, 0.9176)

# The colours actually used when drawing graphs. The histogram colours start
# unset and are filled in by _change_graph_colours():
_BG_COLOUR = (1.0, 1.0, 1.0)
_TEXT_COLOUR = (0.0, 0.0, 0.0)
_MY_COLOUR = _OTHER_COLOUR = None
def _change_matplotlib_colours(text_color=_TEXT_COLOUR, bg_colour=_BG_COLOUR):
    """Change matplotlib default colours for ALL graphs produced in current session.

    - 'text_color' sets the colour of all text, as well as axes edge and label
      colours, grid lines and axis tick mark colours.
    - 'bg_colour' changes the background and outside fill colour of the plot.

    Both default to the module's _TEXT_COLOUR and _BG_COLOUR. The arguments
    are now honoured; previously the module defaults were always applied
    regardless of what was passed in."""
    matplotlib.rc('figure', facecolor=bg_colour)
    matplotlib.rc('savefig', facecolor=bg_colour, edgecolor=text_color)
    matplotlib.rc('axes', edgecolor=text_color, facecolor=bg_colour, labelcolor=text_color)
    matplotlib.rc('text', color=text_color)
    matplotlib.rc('grid', color=text_color)
    matplotlib.rc('xtick', color=text_color)
    matplotlib.rc('ytick', color=text_color)
def _change_graph_colours(my_colour, other_colour):
    """Set the two histogram colours used by the graphing functions.

    'my_colour' is stored in the module-level _MY_COLOUR and 'other_colour'
    in _OTHER_COLOUR."""
    global _MY_COLOUR, _OTHER_COLOUR
    _MY_COLOUR, _OTHER_COLOUR = my_colour, other_colour
def use_facebook_colours():
    """Switch the graph colours to _FB_BLUE for self and _FB_GREY for others."""
    _change_graph_colours(_FB_BLUE, _FB_GREY)
def use_ios_colours():
    """Switch the graph colours to _IOS_GREEN for self and _IOS_GREY for others."""
    _change_graph_colours(_IOS_GREEN, _IOS_GREY)
# Import-time side effects: select the Facebook palette for histograms and
# push the module's text/background colours into matplotlib's rc settings.
use_facebook_colours()
_change_matplotlib_colours()
# ====== Histogram of Time of Day: | |
def _hour_list():
    """Return the 25 hourly bin edges of a day as fractions of a day.

    The list runs from 0.0 to 1.0 inclusive in steps of 1/24."""
    return [hour / 24.0 for hour in range(25)]
def _dt_to_decimal_time(datetime):
    """Convert a datetime.datetime object into a fraction-of-a-day float.

    The datetime is turned into matplotlib's day number with date2num() and
    the whole-day part is discarded, leaving how far through the day it is."""
    day_number = date2num(datetime)
    whole_days = int(day_number)
    return day_number - whole_days
def messages_time_graph(Chat, name=None, filename=None, no_gui=False):
    """Create a graph of the time of day of messages sent between users.

    Produces a stacked histogram, in hourly bins, of the times of messages sent
    to and received from another user. The method only works for individuals,
    not for threads between multiple friends.

    - 'Chat' should be the Chat object to analyse.
    - 'name' should be the name of the user, and so the Thread, to be graphed.
      If 'name' is None or the name of the current user, every message from
      Chat.all_messages() is graphed instead, split into those written by the
      current user and those written by anyone else.
    - If 'filename' is a string the graph is saved to that file and is not
      displayed onscreen.
    - With no 'filename' the graph is displayed onscreen, unless 'no_gui' is
      True, in which case the function runs but produces no output anywhere."""
    owner = Chat._owner
    # Implement a default case:
    if name is None:
        name = owner
    # Divide up into hourly bins; message times are converted to the range [0,1):
    bins = _hour_list()
    # NOTE(review): date2num appears to convert timezone-aware datetimes to UTC,
    # so the hours plotted may be UTC rather than local time - confirm.
    if name != owner:
        # Graphing one thread: messages from the other person and from the owner.
        thread = Chat[name]
        times_from = [_dt_to_decimal_time(m.date_time) for m in thread.by(name)]
        times_to = [_dt_to_decimal_time(m.date_time) for m in thread.by(owner)]
        label = [owner, name]
        title = "Messages with " + name
    else:
        # Graphing everything: fetch the full message list once and split by author.
        every_message = Chat.all_messages()
        times_from = [_dt_to_decimal_time(m.date_time) for m in every_message if m.author != owner]
        times_to = [_dt_to_decimal_time(m.date_time) for m in every_message if m.author == owner]
        label = [owner, "Others"]
        title = "All Messages Sent"
    # Create the figure, turning off interactive mode if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([times_to, times_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR],
             label=label, stacked=True)
    # Title the graph and label axes:
    plt.suptitle(title, size=18)
    plt.xlabel("Time of Day", labelpad=20, size=15)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Move tick marks to centre of hourly bins by adding ~ half an hour (in days):
    axes = plt.gca()
    axes.set_xticks([b + 0.02 for b in bins])
    plt.xticks(rotation=0, ha='center')
    # Change the tick labels from a fraction through the day to recognisable times.
    # The tick value is shifted by a whole number of days (693596) before being
    # converted, which leaves the time of day untouched, and the half hour added
    # above (0.02) is taken off again:
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(
        lambda numdate, _: num2date(numdate + 693596 - 0.02).strftime('%H:%M')))
    # Add some space at either end of the graph (axis in number of days, so +- 15 mins):
    plt.xlim([bins[0] - 0.01, bins[-1] + 0.01])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # Save if given a filename; otherwise show, but only when a GUI is wanted:
    if isinstance(filename, str):
        plt.savefig(filename, bbox_inches='tight')
    elif not no_gui:
        plt.show()
# ====== Histogram of Date: | |
def _month_list(d1, d2):
    """Generate a list of the first days of the months between d1 and d2 inclusive.

    The list includes the months containing d1 and d2, with an extra month
    on the end for the upper limit of a histogram. Every entry is a naive
    datetime.datetime at midnight on the first of the month."""
    first = datetime.datetime(d1.year, d1.month, 1)
    # The upper limit is the month after d2; December rolls over into January:
    if d2.month == 12:
        upper = datetime.datetime(d2.year + 1, 1, 1)
    else:
        upper = datetime.datetime(d2.year, d2.month + 1, 1)
    months = []
    year, month = first.year, first.month
    # Step one month at a time from the first month up to and including the limit:
    while (year, month) <= (upper.year, upper.month):
        months.append(datetime.datetime(year, month, 1))
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
    return months
def messages_date_graph(Chat, name=None, filename=None, start_date=None, end_date=None, no_gui=False):
    """Create a graph of the number of messages sent between users.

    Produces a stacked histogram, in monthly bins, of messages sent to and
    received from another user. The method only works for individuals, not
    for threads between multiple friends.

    - 'Chat' should be the Chat object to analyse.
    - 'name' should be the name of the user, and so the Thread, to be graphed.
      If 'name' is None or the name of the current user, every message from
      Chat.all_messages() is graphed instead, split into those written by the
      current user and those written by anyone else.
    - If 'filename' is a string the graph is saved to that file and is not
      displayed onscreen.
    - 'start_date' and 'end_date' can be used to narrow the range of dates
      covered; the default is the first message to the last. Each is passed
      through Chat._date_parse(), and if both are given in the wrong order
      they are swapped. Dates outside the range of the messages are clamped
      to the first or last message.
    - With no 'filename' the graph is displayed onscreen, unless 'no_gui' is
      True, in which case the function runs but produces no output anywhere."""
    owner = Chat._owner
    # Implement a default case:
    if name is None:
        name = owner
    # Parse the limits before comparing them, so that differently-typed inputs
    # (whatever mix Chat._date_parse accepts) can be ordered against each other:
    if start_date is not None:
        start_date = Chat._date_parse(start_date)
    if end_date is not None:
        end_date = Chat._date_parse(end_date)
    if (start_date is not None) and (end_date is not None) and (start_date > end_date):
        start_date, end_date = end_date, start_date
    if name != owner:
        # Graphing one thread: messages from the other person and from the owner.
        thread = Chat[name]
        first, last = thread[0], thread[-1]
        dates_from = [date2num(m.date_time) for m in thread.by(name)]
        dates_to = [date2num(m.date_time) for m in thread.by(owner)]
        label = [owner, name]
        title = "Messages with " + name
    else:
        # Graphing everything: split the full message list by author.
        message_list = Chat.all_messages()
        first, last = message_list[0], message_list[-1]
        dates_from = [date2num(m.date_time) for m in message_list if m.author != owner]
        dates_to = [date2num(m.date_time) for m in message_list if m.author == owner]
        label = [owner, "Others"]
        title = "All Messages Sent"
    # Use a given start date only if it is after the first message, and a given
    # end date only if it is before the last message:
    d_min = first.date_time if start_date is None else max(start_date, first.date_time)
    d_max = last.date_time if end_date is None else min(end_date, last.date_time)
    # Divide up into month bins, changing datetime objects to number of days for plotting.
    # Messages outside the bins are simply not counted by the histogram:
    bins = [date2num(b) for b in _month_list(d_min, d_max)]
    # Create the figure, turning off interactive mode if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([dates_to, dates_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR],
             label=label, stacked=True)
    # Title the graph and label axes:
    plt.suptitle(title, size=18)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Put the tick marks at the rough centre of months by adding 15 days (~ 1/2 a month):
    axes = plt.gca()
    axes.set_xticks([b + 15 for b in bins])
    # The x labels are unreadable at an angle if there are too many of them; put them vertical if so:
    if len(bins) > 45:
        plt.xticks(rotation='vertical')
    else:
        plt.xticks(rotation=30, ha='right')
    # Change the tick labels from a number of days to recognisable dates:
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(
        lambda numdate, _: num2date(numdate).strftime('%b %Y')))
    # Add some space at either end of the graph (axis in number of days, so -10 days and +5 days):
    plt.xlim([bins[0] - 10, bins[-1] + 5])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # Save if given a filename; otherwise show, but only when a GUI is wanted:
    if isinstance(filename, str):
        plt.savefig(filename, bbox_inches='tight')
    elif not no_gui:
        plt.show()
# ====== Pie Chart of Totals: | |
# Pie chart wedge colours, reused cyclically when there are more wedges than colours.
# Palette from http://www.mulinblog.com/a-color-palette-optimized-for-data-visualization/
_COLOURS = [
    '#5DA5DA',
    '#FAA43A',
    '#60BD68',
    '#F17CB0',
    '#B2912F',
    '#B276B2',
    '#DECF3F',
    '#F15854',
]
def _make_labels_wrap(labels):
    """Put each name of a long multi-name label on its own line.

    Any label longer than 25 characters has every ", " separator replaced by a
    newline. The list is modified in place and also returned."""
    labels[:] = [
        '\n'.join(label.split(", ")) if len(label) > 25 else label
        for label in labels
    ]
    return labels
def messages_pie_chart(Chat, N=10, filename=None, count_type="total", groups=False,
                       no_gui=False, percentages=True):
    """Create a pie chart of the number of messages exchanged with friends.

    The graph shows the most messaged friends sorted using the top_n_people()
    code. The graph also shows percentage sizes of wedges, though this can be
    disabled.

    - 'Chat' should be the Chat object to analyse.
    - 'N' should be how many people to show explicitly; all others are grouped
      together in a final "Others" wedge.
    - If 'filename' is a string the graph is saved to that file and is not
      displayed onscreen.
    - The 'count_type' argument is passed to top_n_people() and also selects
      the title. A value without a title of its own gets the "total" title,
      matching how top_n_people() treats unrecognised values.
    - Setting 'groups' to True will include message threads with groups where
      appropriate.
    - With no 'filename' the graph is displayed onscreen, unless 'no_gui' is
      True, in which case the function runs but produces no output anywhere.
    - The percentages on the graph can be removed by setting 'percentages' to
      False."""
    # The title of the graph depends on the count_type:
    _title_dict = {"total": "Total Lengths of Message Threads",
                   "allfrom": "Total Number of Messages Received",
                   "from": "Number of Messages Received from People in Personal Threads",
                   "to": "Number of Messages Sent to People in Personal Threads",
                   "words": "Total Word Counts of Message Threads",
                   "wordsfrom": "Word Count of All Messages Received from People in Personal Threads",
                   "wordsto": "Word Count of All Messages Sent to People in Personal Threads",
                   "chars": "Total Character Lengths of Message Threads",
                   "charsfrom": "Character Length of All Messages Received from People in Personal Threads",
                   "charsto": "Character Length of All Messages Sent to People in Personal Threads"}
    title = _title_dict.get(count_type, _title_dict["total"])
    # The data to plot:
    thread_counts = top_n_people(Chat, count_type=count_type, groups=groups)
    # Run through the data: the first N entries get a wedge each, the rest go into Others:
    names = []
    counts = []
    colours = []
    other_count = 0
    for n, (thread_name, count) in enumerate(thread_counts):
        if n < N:
            names.append(thread_name)
            counts.append(count)
            colours.append(_COLOURS[n % len(_COLOURS)])
        else:
            other_count += count
    # Add an "Others" section in dark grey using the other_count:
    names.append("Others")
    counts.append(other_count)
    colours.append('#4D4D4D')
    # If long names, wrap them:
    _make_labels_wrap(names)
    # Create the figure, turning off interactive mode if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    # Plot percentage counts on the figure only if asked to:
    pct = '%1.1f%%' if percentages else None
    # We want the edges of the wedges in the chart to be white for aesthetics.
    # This changes a global matplotlib default, so remember the old value and
    # put it back afterwards even if plotting fails:
    previous_edge_colour = plt.rcParams['patch.edgecolor']
    plt.rcParams['patch.edgecolor'] = 'white'
    try:
        # Make the plot, starting at the top (90 degrees from horizontal) and
        # percentages outside (pctdistance > 1):
        plt.pie(counts, colors=colours, autopct=pct, pctdistance=1.1, startangle=90,
                counterclock=False)
        # Put the right title on the graph:
        plt.suptitle(title, size=18)
        # And make it circular:
        plt.axis('equal')
        # Add the legend:
        plt.legend(labels=names, frameon=False, labelspacing=1, loc="center",
                   bbox_to_anchor=[0, 0.5])
        # Save if given a filename; otherwise show, but only when a GUI is wanted:
        if isinstance(filename, str):
            plt.savefig(filename, bbox_inches='tight')
        elif not no_gui:
            plt.show()
    finally:
        plt.rcParams['patch.edgecolor'] = previous_edge_colour
# ============================================================================= | |
# Word Frequency Analysis # | |
# # | |
# Public Functions: # | |
# - top_word_use(Chat, name, from_me, ignore_single_words) # | |
# # | |
# ============================================================================= | |
def _str_to_word_list(text):
    """Turn a string into a list of words, removing URLs and punctuation.

    - The function takes in a string and returns a list of strings.
    - The text is lower-cased, emoticons are replaced by placeholder words
      (e.g. ":)" becomes "happyfacesmiley"), the endings "'s" and "'ll" and all
      listed punctuation are removed, and remaining apostrophes are dropped
      ("don't" becomes "dont").
    - Non-ASCII characters are replaced by "?"."""
    # Word endings that are deleted outright so "it's" and "it" count as one word:
    _SUFFIXES = ["'s", "'ll"]
    # Characters that need deleting from messages to separate them into proper words:
    _PUNCTUATION = [".", ",", ":", ";", "!", "?", "*", '"', "-", "+", "^", "_", "~", "(", ")",
                    "[", "]", "/", "\\", "@", "="]
    # Emoticons are swapped for placeholder words so they survive punctuation removal:
    _EMOTICONS = {":p": "tongueoutsmiley", ":-p": "tongueoutsmiley",
                  ":)": "happyfacesmiley", ":-)": "happyfacesmiley", ":/": "awkwardfacesmiley",
                  ":-/": "awkwardfacesmiley", "<3": "loveheartsmiley", ":(": "sadfacesmiley",
                  ":-(": "sadfacesmiley", ":'(": "cryingfacesmiley", ":d": "grinningfacesmiley",
                  ":-d": "grinningfacesmiley", ";)": "winkfacesmiley", ";-)": "winkfacesmiley",
                  ":o": "shockedfacesmiley"}
    # Remove URLs with a regular expression, else they mess up when removing punctuation:
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    # Remove the NEWLINE denoting string, and replace with a space before anything else:
    text = text.replace("<|NEWLINE|>", " ")
    text = text.lower()
    # Order matters here. The endings go first, while their apostrophes are still
    # present; no emoticon contains "'s" or "'ll", so this cannot damage one:
    for suffix in _SUFFIXES:
        text = text.replace(suffix, " ")
    # Emoticons next, before their punctuation (and the apostrophe in ":'(") is stripped:
    for emoticon, placeholder in _EMOTICONS.items():
        text = text.replace(emoticon, placeholder)
    for mark in _PUNCTUATION:
        text = text.replace(mark, " ")
    # Finally drop any apostrophes that are left, joining the two halves of the word:
    text = text.replace("'", "")
    # Split on any whitespace first, then get rid of non-ASCII characters for
    # simplicity. Decoding again keeps the words as str rather than bytes:
    return [word.encode('ascii', 'replace').decode('ascii') for word in text.split()]
def _message_list_word_list(messages):
    """Take a list of Message objects and return a list of strings.

    The returned list of strings contains all of the words in the messages,
    in message order. Messages whose 'text' is None contribute no words."""
    word_list = []
    for message in messages:
        # A message can have no text at all; there is nothing to split in that case.
        if message.text is None:
            continue
        word_list.extend(_str_to_word_list(message.text))
    return word_list
def _word_list_to_freq(words, ignore_single_words=False):
    """Take a list of strings, and return a list of (word, word_use_count).

    - The returned list of pairs is sorted in descending order of count; words
      with equal counts keep the order in which they first appeared.
    - Emoticon placeholder words (e.g. "happyfacesmiley") are turned back into
      the emoticons they stand for.
    - Passing 'ignore_single_words' will remove any words only used once."""
    # Imported here so the function does not rely on the module's import block:
    from collections import Counter
    # Several emoticons share one placeholder, so changing back needs its own mapping:
    _CHANGE_BACK = {"tongueoutsmiley": ":P", "happyfacesmiley": ":)", "awkwardfacesmiley": ":/",
                    "loveheartsmiley": "<3", "sadfacesmiley": ":(", "cryingfacesmiley": ":'(",
                    "grinningfacesmiley": ":D", "winkfacesmiley": ";)", "shockedfacesmiley": ":o"}
    # Count every word in a single pass (calling words.count() per word is quadratic):
    freq = Counter(words)
    # Change the placeholders back to emoticons:
    for placeholder, emoticon in _CHANGE_BACK.items():
        if placeholder in freq:
            freq[emoticon] = freq.pop(placeholder)
    # Convert to a list and sort, largest count first:
    ranked = sorted(freq.items(), key=lambda tup: tup[1], reverse=True)
    # If only want words used more than once, remove those with count <= 1:
    if ignore_single_words:
        ranked = [pair for pair in ranked if pair[1] > 1]
    return ranked
def top_word_use(Chat, name, from_me=False, ignore_single_words=False):
    """Work out the most commonly used words in a conversation with a friend.

    The function returns a list of (word, word_use_count) tuples, most used
    first. The analysis is done directly in Python, so long threads can take
    a while.

    - 'name' is a string of the name of the Thread to consider. If it is the
      name of the current user, all messages from Chat.all_from() for the
      current user are analysed instead and 'from_me' is ignored.
    - 'from_me' is a boolean flag to consider messages sent by you to 'name'
      if True, otherwise messages received from 'name' are used, the default.
    - Setting 'ignore_single_words' to True removes words which are only used
      once, which reduces the length of the list returned."""
    owner = Chat._owner
    if name == owner:
        messages = Chat.all_from(owner)
    else:
        # Within the thread, pick which side of the conversation to analyse:
        sender = owner if from_me else name
        messages = Chat[name].by(sender)
    return _word_list_to_freq(_message_list_word_list(messages), ignore_single_words)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2018 James Sharkey (https://github.com/jsharkey13/facebook_message_parser) | |
# Copyright (c) 2015 Chris Copley (https://github.com/CopOnTheRun/FB-Message-Parser) | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import csv | |
import datetime | |
import json | |
import zipfile | |
from io import StringIO | |
import pytz | |
import tzlocal | |
import dateutil.tz | |
import facebook_analysis_demo | |
# User Configuration - values to be edited for each user before running:
FACEBOOK_ZIPFILE = "facebook-username.zip"
# Timezone name; looked up with dateutil.tz.gettz() to build 'local_timezone':
LOCAL_TIMEZONE = "Europe/London"
MY_FACEBOOK_NAME = "My Name Here"
# NOTE(review): presumably maps a thread ID to the name to use for an unnamed
# participant in that thread - confirm against the code that reads it.
UNKNOWN_USER_MAP = {"DemoThreadId": "Demo Replacement Name"}
# Useful constants (not intended to be edited):
MESSAGE_TYPE = "Facebook"
ZIP_MESSAGE_DIR = "messages/"
THREAD_ID_TEMPLATE = "FACEBOOK_{0:s}"
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
FACEBOOK_USER = "Facebook User"

# tzinfo object for LOCAL_TIMEZONE, looked up once at import. Message
# timestamps are converted to this zone when a Message is created:
local_timezone = dateutil.tz.gettz(LOCAL_TIMEZONE)
# Some old classes that will be useful for analysis code: | |
class Message(object): | |
"""An object to encapsulate a Message. | |
- Contains a string of the author's name, the timestamp, number in the thread | |
and the body of the message. | |
- When initialising, 'thread_id' should be the containing Thread's ID, | |
'author' should be string containing the message sender's name, 'date_time' | |
should be a datetime.datetime object, 'text' should be the content of | |
the message and 'number' should be the number of the message in the thread.""" | |
@staticmethod
def _date_parse(date):
    """Allow dates to be entered as integer tuples (YYYY, MM, DD[, HH, MM, SS]).

    Removes the need to supply datetime objects, but still allows dates
    to be entered as datetime.datetime objects. The Year, Month and
    Day are compulsory, the time fields optional. May cause exceptions
    if poorly formatted tuples are used.

    - A datetime.datetime is converted to UTC and returned.
    - A tuple is unpacked into datetime.datetime(), interpreted as a time in
      the zone returned by tzlocal.get_localzone(), and converted to UTC."""
    if isinstance(date, datetime.datetime):
        return date.astimezone(pytz.utc)
    naive = datetime.datetime(*date)
    local_tz = tzlocal.get_localzone()
    if hasattr(local_tz, "localize"):
        # pytz-style zone: localize() is the correct way to attach it.
        aware = local_tz.localize(naive)
    else:
        # Standard tzinfo (e.g. zoneinfo) has no localize(); attach it directly.
        aware = naive.replace(tzinfo=local_tz)
    return aware.astimezone(pytz.utc)
@staticmethod | |
def _csv_header_row(): | |
return ["timestamp", "thread_id", "message_number", "author", "message"] | |
def __init__(self, *, source, thread_id, author, date_time, text, number, attachments=None):
    """Store the details of a single message; all arguments are keyword-only.

    - 'date_time' is converted to the module's 'local_timezone' before being
      stored.
    - 'attachments' may be a list or set, which is stored as given; any other
      non-None value is wrapped in a one-item list, and None becomes an empty
      list.
    - 'number' is stored as the private '_num' attribute."""
    if attachments is None:
        attachment_list = []
    elif isinstance(attachments, (list, set)):
        attachment_list = attachments
    else:
        attachment_list = [attachments]
    self.source = source
    self.thread_id = thread_id
    self.author = author
    self.date_time = date_time.astimezone(local_timezone)
    self.text = text
    self.attachments = attachment_list
    self._num = number
def __repr__(self): | |
"""Set Python's representation of the Message object.""" | |
return "<MESSAGE: THREAD='{}' NUMBER='{}' TIMESTAMP='{}' AUTHOR='{}' MESSAGE='{}'>". \ | |
format(self.thread_id, self._num, self.date_time, self.author, self.as_text()) | |
def __str__(self): | |
"""Return a string form of a Message.""" | |
return repr(self) | |
def __lt__(self, message): | |
"""Allow sorting of messages by implementing the less than operator. | |
Sorting is by date, unless two messages were sent at the same time, | |
in which case message number is used to resolve conflicts. This number | |
ordering holds fine for messages in single threads, but offers no real | |
objective order outside a thread.""" | |
if self.date_time == message.date_time: | |
return self._num < message._num | |
else: | |
return self.sent_before(message.date_time) | |
def __gt__(self, message): | |
"""Allow sorting of messages by implementing the greater than operator. | |
Sorting is by date, unless two messages were sent at the same time, | |
in which case message number is used to resolve conflicts. This number | |
ordering holds fine for messages in single threads, but offers no real | |
objective order outside a thread.""" | |
if self.date_time == message.date_time: | |
return self._num > message._num | |
else: | |
return self.sent_after(message.date_time) | |
def __eq__(self, message): | |
"""Messages are equal if their number, date, author and text are the same.""" | |
equal = (self._num == message._num) and (self.author == message.author) | |
equal = equal and (self.date_time == message.date_time) and (self.text == message.text) | |
return equal | |
def __len__(self): | |
"""Return the number of characters in the message body.""" | |
if self.text is not None: | |
return len(self.text) | |
return 0 | |
def is_empty_message(self): | |
"""Return True if the message contains no content.""" | |
return ((self.text is None) or (self.text == "")) and len(self.attachments) == 0 | |
def as_text(self): | |
"""Return a text representation of the message, including any attachments.""" | |
text_form = "" | |
if self.text is not None: | |
text_form += self.text.replace("\n", "\\n") | |
for r in self.attachments: | |
text_form += "\\n - {0}".format(r) | |
return text_form | |
def sent_by(self, name): | |
"""Return True if the message was sent by 'name'.""" | |
return self.author == name | |
def sent_before(self, date): | |
"""Return True if the message was sent before the date specified. | |
The 'date' can be a datetime.datetime object, or a three or five tuple | |
(YYYY, MM, DD[, HH, MM]).""" | |
date = self._date_parse(date) | |
return self.date_time < date | |
def sent_after(self, date): | |
"""Return True if the message was sent after the date specified. | |
The 'date' can be a datetime.datetime object, or a three or five tuple | |
(YYYY, MM, DD[, HH, MM]).""" | |
date = self._date_parse(date) | |
return self.date_time > date | |
def sent_between(self, start, end=None): | |
"""Return True if the message was sent between the dates specified. | |
- The 'start' and 'end' can be datetime.datetime objects, or | |
a three or five tuple (YYYY, MM, DD[, HH, MM]). The start and end times | |
are inclusive since this is simplest. | |
- Not entering an 'end' date is interpreted as all messages sent on | |
the day 'start'. Where a time is specified also, a 24 hour period | |
beginning at 'start' is used.""" | |
start = self._date_parse(start) | |
if end is not None: | |
end = self._date_parse(end) | |
else: | |
end = start + datetime.timedelta(days=1) # 1 day later than 'start' | |
return start <= self.date_time <= end | |
def contains(self, search_string, *, ignore_case=False): | |
"""Return True if 'search_string' is contained in the message text.""" | |
if ignore_case: | |
return search_string.lower() in self.text.lower() | |
else: | |
return search_string in self.text | |
def to_csv(self, filename=None, *, add_header=True, _file_handle=None): | |
"""Output the message as a comma-separated string | |
- An optional header row is added by default, but can be turned off | |
by setting 'add_header' to False. | |
- Output is to file if 'filename' is provided, else the method returns | |
a string of the message in CSV format. | |
- An open file handle or object supporting the write(...) method can be | |
provided as the '_file_handle' argument, but this is primarily internal.""" | |
if (filename is not None) or (_file_handle is not None): | |
file_handle = _file_handle or open(filename, mode='w', encoding='utf-8') | |
else: | |
file_handle = StringIO() | |
csv_writer = csv.writer(file_handle, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n') | |
if add_header: | |
csv_writer.writerow(self._csv_header_row()) | |
csv_writer.writerow([self.date_time, self.thread_id, self._num, self.author, self.as_text()]) | |
if filename is not None: | |
file_handle.close() | |
elif (filename is None) and (_file_handle is None): | |
# Return a string: | |
return file_handle.getvalue().strip("\r\n") | |
class Thread(object):
    """An object to encapsulate a Message thread.

    - Contains a list of participants, a string form of the list and a list
      of messages in the thread as Message objects.
    - When initialising, 'people' should be the names of the participants,
      either ", " separated in a string or as a list, set or other iterable
      of names, and 'messages' should be a list of Message objects.
    - 'owner' is removed from the participants if present, and unless 'clean'
      is False, empty messages are dropped and the rest renumbered from 1.
    """

    def __init__(self, *, _id, owner, people, messages, clean=True):
        if not all(isinstance(m, Message) for m in messages):
            raise TypeError("The message list must be Message objects!")
        self._id = str(_id)
        if isinstance(people, str):
            people = people.split(", ")
        self.people = sorted(people)
        # The owner may legitimately be absent already, for example when a
        # Thread is rebuilt from another Thread's participant list.
        if owner in self.people:
            self.people.remove(owner)
        self.people_str = ", ".join(self.people)
        self._owner = owner
        self.messages = sorted(messages)
        if clean:
            self._clean_messages(renumber=True)

    def __getitem__(self, key):
        """Allow accessing Message objects in the messages list using Thread[n].

        Beware out by one errors! The message numbers start counting at 1,
        but the list they are stored in is indexed from 0.
        - This behaviour could be corrected by either subtracting one from
          the key (which causes issues when slicing), or by counting messages
          from 0.
        """
        return self.messages[key]

    def __contains__(self, item):
        """Allow checking membership using 'Message in Thread'."""
        if isinstance(item, Message):
            return item in self.messages
        return False

    def __repr__(self):
        """Set Python's representation of the Thread object."""
        return "<THREAD: ID='{}' PEOPLE='{}', MESSAGE_COUNT={}>".format(self._id, self.people_str, len(self.messages))

    def __str__(self):
        """Return a string summary of a Thread."""
        return repr(self)

    def __len__(self):
        """Return the total number of messages in the thread."""
        return len(self.messages)

    def _clean_messages(self, *, renumber=False):
        """Remove messages with no content from the thread.

        This is useful when empty messages are provided by an export
        and need to be removed systematically. The remaining messages are
        renumbered if 'renumber' is True.
        """
        self.messages = [m for m in self.messages if not m.is_empty_message()]
        if renumber:
            self._renumber_messages()

    def _add_messages(self, new_messages):
        """Allow adding messages to an already created Thread object.

        This function is useful for merging duplicate threads together. The
        message list is re-sorted, but not renumbered.
        """
        self.messages.extend(new_messages)
        self.messages = sorted(self.messages)

    def _renumber_messages(self):
        """Renumber all messages in the 'messages' list, counting from 1.

        Message objects are sorted after being added; but if messages are
        added using _add_messages() then the numbering may be incorrect. This
        function fixes that.
        """
        for i, message in enumerate(self.messages, start=1):
            message._num = i

    def rename_participant(self, old_name, new_name):
        """Change the name of a participant in the thread and all messages."""
        self.people = [new_name if p == old_name else p for p in self.people]
        # Rebuild the string rather than using str.replace on it, which would
        # also rewrite any other name that merely contains 'old_name'.
        self.people_str = ", ".join(self.people)
        for m in self.messages:
            if m.author == old_name:
                m.author = new_name

    def merge(self, thread, *, renumber=True):
        """Merge another thread with the same participants into this one.

        This is a stateful operation and modifies the original thread. It
        returns True on success, or False if the merge did not occur because
        'thread' is not a Thread or has differing participants.
        - The Message objects are shared between both threads rather than
          copied, so renumbering here also changes the numbers seen through
          the 'thread' argument.
        """
        if not isinstance(thread, Thread) or set(self.people) != set(thread.people):
            return False
        self._add_messages(thread.messages)
        self._id = "{0:s}&{1:s}".format(self._id, thread._id)
        if renumber:
            self._renumber_messages()
        return True

    def by(self, name):
        """Return a date ordered list of all messages sent by 'name'.

        Returns a list of Message objects.
        """
        return [message for message in self.messages if message.sent_by(name)]

    def sent_before(self, date):
        """Return a date ordered list of all messages sent before specified date.

        The function returns a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM]).
        """
        return [message for message in self.messages if message.sent_before(date)]

    def sent_after(self, date):
        """Return a date ordered list of all messages sent after specified date.

        The list returned is a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM]).
        """
        return [message for message in self.messages if message.sent_after(date)]

    def sent_between(self, start, end=None):
        """Return a date ordered list of all messages sent between specified dates.

        - The list returned is a list of Message objects. The 'start' and 'end'
          can be datetime.datetime objects, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        - Not entering an 'end' date is interpreted as all messages sent on
          the day 'start'. Where a time is specified also, a 24 hour period
          beginning at 'start' is used.
        """
        return [message for message in self.messages if message.sent_between(start, end)]

    def search(self, string, *, ignore_case=False):
        """Return a date ordered list of messages in Thread containing 'string'.

        This function searches the current thread, and returns a list of
        Message objects.
        - The function can be made case-insensitive by setting 'ignore_case'
          to True.
        """
        return sorted([message for message in self.messages if message.contains(string, ignore_case=ignore_case)])

    def on(self, date):
        """Return the Thread object as it would have been on 'date'.

        The Thread object returned is a new object containing the subset of
        the messages sent before 'date'.
        - 'date' can be a datetime.datetime object, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        - The Message objects are shared with this thread, so the new Thread
          is built with clean=False to leave their numbering untouched.
        """
        return Thread(_id=self._id, owner=self._owner, people=self.people,
                      messages=self.sent_before(date), clean=False)

    def to_csv(self, filename=None, *, add_header=True, _file_handle=None):
        """Output every message in the thread in CSV format.

        - A header row is added before the first message unless 'add_header'
          is False.
        - Output is to file if 'filename' is provided, in which case None is
          returned. Otherwise a string with one line per message is returned,
          or None if the thread has no messages.
        - An open file handle or object supporting the write(...) method can
          be provided as the '_file_handle' argument, but this is primarily
          internal. A provided handle takes precedence over 'filename' and is
          never closed here; only a file opened by this method is closed.
        """
        if (filename is not None) or (_file_handle is not None):
            # Either use existing file handle or open a new one:
            opened_here = _file_handle is None
            file_handle = open(filename, mode='w', encoding='utf-8') if opened_here else _file_handle
            try:
                # Output the messages to file:
                for i, message in enumerate(self.messages):
                    message.to_csv(add_header=(i == 0 and add_header), _file_handle=file_handle)
            finally:
                # If opened a file here, then close it, even after an error:
                if opened_here:
                    file_handle.close()
            return None
        if len(self.messages) == 0:
            return None
        if len(self.messages) > 1000:
            print("There are more than 1000 messages, conversion to string may fail or crash Python!")
        # Joining every row at once avoids a stray trailing newline when the
        # thread holds a single message.
        return "\n".join(message.to_csv(add_header=(i == 0 and add_header))
                         for i, message in enumerate(self.messages))
class Chat(object):
    """An object to encapsulate a group of Threads.

    - Contains a list of Thread objects, which can be accessed using item
      accessing Chat["Thread Name"] style.
    - When initialising, 'owner' should be the name of the user, and 'threads'
      should be a list of Thread objects.
    - Provides useful functions for accessing messages.
    """

    def __init__(self, *, owner, threads):
        if not all(isinstance(t, Thread) for t in threads):
            raise TypeError("The thread list must be Thread objects!")
        self._owner = owner
        self._all_people = {owner}
        self.threads = []
        self._thread_dict = {}
        for thread in threads:
            self._add_new_thread(thread)
        # Largest threads first:
        self.threads = sorted(self.threads, key=len, reverse=True)

    def __getitem__(self, key):
        """Allow accessing Thread objects in the list using Chat["Thread Name"].

        This method allows the threads list to be accessed using
        Chat["Thread Name"], Chat[n] or Chat[a:b] notation. Unknown names raise
        KeyError, out of range numbers raise IndexError and any other kind of
        key raises TypeError.
        """
        if isinstance(key, str):
            return self._thread_dict[key]
        if isinstance(key, (int, slice)):
            return self.threads[key]
        raise TypeError("Chat keys must be thread names, integers or slices, not '{0}'!".format(type(key).__name__))

    def __contains__(self, item):
        """Allow checking membership using '"Thread Name" in Chat' or 'Thread in Chat'."""
        if isinstance(item, str):
            return item in self._thread_dict
        if isinstance(item, Thread):
            return item in self.threads
        return False

    def __repr__(self):
        """Set Python's representation of the Chat object."""
        return "<CHAT LOG: OWNER='{:s}' TOTAL_THREADS={:d} TOTAL_MESSAGES={:d}>".format(self._owner, len(self.threads), self.count_messages())

    def __len__(self):
        """Return the total number of threads.

        Allows the len() method to be called on a Chat object. This could be
        changed to be the total number of messages, currently available as
        Chat.count_messages().
        """
        return len(self.threads)

    def _add_new_thread(self, thread):
        """Add a new thread to the chat object.

        The thread is appended to the thread list and registered under its
        participant string. No merging is performed: a thread with the same
        participant string as an earlier one replaces it in the name lookup.
        """
        thread_key = thread.people_str
        # FIXME: Only one of the same named threads is accessible by name!
        self.threads.append(thread)
        self._thread_dict[thread_key] = thread
        self._all_people.update(thread.people)

    def merge(self, chat):
        """Merge the threads of another chat into this one.

        The chats must be owned by the same person to be merged. Returns True
        if the merge happened, or False (after printing a message) if 'chat'
        is not a Chat or has a different owner. The Thread objects are shared
        with 'chat', not copied, and threads are added rather than combined.
        """
        if not isinstance(chat, Chat) or self._owner != chat._owner:
            print("Invalid chat object provided!")
            return False
        for thread in chat.threads:
            self._add_new_thread(thread)
        self.threads = sorted(self.threads, key=len, reverse=True)
        return True

    def rename_thread(self, old_name, new_name, *, rename_participant=True):
        """Rename a thread, and optionally rename the participant too.

        Raises KeyError if there is no thread called 'old_name'.
        """
        thread = self._thread_dict.pop(old_name)
        if rename_participant:
            thread.rename_participant(old_name, new_name)
        self._thread_dict[new_name] = thread

    def count_messages(self):
        """Count the total number messages.

        Since Thread objects can be extended dynamically, this may prove
        necessary.
        """
        return sum(len(thread) for thread in self.threads)

    def all_messages(self):
        """Return a date ordered list of all messages.

        The list is all messages contained in the Chat object, as a list of
        Message objects.
        """
        return sorted([message for thread in self.threads for message in thread.messages])

    def all_from(self, name):
        """Return a date ordered list of all messages sent by 'name'.

        The list returned is a list of Message objects. This is distinct from
        Thread.by(name) since all threads are searched by this method. For all
        messages in one thread from 'name', use Thread.by(name) on the correct Thread.
        """
        return sorted([message for thread in self.threads for message in thread.by(name)])

    def sent_before(self, date):
        """Return a date ordered list of all messages sent before specified date.

        The function returns a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM]).
        """
        return sorted([message for thread in self.threads for message in thread.sent_before(date)])

    def sent_after(self, date):
        """Return a date ordered list of all messages sent after specified date.

        The list returned is a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM]).
        """
        return sorted([message for thread in self.threads for message in thread.sent_after(date)])

    def sent_between(self, start, end=None):
        """Return a date ordered list of all messages sent between specified dates.

        - The list returned is a list of Message objects. The 'start' and 'end'
          can be datetime.datetime objects, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        - Not entering an 'end' date is interpreted as all messages sent on
          the day 'start'. Where a time is specified also, a 24 hour period
          beginning at 'start' is used.
        """
        return sorted([message for thread in self.threads for message in thread.sent_between(start, end)])

    def search(self, string, *, ignore_case=False):
        """Return a date ordered list of all messages containing 'string'.

        This function searches in all threads, and returns a list of Message
        objects.
        - The function can be made case-insensitive by setting 'ignore_case'
          to True.
        """
        return sorted([message for thread in self.threads for message in thread.search(string, ignore_case=ignore_case)])

    def on(self, date):
        """Return the Chat object as it would have been on 'date'.

        The Chat object returned is a new object containing the subset of the
        Threads which contain messages sent before 'date', where each of these
        Threads is a new Thread with only these messages in.
        - 'date' can be a datetime.datetime object, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        """
        # Build each historical thread once, then keep the non-empty ones:
        snapshots = [thread.on(date) for thread in self.threads]
        return Chat(owner=self._owner, threads=[t for t in snapshots if len(t) > 0])

    def to_csv(self, filename=None, *, add_header=True, _file_handle=None):
        """Output every message of every thread in CSV format.

        - A header row is added before the first thread unless 'add_header'
          is False.
        - Output is to file if 'filename' is provided, in which case None is
          returned. Otherwise a string is returned, or None if there are no
          messages at all.
        - An open file handle or object supporting the write(...) method can
          be provided as the '_file_handle' argument, but this is primarily
          internal. A provided handle takes precedence over 'filename' and is
          never closed here; only a file opened by this method is closed.
        """
        if (filename is not None) or (_file_handle is not None):
            # Not every writable object (e.g. StringIO) has a 'name':
            target_name = filename or getattr(_file_handle, "name", repr(_file_handle))
            print("Writing messages to CSV file '{0}'.".format(target_name))
            # Either use existing file handle or open a new one:
            opened_here = _file_handle is None
            file_handle = open(filename, mode='w', encoding='utf-8') if opened_here else _file_handle
            try:
                # Output the messages to file:
                for i, thread in enumerate(self.threads):
                    thread.to_csv(add_header=(i == 0 and add_header), _file_handle=file_handle)
            finally:
                # If opened a file here, then close it, even after an error:
                if opened_here:
                    file_handle.close()
            return None
        if self.count_messages() == 0:
            return None
        if self.count_messages() > 1000:
            print("There are more than 1000 messages, conversion to string may fail or crash Python!")
        # Threads without messages have no string form, so leave them out:
        non_empty = [thread for thread in self.threads if len(thread) > 0]
        return "\n".join(thread.to_csv(add_header=(i == 0 and add_header))
                         for i, thread in enumerate(non_empty))
# Some useful functions: | |
def canonicalise_facebook_name(name):
    """Return 'name' hyphenated, lowercased and prefixed with 'fb_'."""
    hyphenated = name.replace(" ", "-")
    return "fb_" + hyphenated.lower()
def fix_participant(participant, thread_id, unknown_user_map):
    """Return a usable name for 'participant', resolving unknown users.

    - A missing (None or empty) name is treated as the FACEBOOK_USER
      placeholder; any other name is returned unchanged.
    - For the placeholder, 'unknown_user_map' is consulted using 'thread_id'
      with the "FACEBOOK_" prefix removed as the key. If there is an entry,
      its value is returned.
    - Otherwise a suggestion is printed, the placeholder is recorded in
      'unknown_user_map' under that key (so later lookups for the same thread
      find an entry and print nothing), and the placeholder is returned.
    """
    fixed_participant = participant or FACEBOOK_USER
    if fixed_participant != FACEBOOK_USER:
        return fixed_participant
    thread_key = thread_id.replace("FACEBOOK_", "")
    if thread_key in unknown_user_map:
        return unknown_user_map[thread_key]
    # This user is still unknown, print a message suggesting adding an entry to the
    # map that fixes up unknown users. Of course, Facebook merge all deleted/removed
    # users together, so this may really be several people not just one . . .
    print("  - Unknown user in thread: '{0:s}'.".format(thread_id))
    print("    Add \"{0:s}\": \"Real Name\" to the UNKNOWN_USER_MAP to remove this warning.".format(thread_key))
    # Record in the map that was passed in, not in a module-level global:
    unknown_user_map[thread_key] = fixed_participant
    return fixed_participant
def fix_participants(participants, thread_id, unknown_user_map):
    """Return a list with fix_participant() applied to each of 'participants'."""
    fixed_names = []
    for name in participants:
        fixed_names.append(fix_participant(name, thread_id, unknown_user_map))
    return fixed_names
# Open the zip file:
print("Opening zipfile.")
thread_list = []
# The 'with' block closes the archive even if a thread fails to parse.
with zipfile.ZipFile(FACEBOOK_ZIPFILE) as zip_archive:
    message_files = [f for f in zip_archive.namelist() if f.startswith(ZIP_MESSAGE_DIR) and f.endswith(".json")]

    # Process the messages:
    print("Processing message threads.")
    for message_file in message_files:
        with zip_archive.open(message_file) as json_file:
            raw_json = json.load(json_file)
        # Re-serialise the parsed JSON, then reinterpret its characters as Latin-1
        # bytes holding UTF-8 text before parsing again.
        # NOTE(review): presumably this repairs doubly-encoded text in the export;
        # it raises UnicodeEncodeError if any character is above U+00FF.
        message_json = json.loads(json.dumps(raw_json, ensure_ascii=False).encode("latin-1").decode("utf-8"))
        if not message_json.get("is_still_participant"):
            # Skip message threads where we have opted to leave them.
            continue
        thread_id = THREAD_ID_TEMPLATE.format(message_json.get("thread_path"))
        participants = [p["name"] for p in message_json.get("participants", [])]
        participants = fix_participants(participants, thread_id, UNKNOWN_USER_MAP)
        message_list = []
        for n, message in enumerate(message_json.get("messages", [])[::-1]):  # Inline list reversal into ascending order!
            if message.get("content") is None:
                # Skip blank messages.
                continue
            message_text = message.get("content")
            # "timestamp_ms" is in milliseconds; fromtimestamp() wants seconds.
            message_time = datetime.datetime.fromtimestamp(message["timestamp_ms"] / 1000, local_timezone).astimezone(pytz.utc)
            author_name = fix_participant(message.get("sender_name"), thread_id, UNKNOWN_USER_MAP)
            if author_name not in participants:
                # Because not listing people who left the conversation is so helpful!
                participants.append(author_name)
            message_list.append(Message(source=MESSAGE_TYPE, thread_id=thread_id, author=author_name, date_time=message_time, text=message_text, number=n))
        thread_list.append(Thread(_id=thread_id, owner=MY_FACEBOOK_NAME, people=participants, messages=message_list))

chat = Chat(owner=MY_FACEBOOK_NAME, threads=thread_list)
print("Created 'chat' object.")

# Do some stuff with the "chat" object:
print(chat)
print(facebook_analysis_demo.top_n_people(chat, 15))
facebook_analysis_demo.messages_date_graph(chat)
Thank you!
Dangit - stops at line 81 - Syntax error: invalid syntax
def __init__(self, *, source, thread_id, author, date_time, text, number, attachments=None):
This is now Python 3, unlike the repository code that was Python 2. You'll need Python 3.6 or later to run it; Anaconda ought to allow you to create a virtual environment, or you could just move to Python 3 entirely since Python 2 is nearing its end of life.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You'll want to change Lines 36-38 of
facebook_parser_demo.py
to be your name, your timezone name and the name of your Facebook JSON export zipfile. There may be some odd timezone issues: this is a combination of old and new code, and they both did slightly different things to get everything into UTC for storing. The code tries to cope with Facebook removing the names of people who have deactivated or deleted their accounts, and uses a dictionary (
UNKNOWN_USER_MAP
) to go from thread ID to an actual name. The code will print suggestions of more threads with unknown people in; you can add their IDs to this dictionary to replace the Facebook User
with a person's name. Be aware that Facebook group all unknown people together with the same name, so threads with multiple participants may cause issues . . . To get a CSV export, use
chat.to_csv("export.csv")
(just replace everything after Line 625 of facebook_parser_demo.py
with that one line). Facebook keep changing their export format, and I can't offer much support if it should change again. This code works on the format used in August and September 2018. Some bits are better documented than others; the newest code is Lines 586-613 of the parser, which changes each time Facebook change something.