Skip to content

Instantly share code, notes, and snippets.

@jz-feng
Created April 28, 2021 08:14
Show Gist options
  • Select an option

  • Save jz-feng/06850aee2213e871d66a413dc3c6f2b0 to your computer and use it in GitHub Desktop.

Select an option

Save jz-feng/06850aee2213e871d66a413dc3c6f2b0 to your computer and use it in GitHub Desktop.
from datetime import datetime
from datetime import timedelta
from itertools import islice
from os import walk
import csv
import json
import sys
# Usage:
#   --histo <dir>  print a simple message-frequency count for one chat (point it at that chat's directory)
#   --ts           write time-series data to output.csv, i.e. a 2D table of [date, person]
#   --ts --dm      same as --ts, but only count direct messages (excluding group chats)
# Root directory that time_series() walks to find conversation files.
DIRPATH = "messages/inbox/"
def readfile(dir_path, filename):
    """Read one file and return its entire contents as a string.

    Args:
        dir_path: Directory prefix. It is concatenated directly with
            ``filename``, so it must already end with a path separator.
        filename: Name of the file inside ``dir_path``.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    full_path = dir_path + filename
    print("reading ", full_path)
    # The callers feed this text to json.loads, and JSON is UTF-8 by spec.
    # Decode explicitly instead of relying on the platform's locale default,
    # which is not UTF-8 everywhere (e.g. cp1252 on many Windows setups).
    with open(full_path, encoding="utf-8") as f:
        return f.read()
def dump(data):
    """Pretty-print ``data`` to stdout as JSON (sorted keys, 2-space indent)."""
    rendered = json.dumps(data, indent=2, sort_keys=True)
    print(rendered)
def time_series(dms=False):
    """Write monthly per-sender message counts to ``output.csv``.

    Walks ``DIRPATH``, reads every file found outside the media
    sub-directories, and counts messages per calendar month for the 15
    senders with the most messages overall (as ranked by ``histo``).

    The CSV starts with a header row ``senders,<name>,...`` followed by one
    row per month, in chronological order: the ISO date of the first day of
    the month, then one count per sender column.

    Args:
        dms: When true, skip every conversation that has more than two
            participants, both for the ranking and for the counts.
    """
    # Directories holding attachments rather than conversation files.
    dir_suffixes = ("audio", "files", "gifs", "photos", "videos", "thumbnails")
    flattened_data = [
        readfile(dirpath + "/", filename)
        for dirpath, _, filenames in walk(DIRPATH)
        if not dirpath.endswith(dir_suffixes)
        for filename in filenames
    ]

    # only count msgs from top N all-time senders
    most_freq = list(islice(histo(flattened_data, dms), 15))
    most_freq_cols = {sender: col for col, sender in enumerate(most_freq)}

    series = {}
    for json_chunk in flattened_data:
        chunk = json.loads(json_chunk)
        if dms and len(chunk["participants"]) > 2:
            continue
        for msg in chunk["messages"]:
            sender = msg["sender_name"]
            # granularity = monthly (timestamp is interpreted in local time)
            moment = datetime.fromtimestamp(msg["timestamp_ms"] / 1000)
            date = moment.date().replace(day=1).isoformat()
            if date not in series:
                series[date] = [0] * len(most_freq)
            # This must be an independent check, not an `elif`: otherwise the
            # message that creates a month's row is itself never counted.
            col = most_freq_cols.get(sender)
            if col is not None:
                series[date][col] += 1

    # dump(series)
    # newline="" lets the csv module control line endings itself; without it
    # some platforms emit an extra blank line after every row.
    with open("output.csv", "w", newline="") as outfile:
        w = csv.writer(
            outfile, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL
        )
        w.writerow(["senders"] + most_freq)
        # ISO dates sort lexicographically, so this yields chronological rows
        # regardless of the order in which files and messages were visited.
        for date in sorted(series):
            w.writerow([date] + series[date])
def histo(data, dms=False):
    """Count messages per sender across a collection of JSON documents.

    Args:
        data: Iterable of JSON strings; each must decode to an object with a
            ``"messages"`` list whose items carry a ``"sender_name"``.
        dms: When true, documents whose ``"participants"`` list has more
            than two entries are ignored.

    Returns:
        A dict mapping sender name to message count, ordered from the most
        to the least frequent sender (ties keep first-seen order).
    """
    counts = {}
    for raw in data:
        conversation = json.loads(raw)
        # "participants" is only consulted when filtering is requested.
        if dms and len(conversation["participants"]) > 2:
            continue
        for message in conversation["messages"]:
            name = message["sender_name"]
            counts[name] = counts.get(name, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
def main():
    """Command-line entry point.

    Supported invocations (arguments are read from ``sys.argv``):

    * ``--ts [--dm]``: run ``time_series``; ``--dm`` restricts it to
      conversations with at most two participants.
    * ``--histo <dir>``: read every file directly inside ``<dir>`` and print
      the per-sender message counts returned by ``histo``.

    Problems with the arguments are reported with a short message on stdout
    and the function returns without raising.
    """
    args = sys.argv[1:]
    if not args:
        print("command plz")
        return

    cmd = args[0]
    if cmd == "--ts":
        time_series(len(args) > 1 and args[1] == "--dm")
    elif cmd == "--histo":
        # An empty path argument is treated like a missing one.
        if len(args) < 2 or not args[1]:
            print("file name plz")
            return
        path = args[1]
        if not path.endswith("/"):
            path += "/"
        # os.walk silently yields nothing for a missing or unreadable
        # directory; supply a default so that case is reported instead of
        # surfacing as a bare StopIteration.
        top_level = next(walk(path), None)
        if top_level is None:
            print("no such directory:", path)
            return
        _, _, filenames = top_level
        data = [readfile(path, f) for f in filenames]
        freq = histo(data)
        print(freq)
    else:
        print("unknown command:", cmd)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment