Created
April 28, 2021 08:14
-
-
Save jz-feng/06850aee2213e871d66a413dc3c6f2b0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datetime import datetime | |
| from datetime import timedelta | |
| from itertools import islice | |
| from os import walk | |
| import csv | |
| import json | |
| import sys | |
| # --histo <dir>: print per-sender message counts for one chat (point it at that chat's directory) | |
| # --ts: write monthly time series data to output.csv, i.e. one row per month with one column per top sender | |
| # --ts --dm: same as --ts, but only count direct messages (excl. group chats) | |
| # Root directory that time_series() walks recursively for chat JSON files | |
| DIRPATH = "messages/inbox/" | |
def readfile(dir_path, filename):
    """Return the entire text content of the file at ``dir_path + filename``.

    The two arguments are concatenated as-is, so ``dir_path`` must already
    end with a path separator. The path being read is echoed to stdout as a
    progress indicator.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    full_path = dir_path + filename
    print("reading ", full_path)
    # The contents are later handed to json.loads; JSON is UTF-8 by spec, so
    # decode explicitly instead of relying on the platform's default encoding.
    with open(full_path, encoding="utf-8") as f:
        return f.read()
def dump(data):
    """Pretty-print ``data`` to stdout as JSON (keys sorted, 2-space indent).

    Debugging helper; ``data`` must be JSON-serializable.
    """
    rendered = json.dumps(data, indent=2, sort_keys=True)
    print(rendered)
def time_series(dms=False):
    """Write monthly per-sender message counts to ``output.csv``.

    Every file under ``DIRPATH`` is read (directories whose path ends in a
    media folder name are skipped) and parsed as a chat JSON document with a
    ``"messages"`` list whose items carry ``"sender_name"`` and
    ``"timestamp_ms"``. Only the 15 most frequent senders overall (as ranked
    by ``histo``) get a column; messages from anyone else still create the
    month's row but are not counted.

    Output layout: a header row ``senders, <name>, ...`` followed by one row
    per month (ISO date of the first day of that month, local time), in
    chronological order.

    Args:
        dms: when true, skip chats with more than two participants so that
            only direct messages are counted.
    """
    media_dirs = ("audio", "files", "gifs", "photos", "videos", "thumbnails")
    chunks = [
        readfile(dirpath + "/", filename)
        for dirpath, _, filenames in walk(DIRPATH)
        if not dirpath.endswith(media_dirs)  # endswith accepts a tuple
        for filename in filenames
    ]

    # only count msgs from top N all-time senders
    top_senders = list(islice(histo(chunks, dms), 15))
    # sender name -> column index; dict lookup replaces a per-message list scan
    column_of = {sender: i for i, sender in enumerate(top_senders)}

    series = {}
    for json_chunk in chunks:
        chunk = json.loads(json_chunk)
        if dms and len(chunk["participants"]) > 2:
            continue
        for msg in chunk["messages"]:
            # granularity = monthly: collapse each timestamp to the 1st of its month
            month = (
                datetime.fromtimestamp(msg["timestamp_ms"] / 1000)
                .date()
                .replace(day=1)
                .isoformat()
            )
            row = series.setdefault(month, [0] * len(top_senders))
            # Count the message even when it is the one that created the row;
            # the previous if/elif silently dropped the first message of
            # every month.
            column = column_of.get(msg["sender_name"])
            if column is not None:
                row[column] += 1

    # newline="" is required by the csv module so it controls line endings
    # itself (otherwise Windows output gets blank lines between rows).
    with open("output.csv", "w", newline="") as outfile:
        w = csv.writer(outfile, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        w.writerow(["senders"] + top_senders)
        # ISO dates sort chronologically; the dict's insertion order merely
        # reflects the order in which files happened to be walked.
        for month in sorted(series):
            w.writerow([month] + series[month])
def histo(data, dms=False):
    """Count messages per sender across a collection of chat JSON documents.

    Args:
        data: iterable of JSON strings, each decoding to an object with a
            ``"messages"`` list whose items have a ``"sender_name"``.
        dms: when true, documents whose ``"participants"`` list has more
            than two entries are skipped.

    Returns:
        A dict mapping sender name to message count, ordered from most to
        least frequent; ties keep first-seen order.
    """
    counts = {}
    for raw in data:
        conversation = json.loads(raw)
        # "participants" is only consulted when filtering for direct messages
        if dms and len(conversation["participants"]) > 2:
            continue
        for message in conversation["messages"]:
            name = message["sender_name"]
            counts[name] = counts.get(name, 0) + 1

    # stable sort on negated count == descending by count, ties in seen order
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    return dict(ranked)
def main():
    """Command-line entry point.

    Usage:
        --ts [--dm]     run time_series(); with --dm only direct messages count
        --histo <dir>   print histo() of every file directly inside <dir>

    Problems with the arguments are reported on stdout and the function
    returns without raising.
    """
    args = sys.argv[1:]
    if not args:
        print("command plz")
        return

    cmd = args[0]
    if cmd == "--ts":
        time_series(len(args) > 1 and args[1] == "--dm")
    elif cmd == "--histo":
        # An empty string is treated like a missing argument: indexing it
        # used to raise IndexError, and appending "/" would point at the
        # filesystem root.
        if len(args) < 2 or not args[1]:
            print("file name plz")
            return
        path = args[1]
        if not path.endswith("/"):
            path += "/"
        # walk() yields nothing for a missing/unreadable directory, so give
        # next() a default instead of letting StopIteration escape.
        top_level = next(walk(path), None)
        if top_level is None:
            print("can't read directory", path)
            return
        _, _, filenames = top_level
        data = [readfile(path, f) for f in filenames]
        freq = histo(data)
        print(freq)
    else:
        # previously an unrecognized command was silently ignored
        print("unknown command", cmd)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment