Skip to content

Instantly share code, notes, and snippets.

@mistermichaelll
Last active August 24, 2023 16:46
Show Gist options
  • Save mistermichaelll/3afcc33188f29576915243a5a2ea2a72 to your computer and use it in GitHub Desktop.
Save mistermichaelll/3afcc33188f29576915243a5a2ea2a72 to your computer and use it in GitHub Desktop.
# This script is a first pass at creating a clean Hinge dataset that you can
# visualize using Python or R.
#
# You can request your Hinge data from the app, and the file in question that we're
# using here is "matches.json".
#
# Author: Michael Johnson
# Last Updated: June 2, 2021
#
# ==================================
# Project Setup
# ==================================
# libraries utilized
import os
import json
import pandas as pd
import datetime as dt
# Open the Hinge matches JSON file.
# NOTE(review): `cd` looks like a placeholder path -- point it at the folder
# that holds your own Hinge export before running.
cd = "users/name/Hinge Export Folder/"
os.chdir(cd)
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding (which can fail on
# non-ASCII content on some systems).
with open("matches.json", encoding="utf-8") as m:
    matches = json.load(m)
# Flatten the parsed JSON into a dataframe using pandas.
data = pd.json_normalize(matches)
# ==================================
# Gathering the Data
# ==================================
# We are going to pull apart each type of "like" you can receive on the app from the
# messy JSON file that Hinge sends us.
# Basically, the logic I'm using is as follows:
#
# - If "like" is not null, and "match" is not null, then this is where we sent a like and got a match.
# - If "like" is not null, and "match" is null, then this is where we sent a like and got no match.
# - If "match" is not null, and "like" is null, then this is where we received a like and got a match.
# - If "match" is null, and "like" is null, then this is where we received a like but did not match.
# Boolean masks marking which records have a populated "like" / "match" field.
like_present = data["like"].notna()
match_present = data["match"].notna()

# Each group is a filtered copy of `data` with a fresh 0..n-1 index
# (reset_index keeps the original row labels in an "index" column).
# outgoing likes, match
outgoing_matches = data.loc[like_present & match_present].reset_index()
# outgoing likes, no match
outgoing_no_matches = data.loc[like_present & ~match_present].reset_index()
# incoming likes, match
incoming_match = data.loc[match_present & ~like_present].reset_index()
# incoming likes, no match
incoming_no_match = data.loc[~like_present & ~match_present].reset_index()
# Here's a quick sanity check for if we're categorizing things correctly.
# The sum of the length of the 4 categories should be equal to the sum of
# the length of the original data.
# ------------------------------------------------------------------------
# num_items_categorized = len(outgoing_matches) + len(outgoing_no_matches) + len(incoming_match) + len(incoming_no_match)
# num_items_og_data = len(data)
#
# print("Num. Categorized Items: ", num_items_categorized)
# print("Num. Items OG Data: ", num_items_og_data)
#
# if num_items_categorized == num_items_og_data:
# print("The number of items is the same.")
# else:
# print("The number of items is not the same. Check your work")
# ==================================
# Quick Stats
# -----------
# This prints out some quick stats
# from the data we're working with.
# ==================================
def _match_percent(matched, total):
    """Return `matched` out of `total` as a rounded whole-number percentage.

    Returns 0 when `total` is 0 so that an export with no likes in a given
    direction prints a value instead of raising ZeroDivisionError.
    """
    if total == 0:
        return 0
    return round(matched / total * 100)


# Totals per direction: every like either led to a match or did not.
likes_sent = len(outgoing_matches) + len(outgoing_no_matches)
likes_received = len(incoming_match) + len(incoming_no_match)

print("Total Likes Sent:", likes_sent)
print("Total Matches from Likes Sent:", len(outgoing_matches))
print("Match % from Likes Sent:", _match_percent(len(outgoing_matches), likes_sent), "%")
print("Total Likes Received:", likes_received)
print("Total Matches from Likes Received:", len(incoming_match))
print("Match % from Likes Received:", _match_percent(len(incoming_match), likes_received), "%")
# ==================================
# Creating a Clean Dataframe
# --------------------------
# Really, all I want is a clean
# dataframe that has the timestamp,
# the date without the time, and
# the type of like we're dealing with.
# ==================================
# Create lists of timestamps from our data
def _first_timestamps(frame, column):
    """Return the "timestamp" of the first entry in `column` for each row of `frame`.

    Each cell of `column` is indexed as `cell[0]["timestamp"]`, i.e. it is
    expected to be a non-empty sequence whose first element is a mapping with
    a "timestamp" key. An empty frame yields an empty list without touching
    `column` at all, so a category with no rows never fails on a column that
    is absent from the export.
    """
    if frame.empty:
        return []
    return [events[0]["timestamp"] for events in frame[column]]


# sent likes, match
# -----------------
sent_like_timestamps = _first_timestamps(outgoing_matches, "like")
# sent like, no match
# --------------------
sent_like_no_match_timestamps = _first_timestamps(outgoing_no_matches, "like")
# received like, no match
# NOTE(review): these rows are timestamped from the "block" column; this
# assumes every record with neither a like nor a match carries a block
# entry -- confirm against your own export.
rec_no_match_timestamps = _first_timestamps(incoming_no_match, "block")
# received like, match
rec_match_timestamps = _first_timestamps(incoming_match, "match")
def _labelled_frame(timestamps, like_type):
    """Build a two-column frame: the given timestamps plus a constant "Type" label."""
    return pd.DataFrame({"Timestamp": timestamps, "Type": like_type})


# One small dataframe per category, pairing its timestamps with the "like type".
sent_match = _labelled_frame(sent_like_timestamps, "Sent Like, Match")
sent_no_match = _labelled_frame(sent_like_no_match_timestamps, "Sent Like, No Match")
rec_match = _labelled_frame(rec_match_timestamps, "Received Like, Match")
rec_no_match = _labelled_frame(rec_no_match_timestamps, "Received Like, No Match")
dfs = [sent_match, sent_no_match, rec_match, rec_no_match]

# Stack the four frames into the clean dataset, discarding the per-frame row
# labels in favour of a single fresh 0..n-1 index.
clean_data = pd.concat(dfs).reset_index(drop=True)

# Parse the raw timestamps once, then derive the date without the time.
parsed_timestamps = pd.to_datetime(clean_data["Timestamp"])
clean_data["Timestamp"] = parsed_timestamps
clean_data["Date"] = parsed_timestamps.dt.date

# Export this data so we can use it elsewhere.
clean_data.to_csv("~/Downloads/Hinge Export/clean_data.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment