Created
March 29, 2024 15:04
-
-
Save reallytiredofclowns/c14d1e3d31eda37863dd410eae025065 to your computer and use it in GitHub Desktop.
Charting active Discuit daily, 2-daily, weekly users
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Chart daily, 2-day and 7-day active Discuit users from a CSV data dump.

The dump is expected to have one row per post/comment with at least a
``user`` column and an ISO-formatted ``createdAt`` timestamp column.
"""
import pandas


def buildUserDayMatrix(fullDF, lastDate="20240229"):
    """Return a date x user matrix flagging the days each user was active.

    Parameters:
        fullDF: frame with a ``user`` column and a ``date`` column holding
            ``YYYYMMDD`` strings (one row per post/comment).
        lastDate: last ``YYYYMMDD`` date (inclusive) to keep.

    Returns:
        DataFrame indexed by ``YYYYMMDD`` date string with one column per
        user; a cell is 1 when the user posted/commented that day, else 0.
        Every calendar day between the first and last observed date is
        present, so row-based rolling windows line up with calendar days.
    """
    # unique user + date; rows without a usable date cannot be placed on the axis
    userDays = fullDF[["user", "date"]].dropna(subset=["date"]).drop_duplicates()
    userDays = userDays[userDays["date"] <= lastDate].sort_values(["user", "date"])
    if userDays.empty:
        return pandas.DataFrame(index=pandas.Index([], name="date"))

    userDays = userDays.assign(active=1)
    matrix = userDays.pivot(index="date", columns="user", values="active").fillna(0)

    # Ensure every date is represented: rolling(n) below counts rows, not
    # calendar days, so a missing day would silently stretch the window.
    allDays = pandas.date_range(start=matrix.index.min(), end=matrix.index.max())
    allDays = allDays.strftime("%Y%m%d").rename("date")
    return matrix.reindex(allDays, fill_value=0)


def activeWithin(userDays, window):
    """Return a 0/1 frame: was each user active in the last ``window`` days?

    The window covers the current day and the ``window - 1`` preceding days.
    """
    if userDays.empty:
        return userDays.astype(int)
    # min_periods=1 keeps the first window-1 days usable; with the default
    # (min_periods=window) they are NaN and would be reported as 0 users.
    rolled = userDays.rolling(window, min_periods=1).sum()
    # convert the rolling total to a 0/1 indicator variable
    return (rolled > 0).astype(int)


def countActiveUsers(userDays, windows=(1, 2, 7)):
    """Return per-date active-user counts, one ``userCount<N>`` column per window."""
    counts = {
        f"userCount{window}": activeWithin(userDays, window).sum(axis=1)
        for window in windows
    }
    return pandas.DataFrame(counts, index=userDays.index)


if __name__ == "__main__":
    fullDF = pandas.read_csv("c:/users/bobloblaw/downloads/datadump.csv")
    fullDF["date"] = fullDF["createdAt"].str.slice(0, 10).str.replace("-", "")

    # goal is a chart with time on the horizontal axis and, on the vertical
    # axis, the number of users with daily activity (date had comment/post),
    # activity in the last two days (current or previous day), or activity
    # in the last week
    userDays = buildUserDayMatrix(fullDF)

    activity1 = activeWithin(userDays, 1)
    activity2 = activeWithin(userDays, 2)
    activity7 = activeWithin(userDays, 7)

    # compute each row total before attaching it, so the count column is
    # never itself included in the sum
    newCol = activity1.sum(axis=1)
    activity1["userCount1"] = newCol
    newCol = activity2.sum(axis=1)
    activity2["userCount2"] = newCol
    newCol = activity7.sum(axis=1)
    activity7["userCount7"] = newCol

    # the index is a sorted, gap-free run of dates by construction
    mergedDFs = activity1[["userCount1"]].join(
        activity2[["userCount2"]].join(activity7[["userCount7"]])
    )
    mergedDFs.plot(figsize=(20, 10))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment