Skip to content

Instantly share code, notes, and snippets.

@reallytiredofclowns
Created April 1, 2024 16:28
Show Gist options
  • Save reallytiredofclowns/b26a546ffbd0ca9e340ce1f352b9b0f3 to your computer and use it in GitHub Desktop.
Save reallytiredofclowns/b26a546ffbd0ca9e340ce1f352b9b0f3 to your computer and use it in GitHub Desktop.
Discuit user domination stats from data dump
# disc domination
import pandas
def topContributorPerDisc(items):
    """Return, for every disc, the user who contributed the most items.

    Parameters
    ----------
    items : pandas.DataFrame
        Rows of the dump; the columns ``disc``, ``user`` and ``deleted``
        are read, any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per disc carrying ``disc``, ``discItems`` (non-deleted items
        counted for the disc), ``user`` / ``userItems`` (the user with the
        highest item count in that disc and that count) and ``userPCT``
        (``userItems`` as a percentage of ``discItems``).
    """
    # Only rows not flagged as deleted take part in the statistics.
    live = items.query("deleted != True")[["disc", "user"]]
    # Number of items each user contributed to each disc.
    discUser = (live
        .groupby(["disc", "user"], as_index = False)
        .size()
        .rename(columns = {"size": "userItems"}))
    # Per-disc totals, obtained by summing the per-user counts.
    disc = (discUser[["disc", "userItems"]]
        .groupby("disc")
        .sum()
        .rename(columns = {"userItems": "discItems"}))
    # Sort so that the biggest contributor of each disc comes first ...
    dominators = (disc
        .merge(discUser, on = "disc")
        .sort_values(["disc", "userItems"], ascending = [True, False]))
    dominators["userPCT"] = 100 * dominators["userItems"] / dominators["discItems"]
    # ... and keep only that first row per disc.
    return dominators.drop_duplicates(["disc"], keep = "first")


if __name__ == "__main__":
    fullDF = pandas.read_csv("d:/docs/download/discuitdump/concat.csv")
    dominators = topContributorPerDisc(fullDF)
    # Discs with at least 100 items where a single user wrote half or more.
    print(dominators
        .query("(userPCT >= 50) & (discItems >= 100)")
        .sort_values("discItems", ascending = False)
        .to_markdown(index = False))
    # Top contributor of every disc with at least 1000 items.
    print(dominators
        .query("discItems >= 1000")
        .sort_values("discItems", ascending = False)
        .to_markdown(index = False))
##############################################################
# diversity of interests (# discs activity per user)
def countDiscsPerUser(items, postsOnly = False):
    """Count the distinct discs each user has been active in, and rank users.

    Parameters
    ----------
    items : pandas.DataFrame
        Rows of the dump; the columns ``disc``, ``user``, ``postType`` and
        ``deleted`` are read.
    postsOnly : bool, default False
        When true, rows whose ``postType`` equals ``'c'`` are left out
        before counting.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``size`` (number of distinct discs) and ``rank``
        (1 = most discs, ties share the lowest rank), sorted by ``rank``.
    """
    live = items.query("deleted != True")[["disc", "user", "postType"]]
    if postsOnly:
        live = live.query("postType != 'c'")
    # De-duplicate on (disc, user) only.  Keeping postType in the frame here
    # would count one disc several times for a user who contributed more
    # than one kind of item to it.
    counts = (live[["disc", "user"]]
        .drop_duplicates()
        .groupby("user", as_index = False)
        .size())
    counts["rank"] = counts["size"].rank(method = "min", ascending = False)
    return counts.sort_values("rank")


def summarizeDiscCounts(userDiscCount):
    """Tabulate how many users are active in exactly 1, 2, 3, ... discs.

    Parameters
    ----------
    userDiscCount : pandas.DataFrame
        Per-user counts holding a ``size`` column, as returned by
        ``countDiscsPerUser``.

    Returns
    -------
    pandas.DataFrame
        Columns ``numDiscs``, ``users``, ``cumulativeUsers``, ``percent``
        and ``cumulativePercent``, ordered by ``numDiscs`` ascending.
    """
    userDiscs = (userDiscCount
        .rename(columns = {"size": "numDiscs"})
        .sort_values("numDiscs")
        .groupby("numDiscs", as_index = False)
        .size()
        .rename(columns = {"size": "users"}))
    userDiscs["cumulativeUsers"] = userDiscs["users"].cumsum()
    # Same value as the last cumulative count, but also defined (0) when
    # the table is empty.
    totalUsers = userDiscs["users"].sum()
    userDiscs["percent"] = 100 * userDiscs["users"] / totalUsers
    userDiscs["cumulativePercent"] = userDiscs["percent"].cumsum()
    return userDiscs


if __name__ == "__main__":
    fullDF = pandas.read_csv("d:/docs/download/discuitdump/concat.csv")
    # Ten most wide-ranging users, counting every kind of item.
    userDiscCount = countDiscsPerUser(fullDF)
    print(userDiscCount.query("rank <= 10").to_markdown(index = False))
    # The same ranking with postType 'c' rows excluded.
    userDiscCount_Posts = countDiscsPerUser(fullDF, postsOnly = True)
    print(userDiscCount_Posts.query("rank <= 10").to_markdown(index = False))
    # Distribution of users over the number of discs they are active in.
    userDiscs = summarizeDiscCounts(userDiscCount)
    print(userDiscs.iloc[:25].to_markdown(index = False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment