@altilunium
Last active October 14, 2025 22:32
import os
import json
import praw
import networkx as nx
import pandas as pd
from urllib.parse import urlparse
from dash import Dash, html, dcc, dash_table
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
from collections import Counter
import nltk
nltk.download('stopwords')
# Shared mapping of (user_a, user_b) pairs to exchange counts; filled in by
# analyze_reddit_pair_interactions() and read by build_graph_from_data() and the
# Dash callback below.
pair_scores = {}
# ================= CONFIGURATION ==================
# https://www.reddit.com/prefs/apps/
CLIENT_ID = "ENTER HERE"
CLIENT_SECRET = "ENTER HERE"
USER_AGENT = "ENTER HERE"
CACHE_DIR = "reddit_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
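# Each analyzed thread is cached as reddit_cache/<submission_id>.json so repeated
# runs reuse local data instead of re-hitting the Reddit API (see
# fetch_or_load_submission below).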
reddit = praw.Reddit(
client_id=CLIENT_ID,
client_secret=CLIENT_SECRET,
user_agent=USER_AGENT
)
# ================= UTILITIES ==================
def extract_submission_id(url):
path_parts = urlparse(url).path.strip("/").split("/")
try:
idx = path_parts.index("comments") + 1
return path_parts[idx]
except (ValueError, IndexError):
raise ValueError(f"Invalid Reddit URL: {url}")
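# Example: extract_submission_id("https://www.reddit.com/r/indonesia/comments/1o2maf5/10_october_2025_daily_chat_thread/")
# returns "1o2maf5".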
def fetch_or_load_submission(url):
"""Fetch thread comments from Reddit or load from cache"""
submission_id = extract_submission_id(url)
cache_path = os.path.join(CACHE_DIR, f"{submission_id}.json")
if os.path.exists(cache_path):
print(f"Loading cached thread: {submission_id}")
print(cache_path)
with open(cache_path, "r", encoding="utf-8") as f:
return json.load(f)
submission = reddit.submission(id=submission_id)
print(f"Downloading: {submission.title}")
submission.comments.replace_more(limit=None)
data = []
for comment in submission.comments.list():
if comment.author:
data.append({
"id": comment.id,
"author": str(comment.author),
"parent_id": comment.parent_id,
"body": comment.body
})
with open(cache_path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
return data
def analyze_word_frequency(data, n=1000):
"""
Analyzes all comment bodies for word frequency.
Returns a DataFrame of the top 'n' words.
"""
import re
from nltk.corpus import stopwords
# NOTE: the NLTK 'stopwords' corpus is downloaded once at module import
# (see nltk.download('stopwords') at the top of this file).
# Aggregate all text
all_text = " ".join([c.get("body", "") for c in data if c.get("body")])
# Basic text cleaning: lowercase and remove non-alphanumeric characters
text = all_text.lower()
text = re.sub(r'[^a-z0-9\s]', '', text)
# Tokenize and remove stop words
words = text.split()
# Common English and Indonesian stop words (adjust as needed)
stop_words = set(stopwords.words('english'))
# Minimal Indonesian stop words as NLTK might not be configured for it
# You might want to use a more complete list (e.g., from sastrawi) for r/indonesia
indonesian_stopwords = {"yang", "dan", "di", "ke", "dari", "ini", "itu", "atau", "untuk", "dengan", "adalah", "tidak", "ya", "saja", "udah", "lagi", "pun"}
stop_words.update(indonesian_stopwords)
filtered_words = [word for word in words if word not in stop_words and len(word) > 1 and not word.isdigit()]
word_counts = Counter(filtered_words)
# Convert to DataFrame
df_words = pd.DataFrame(word_counts.most_common(n), columns=["Word", "Frequency"])
return df_words
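# Hypothetical usage sketch: analyze_word_frequency([{"body": "foo bar foo"}])
# returns a DataFrame with "foo" -> 2 and "bar" -> 1.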
def analyze_reddit_pair_interactions(data):
"""
Analyze comment threads to find which user pairs interacted the most deeply.
A pair (A, B) is counted when they exchange alternating replies (A→B→A→B...).
"""
# Build parent→children mapping
global pair_scores
children = {}
by_id = {}
for c in data:
cid = c["id"]
by_id[cid] = c
parent = c["parent_id"].split("_")[1] if c["parent_id"].startswith("t1_") else None
if parent:
children.setdefault(parent, []).append(c)
pair_depths = Counter()
def traverse(node_id, last_author=None, current_pair=None):
node = by_id.get(node_id)
if not node:
return
author = node["author"]
if current_pair and last_author and author != last_author:
normalized_pair = tuple(sorted(current_pair))
pair_depths[normalized_pair] += 1
next_pair = (author, last_author)
else:
next_pair = (author, last_author) if last_author else None
for child in children.get(node_id, []):
traverse(child["id"], last_author=author, current_pair=next_pair)
# Start traversal from top-level comments
for c in data:
if c["parent_id"].startswith("t3_"): # top-level
traverse(c["id"], None, None)
pair_scores = {tuple(sorted((a, b))): count for (a, b), count in pair_depths.items()}
# Convert to DataFrame
df_pairs = pd.DataFrame([
{"User A": a, "User B": b, "Interactions": count}
for (a, b), count in pair_depths.items()
])
if df_pairs.empty:
return pd.DataFrame(columns=["User A", "User B", "Interactions"])
return df_pairs.sort_values(by="Interactions", ascending=False).reset_index(drop=True)
def build_graph_from_data(data):
global pair_scores
G = nx.DiGraph()
id_to_author = {c["id"]: c["author"] for c in data}
for c in data:
if c["parent_id"].startswith("t1_"):
parent_id = c["parent_id"].split("_")[1]
if parent_id in id_to_author:
parent_author = id_to_author[parent_id]
child_author = c["author"]
if parent_author != child_author:
weight = pair_scores.get(tuple(sorted((child_author, parent_author))), 1)
G.add_edge(child_author, parent_author, weight=weight)
return G
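# Edge direction is replier -> user being replied to (child author -> parent author).
# Edge weights reuse the exchange scores from analyze_reddit_pair_interactions(),
# falling back to 1 for pairs without a recorded back-and-forth.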
def analyze_graph(G):
in_deg = dict(G.in_degree())
out_deg = dict(G.out_degree())
df = pd.DataFrame({
"user": list(G.nodes),
"popularity": [in_deg.get(u, 0) for u in G.nodes],
"interactivity": [out_deg.get(u, 0) for u in G.nodes],
})
df["total_degree"] = df["popularity"] + df["interactivity"]
df = df.sort_values("total_degree", ascending=False).reset_index(drop=True)
return df
# ===== Alternative pair analysis based directly on graph edges (currently unused) =====
def analyze_user_pairs(G):
# Convert directed edges into undirected pairs for counting mutual interactions
edges = [tuple(sorted((u, v))) for u, v in G.edges()]
pair_counts = Counter(edges)
df_pairs = pd.DataFrame(pair_counts.items(), columns=["User Pair", "Interactions"])
df_pairs[["User A", "User B"]] = pd.DataFrame(df_pairs["User Pair"].tolist(), index=df_pairs.index)
df_pairs = df_pairs[["User A", "User B", "Interactions"]]
df_pairs = df_pairs.sort_values(by="Interactions", ascending=False).reset_index(drop=True)
return df_pairs
def build_interactive_plot(G):
pos = nx.spring_layout(G, k=0.5, iterations=50, dim=2, seed=42, weight='weight')
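# spring_layout treats larger edge weights as stronger attraction, so pairs with
# more recorded exchanges are drawn closer together; seed=42 keeps the layout
# reproducible between runs.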
edge_x, edge_y = [], []
for edge in G.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x += [x0, x1, None]
edge_y += [y0, y1, None]
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(width=0.5, color='#888'),
hoverinfo='none',
mode='lines'
)
node_x, node_y, text = [], [], []
for node in G.nodes():
x, y = pos[node]
node_x.append(x)
node_y.append(y)
text.append(node)
node_trace = go.Scatter(
x=node_x, y=node_y,
mode='markers',
text=text,
textposition='top center',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='YlGnBu',
reversescale=True,
color=[len(list(G.neighbors(n))) for n in G.nodes()],
size=10,
colorbar=dict(
thickness=10,
title=dict(text='Connections', side='right'),
xanchor='left'
),
line_width=1
)
)
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title="Reddit User Interaction Graph",
title_x=0.5,
showlegend=False,
hovermode='closest',
margin=dict(b=0, l=0, r=0, t=40),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
))
return fig
# ================= MAIN APP ==================
def main(urls):
all_data = []
for url in urls:
data = fetch_or_load_submission(url)
all_data.extend(data)
unique_users = len(set(c["author"] for c in all_data))
df_word_freq = analyze_word_frequency(all_data, n=1000)
print(df_word_freq)
df_pairs = analyze_reddit_pair_interactions(all_data)
total_interactions = df_pairs['Interactions'].sum() if not df_pairs.empty else 0
G = build_graph_from_data(all_data)
df = analyze_graph(G)
#df_pairs = analyze_user_pairs(G) # new analysis
all_data_global = all_data
top_pairs = df_pairs.head(2)
fig = build_interactive_plot(G)
app = Dash(__name__)
app.layout = html.Div([
html.H1("Reddit User Interaction Analyzer", style={'textAlign': 'center'}),
html.H3(f"Total Unique Users Analyzed: {unique_users}", style={'textAlign': 'center', 'color': '#007BFF'}),
html.H3(f"Total Pair Interactions (A↔B Exchanges): {total_interactions}", style={'textAlign': 'center', 'color': '#28a745'}),
html.Div([
html.H3("Reddit User Interaction Graph", style={'textAlign': 'center'}),
html.Div([
# Left side — Graph
html.Div([
dcc.Graph(id="interaction-graph", figure=fig, style={
'height': '700px',
'width': '800px',
'border': '1px solid #ccc',
'borderRadius': '8px',
'padding': '5px'
}),
], style={'flex': '2', 'marginRight': '15px'}),
# Right side — Selected User Info
html.Div([
html.H3("Selected User Connections", style={'textAlign': 'center'}),
html.Div(id="selected-user-info", style={
'fontWeight': 'bold',
'marginBottom': '10px',
'textAlign': 'center'
}),
dash_table.DataTable(
id="connections-table",
columns=[
{"name": "Direction", "id": "direction"},
{"name": "Connected User", "id": "connected_user"},
{"name": "Interactions", "id": "interactions"}
],
page_size=300,
style_table={'overflowY': 'auto', 'height': '600px'},
style_cell={'textAlign': 'left', 'padding': '6px'},
style_header={'fontWeight': 'bold', 'backgroundColor': '#f0f0f0'},
)
], style={
'flex': '1',
'border': '1px solid #ddd',
'borderRadius': '8px',
'padding': '10px',
'backgroundColor': '#fafafa',
'boxShadow': '0 0 6px rgba(0,0,0,0.1)'
}),
], style={
'display': 'flex',
'flexDirection': 'row',
'justifyContent': 'space-between',
'alignItems': 'flex-start'
})
], style={'marginBottom': '40px'}),
html.H2("User Interaction Summary", style={'textAlign': 'center'}),
dash_table.DataTable(
id="main-table",
columns=[{"name": c, "id": c} for c in df.columns],
data=df.to_dict("records"),
page_size=100,
sort_action="native",
style_table={'overflowX': 'auto'},
style_cell={'textAlign': 'left', 'padding': '6px'}
),
html.H2("Most Interactive User Pairs (from real comment threads)",
style={'textAlign': 'center', 'marginTop': '40px'}),
html.Div(
f"Top two pairs: "
f"{df_pairs.iloc[0]['User A']} ↔ {df_pairs.iloc[0]['User B']} "
f"({df_pairs.iloc[0]['Interactions']} exchanges) and "
f"{df_pairs.iloc[1]['User A']} ↔ {df_pairs.iloc[1]['User B']} "
f"({df_pairs.iloc[1]['Interactions']} exchanges)."
if len(df_pairs) >= 2 else "No significant user pairs found.",
style={'textAlign': 'center', 'fontWeight': 'bold', 'marginBottom': '10px'}
),
dash_table.DataTable(
id="pair-thread-table",
columns=[{"name": c, "id": c} for c in df_pairs.columns],
data=df_pairs.to_dict("records"),
page_size=500,
sort_action="native",
style_table={'overflowX': 'auto'},
style_cell={'textAlign': 'left', 'padding': '6px'},
style_header={'fontWeight': 'bold', 'backgroundColor': '#f0f0f0'}
),
html.H2("Top 1000 Word Frequency List (Excluding Stop Words)",
style={'textAlign': 'center', 'marginTop': '40px'}),
dash_table.DataTable(
id="word-freq-table",
columns=[{"name": c, "id": c} for c in df_word_freq.columns],
data=df_word_freq.to_dict("records"),
page_size=1000,
style_table={'overflowX': 'auto', 'marginBottom': '40px'},
style_cell={'textAlign': 'left', 'padding': '6px'},
style_header={'fontWeight': 'bold', 'backgroundColor': '#f0f0f0'}
)
])
@app.callback(
[Output("selected-user-info", "children"),
Output("connections-table", "data")],
[Input("interaction-graph", "clickData")],
[State("interaction-graph", "figure")]
)
def display_node_connections(clickData, figure):
if not clickData or "points" not in clickData:
return "Click a node to view connections", []
clicked_user = clickData["points"][0]["text"]
if clicked_user not in G.nodes:
return f"{clicked_user} not found in graph.", []
# Count interactions
in_counts = {}
out_counts = {}
for u, v in G.edges():
if v == clicked_user: # incoming edge (user replied to clicked_user)
in_counts[u] = in_counts.get(u, 0) + 1
elif u == clicked_user: # outgoing edge (clicked_user replied to user)
out_counts[v] = out_counts.get(v, 0) + 1
data = []
for user, count in sorted(in_counts.items(), key=lambda x: x[1], reverse=True):
data.append({
"direction": "Incoming (replied by)",
"connected_user": user,
"interactions": pair_scores.get(tuple(sorted((user, clicked_user))), 0)
})
for user, count in sorted(out_counts.items(), key=lambda x: x[1], reverse=True):
data.append({
"direction": "Outgoing (replied to)",
"connected_user": user,
"interactions": pair_scores.get(tuple(sorted((user, clicked_user))), 0)
})
data = sorted(data, key=lambda x: x["interactions"], reverse=True)
return f"Connections for user: {clicked_user}", data
# Launch the Dash development server (recent Dash releases use app.run(debug=True)
# instead of the deprecated run_server)
app.run_server(debug=True)
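# Dash serves the app at http://127.0.0.1:8050 by default; open it in a browser and
# click a node in the graph to populate the connections table.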
if __name__ == "__main__":
urls = [
"https://www.reddit.com/r/indonesia/comments/1o2maf5/10_october_2025_daily_chat_thread/"
]
main(urls)