Skip to content

Instantly share code, notes, and snippets.

@rothnic
Created April 8, 2026 20:11
Show Gist options
  • Select an option

  • Save rothnic/ecb981a1404b6b8ec8df3607bf87e15a to your computer and use it in GitHub Desktop.

Select an option

Save rothnic/ecb981a1404b6b8ec8df3607bf87e15a to your computer and use it in GitHub Desktop.
Reddit Content Processor - Extracts knowledge from saved posts for project research (not data archival)
#!/usr/bin/env python3
"""
Reddit Content Processor for Project Research

This script processes saved Reddit content to extract knowledge for
active projects - not to archive Reddit data, but to extract useful
references, URLs, and entities from content I've explicitly saved.

Processing Pipeline:
1. Fetch saved posts (content I've explicitly bookmarked)
2. Extract URLs from post content
3. Parse named entities (projects, tools, people mentioned)
4. Categorize by topic/project relevance
5. Add to project-specific knowledge graphs

This is similar to:
- Saving a research paper to Zotero and extracting citations
- Bookmarking a tutorial and noting the tools used
- Saving a discussion and extracting key recommendations

NOT a data archive - a research workflow tool.
"""
import praw
import json
import re
from datetime import datetime
from typing import List, Dict, Optional, Set
from dataclasses import dataclass
from urllib.parse import urlparse
@dataclass
class ProcessedContent:
    """Extracted knowledge from a Reddit post, not the post itself.

    The ``source_*`` fields exist for attribution only; the value of a record
    is in the extracted URLs, tools, projects and topics.
    """

    source_id: str  # Reddit post ID (for attribution only)
    source_url: str  # Permalink (for attribution only)

    # Extracted knowledge (the actual value)
    external_urls: List[str]  # URLs found in post content
    mentioned_projects: List[str]  # Project names mentioned
    mentioned_tools: List[str]  # Tools/software mentioned
    topics: List[str]  # Inferred topics/tags
    relevance_notes: str  # Why this is relevant to my projects

    # Metadata
    processed_at: str  # Timestamp string recorded when the record was built
    # Which of my projects this supports; None means "no project matched",
    # so it defaults to None instead of forcing callers to pass it.
    relevant_project: Optional[str] = None
class RedditContentProcessor:
    """
    Processes saved Reddit content to extract actionable knowledge.

    This is NOT an archive. We don't store Reddit posts.
    We extract URLs, references, and entities from posts I've saved,
    then integrate those into project knowledge bases.
    """

    # Projects I'm actively working on
    MY_PROJECTS = [
        "knowledge-management-system",
        "ai-agent-orchestration",
        "family-automation",
        "llm-evaluation",
        "devops-tooling"
    ]

    # Keywords that indicate relevance to my projects
    PROJECT_KEYWORDS = {
        "knowledge-management-system": [
            "knowledge graph", "semantic web", "ontology", "rdf", "sparql",
            "note taking", "zettelkasten", "obsidian", "personal wiki"
        ],
        "ai-agent-orchestration": [
            "langchain", "autogen", "crewai", "agent framework",
            "llm orchestration", "multi-agent", "agent workflow"
        ],
        "family-automation": [
            "home automation", "smart home", "calendar sync",
            "school tracker", "family organizer"
        ],
        "llm-evaluation": [
            "lm eval", "benchmark", "mmlu", "gsm8k", "human eval",
            "model evaluation", "llm metrics"
        ],
        "devops-tooling": [
            "kubernetes", "docker", "terraform", "ansible",
            "ci/cd", "github actions", "deployment"
        ]
    }

    # Common dev tools that extract_entities() looks for. The order here is
    # the order in which matches are reported.
    TOOL_KEYWORDS = [
        "docker", "kubernetes", "terraform", "ansible", "prometheus",
        "grafana", "nginx", "redis", "postgres", "mongodb",
        "react", "vue", "angular", "svelte", "nextjs",
        "python", "rust", "go", "typescript", "javascript"
    ]

    # Alternative spellings that count as a mention of the canonical tool.
    # Needed because tools are matched as whole tokens, not substrings.
    _TOOL_ALIASES = {
        "go": ("golang",),
        "postgres": ("postgresql",),
        "nextjs": ("next.js",),
        "react": ("reactjs",),
        "vue": ("vuejs",),
        "angular": ("angularjs",),
    }

    _URL_RE = re.compile(r'https?://[^\s<>"\']{3,}')
    # Host match is case-insensitive; the captured "user/repo" keeps its case.
    _GITHUB_RE = re.compile(
        r'github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)', re.IGNORECASE
    )
    # Simple heuristic: capitalized words in backticks or double quotes.
    _PROJECT_RE = re.compile(r'`([A-Z][a-zA-Z0-9_-]+)`|"([A-Z][a-zA-Z0-9_-]+)"')

    def __init__(self, client_id: str, client_secret: str, username: str, password: str):
        """Initialize with Reddit API credentials.

        Args:
            client_id: Reddit application client id.
            client_secret: Reddit application client secret.
            username: Reddit account whose saved items are processed.
            password: Password for that account.
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent="ContentProcessor/1.0 (Research Workflow Tool)",
            username=username,
            password=password
        )

    @staticmethod
    def _strip_trailing_punctuation(url: str) -> str:
        """Drop sentence punctuation that was captured at the end of a URL."""
        while url:
            last = url[-1]
            if last in '.,;:!?':
                url = url[:-1]
            elif last == ')' and url.count(')') > url.count('('):
                # Only an unbalanced ")" is punctuation (e.g. a markdown link
                # or a parenthesised aside); a balanced one is part of the
                # URL, as in ".../wiki/Rust_(programming_language)".
                url = url[:-1]
            else:
                break
        return url

    @staticmethod
    def _is_reddit_url(url: str) -> bool:
        """Return True when the URL's host is reddit.com or a subdomain of it."""
        try:
            host = (urlparse(url).hostname or "").lower()
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a bad IPv6 literal).
            return False
        return host == "reddit.com" or host.endswith(".reddit.com")

    def _mentions_tool(self, tool: str, lowered: str) -> bool:
        """Return True when `tool` (or an alias) appears as a whole token.

        The name must not be preceded by a letter or digit and must not be
        followed by a letter, so "go" does not match "google" and "rust" does
        not match "trust", while "python3" still counts as "python".
        """
        names = (tool,) + tuple(self._TOOL_ALIASES.get(tool, ()))
        alternation = "|".join(re.escape(name) for name in names)
        pattern = rf"(?<![a-z0-9])(?:{alternation})(?![a-z])"
        return re.search(pattern, lowered) is not None

    def extract_urls(self, text: str) -> List[str]:
        """Extract all URLs from text content.

        Args:
            text: Free-form text (typically a post body).

        Returns:
            The http/https URLs found, with trailing punctuation removed,
            de-duplicated, in order of first appearance.
        """
        if not text:
            return []
        cleaned = (
            self._strip_trailing_punctuation(url)
            for url in self._URL_RE.findall(text)
        )
        # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
        return list(dict.fromkeys(cleaned))

    def extract_entities(self, text: str, title: str) -> Dict[str, List[str]]:
        """Extract named entities (projects, tools) from content.

        Args:
            text: Post body (may be empty).
            title: Post title.

        Returns:
            A dict with three lists:
            "github_repos" - "user/repo" strings found after "github.com/",
            "tools" - entries of TOOL_KEYWORDS mentioned as whole tokens,
            "projects" - capitalized words wrapped in backticks or double
            quotes, de-duplicated in order of appearance.
        """
        raw = f"{title} {text}"
        lowered = raw.lower()

        # Search the original-case text so repo names keep their casing, and
        # drop a sentence-ending period the character class would capture.
        github_repos = list(dict.fromkeys(
            repo.rstrip('.') for repo in self._GITHUB_RE.findall(raw)
        ))

        mentioned_tools = [
            tool for tool in self.TOOL_KEYWORDS
            if self._mentions_tool(tool, lowered)
        ]

        # Look at title AND body: backticked names mostly appear in the body.
        # Exactly one of the two groups is non-empty for every match.
        mentioned_projects = list(dict.fromkeys(
            in_backticks or in_quotes
            for in_backticks, in_quotes in self._PROJECT_RE.findall(raw)
        ))

        return {
            "github_repos": github_repos,
            "tools": mentioned_tools,
            "projects": mentioned_projects
        }

    def determine_relevance(self, title: str, text: str, subreddit: str) -> Optional[str]:
        """Determine which of my projects this content supports.

        Keywords are matched as case-insensitive substrings of the combined
        title, text and subreddit name.

        Returns:
            The first project in PROJECT_KEYWORDS (declaration order) with at
            least one matching keyword, or None when nothing matches.
        """
        combined = f"{title} {text} {subreddit}".lower()
        for project, keywords in self.PROJECT_KEYWORDS.items():
            if any(kw in combined for kw in keywords):
                return project
        return None

    def process_saved_content(self, limit: int = 100) -> "List[ProcessedContent]":
        """
        Process saved posts to extract knowledge, not store posts.

        For each saved post, we extract:
        - External URLs (the actual resources)
        - Named entities (projects, tools mentioned)
        - Relevance to my active projects

        We do NOT store:
        - Reddit post content
        - Reddit comments
        - Vote counts
        - User information

        Args:
            limit: Maximum number of saved items requested from Reddit.

        Returns:
            One ProcessedContent per saved submission. Saved items that are
            not submissions are skipped.
        """
        processed_items = []
        me = self.reddit.user.me()
        for item in me.saved(limit=limit):
            if not isinstance(item, praw.models.Submission):
                continue

            selftext = item.selftext or ""
            subreddit_name = item.subreddit.display_name

            # The post's own link counts only when it points away from Reddit.
            urls = []
            if item.url and not self._is_reddit_url(item.url):
                urls.append(item.url)
            if selftext:
                urls.extend(self.extract_urls(selftext))

            # NOTE: the "github_repos" entry of this result is not used below.
            entities = self.extract_entities(selftext, item.title)
            relevant_project = self.determine_relevance(
                item.title, selftext, subreddit_name
            )

            # Create processed content (knowledge extracted, not post archived)
            processed_items.append(ProcessedContent(
                source_id=item.id,  # Attribution only
                source_url=f"https://reddit.com{item.permalink}",  # Attribution only
                # De-duplicate while keeping a stable, first-seen order.
                external_urls=list(dict.fromkeys(urls)),
                mentioned_projects=entities["projects"],
                mentioned_tools=entities["tools"],
                topics=[subreddit_name],
                relevance_notes=f"Extracted from r/{subreddit_name}",
                processed_at=datetime.now().isoformat(),
                relevant_project=relevant_project
            ))
        return processed_items

    def export_to_project_knowledge_base(self, processed_items: "List[ProcessedContent]"):
        """
        Export extracted knowledge to project-specific files.

        Organizes by project rather than by Reddit post. Items without a
        relevant project go under "uncategorized".

        NOTE: this currently only prints a per-project summary (target
        filename, URL count, up to three sample URLs); no file is written.
        """
        # Group by project
        by_project: Dict[str, List[Dict]] = {p: [] for p in self.MY_PROJECTS}
        by_project["uncategorized"] = []
        for item in processed_items:
            project = item.relevant_project or "uncategorized"
            entry = {
                "urls": item.external_urls,
                "tools": item.mentioned_tools,
                "projects": item.mentioned_projects,
                "topics": item.topics,
                "source": item.source_url,  # Attribution
                "extracted_at": item.processed_at
            }
            # setdefault: a project name outside MY_PROJECTS must not raise
            # KeyError and lose the whole export.
            by_project.setdefault(project, []).append(entry)

        # Export to project files
        for project, entries in by_project.items():
            if not entries:
                continue
            filename = f"project_kb/{project}_references.json"
            print(f"Exporting {len(entries)} references to {filename}")
            # In real implementation, would write to file
            # For now, just print summary
            all_urls = [url for e in entries for url in e["urls"]]
            print(f" Total URLs: {len(all_urls)}")
            print(f" Sample: {all_urls[:3] if all_urls else 'None'}")
def main():
    """Run the example workflow: read credentials, process, summarize, export.

    Credentials come from the REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
    REDDIT_USERNAME and REDDIT_PASSWORD environment variables. If any of them
    is missing or empty, an error line is printed and nothing else happens.
    """
    import os

    # Order matters: it is the positional order of the processor's arguments.
    env_names = (
        'REDDIT_CLIENT_ID',
        'REDDIT_CLIENT_SECRET',
        'REDDIT_USERNAME',
        'REDDIT_PASSWORD',
    )
    credentials = [os.environ.get(name) for name in env_names]
    if any(not value for value in credentials):
        print("Error: Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD")
        return

    processor = RedditContentProcessor(*credentials)

    print("Processing saved content to extract knowledge...")
    results = processor.process_saved_content(limit=50)
    print(f"\nProcessed {len(results)} items")

    # Report what was extracted (not what was saved)
    url_count = 0
    tool_count = 0
    for entry in results:
        url_count += len(entry.external_urls)
        tool_count += len(entry.mentioned_tools)
    print("Extracted:")
    print(f" - {url_count} external URLs")
    print(f" - {tool_count} tool mentions")
    print(f" - Organized by {len(processor.MY_PROJECTS)} active projects")

    # Hand the extracted records over for per-project grouping
    processor.export_to_project_knowledge_base(results)

    banner = "=" * 70
    print("\n" + banner)
    print("This script extracts knowledge from saved content.")
    print("It does NOT archive Reddit posts.")
    print("Output: Project-specific reference lists with URLs and tools.")
    print(banner)
# Run the example workflow only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment