Created
April 8, 2026 20:11
-
-
Save rothnic/ecb981a1404b6b8ec8df3607bf87e15a to your computer and use it in GitHub Desktop.
Reddit Content Processor - Extracts knowledge from saved posts for project research (not data archival)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Reddit Content Processor for Project Research | |
| This script processes saved Reddit content to extract knowledge for | |
| active projects - not to archive Reddit data, but to extract useful | |
| references, URLs, and entities from content I've explicitly saved. | |
| Processing Pipeline: | |
| 1. Fetch saved posts (content I've explicitly bookmarked) | |
| 2. Extract URLs from post content | |
| 3. Parse named entities (projects, tools, people mentioned) | |
| 4. Categorize by topic/project relevance | |
| 5. Add to project-specific knowledge graphs | |
| This is similar to: | |
| - Saving a research paper to Zotero and extracting citations | |
| - Bookmarking a tutorial and noting the tools used | |
| - Saving a discussion and extracting key recommendations | |
| NOT a data archive - a research workflow tool. | |
| """ | |
| import praw | |
| import json | |
| import re | |
| from datetime import datetime | |
| from typing import List, Dict, Optional, Set | |
| from dataclasses import dataclass | |
| from urllib.parse import urlparse | |
@dataclass
class ProcessedContent:
    """
    Knowledge distilled from a single saved Reddit post (never the post itself).

    Attribution fields (kept only so the source can be credited):
        source_id: Reddit post ID.
        source_url: Permalink to the post.

    Extracted knowledge (the part that actually matters):
        external_urls: URLs found in the post content.
        mentioned_projects: Project names mentioned in the post.
        mentioned_tools: Tools/software mentioned in the post.
        topics: Inferred topics/tags.
        relevance_notes: Free-text note on why this matters to my projects.

    Metadata:
        processed_at: Timestamp string recording when extraction happened.
        relevant_project: Which of my projects this supports, or None.
    """

    # --- attribution ---
    source_id: str
    source_url: str

    # --- extracted knowledge ---
    external_urls: List[str]
    mentioned_projects: List[str]
    mentioned_tools: List[str]
    topics: List[str]
    relevance_notes: str

    # --- metadata ---
    processed_at: str
    relevant_project: Optional[str]
class RedditContentProcessor:
    """
    Processes saved Reddit content to extract actionable knowledge.

    This is NOT an archive. We don't store Reddit posts.
    We extract URLs, references, and entities from posts I've saved,
    then integrate those into project knowledge bases.
    """

    # Projects I'm actively working on
    MY_PROJECTS = [
        "knowledge-management-system",
        "ai-agent-orchestration",
        "family-automation",
        "llm-evaluation",
        "devops-tooling",
    ]

    # Keywords that indicate relevance to my projects
    PROJECT_KEYWORDS = {
        "knowledge-management-system": [
            "knowledge graph", "semantic web", "ontology", "rdf", "sparql",
            "note taking", "zettelkasten", "obsidian", "personal wiki",
        ],
        "ai-agent-orchestration": [
            "langchain", "autogen", "crewai", "agent framework",
            "llm orchestration", "multi-agent", "agent workflow",
        ],
        "family-automation": [
            "home automation", "smart home", "calendar sync",
            "school tracker", "family organizer",
        ],
        "llm-evaluation": [
            "lm eval", "benchmark", "mmlu", "gsm8k", "human eval",
            "model evaluation", "llm metrics",
        ],
        "devops-tooling": [
            "kubernetes", "docker", "terraform", "ansible",
            "ci/cd", "github actions", "deployment",
        ],
    }

    # Common dev tools looked for in post content. They are matched as whole
    # words so that e.g. "go" does not fire on "good" or "google".
    TOOL_KEYWORDS = (
        "docker", "kubernetes", "terraform", "ansible", "prometheus",
        "grafana", "nginx", "redis", "postgres", "mongodb",
        "react", "vue", "angular", "svelte", "nextjs",
        "python", "rust", "go", "typescript", "javascript",
    )

    # Square brackets are excluded so a Markdown link "[label](url)" is not
    # swallowed as a single URL. Trade-off: a URL that contains a literal
    # "[" or "]" is cut at that character.
    _URL_PATTERN = re.compile(r'https?://[^\s<>"\'\[\]]{3,}')
    _GITHUB_PATTERN = re.compile(r'github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)')
    _PROJECT_PATTERN = re.compile(
        r'`([A-Z][a-zA-Z0-9_-]+)`|"([A-Z][a-zA-Z0-9_-]+)"'
    )
    _TRAILING_PUNCTUATION = '.,;:!?'

    def __init__(self, client_id: str, client_secret: str, username: str, password: str):
        """Initialize with Reddit API credentials."""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent="ContentProcessor/1.0 (Research Workflow Tool)",
            username=username,
            password=password,
        )

    def _clean_url(self, url: str) -> str:
        """Trim sentence punctuation from the end of a matched URL.

        A trailing ")" is removed only while the URL has more closing than
        opening parentheses, so links such as ".../Foo_(bar)" stay intact.
        Returns "" when nothing but the scheme is left.
        """
        while url:
            last = url[-1]
            stray_paren = last == ')' and url.count(')') > url.count('(')
            if last in self._TRAILING_PUNCTUATION or stray_paren:
                url = url[:-1]
            else:
                break
        _, _, remainder = url.partition('://')
        return url if remainder.strip('/') else ''

    def extract_urls(self, text: str) -> List[str]:
        """Extract all http(s) URLs from text content.

        Args:
            text: Free text (typically Markdown) to scan.

        Returns:
            The cleaned URLs, de-duplicated, in order of first appearance.
        """
        if not text:
            return []
        seen = set()
        cleaned = []
        for candidate in self._URL_PATTERN.findall(text):
            url = self._clean_url(candidate)
            if url and url not in seen:
                seen.add(url)
                cleaned.append(url)
        return cleaned

    def extract_entities(self, text: str, title: str) -> Dict[str, List[str]]:
        """Extract named entities (projects, tools) from content.

        Args:
            text: Body of the post (may be empty).
            title: Title of the post.

        Returns:
            A dict with three de-duplicated lists:
            - "github_repos": "user/repo" paths following "github.com/",
              lower-cased because they are read from the lower-cased content.
            - "tools": entries of TOOL_KEYWORDS that occur as whole words,
              in TOOL_KEYWORDS order.
            - "projects": capitalized names wrapped in backticks or double
              quotes, found in the title or the body, in order of appearance.
        """
        original = f"{title} {text}"
        lowered = original.lower()

        # Look for GitHub repos (pattern: user/repo)
        github_repos = list(dict.fromkeys(self._GITHUB_PATTERN.findall(lowered)))

        # Whole-word match: plain substring tests tag almost every post with
        # "go" ("good", "going") or "rust" ("trust").
        mentioned_tools = [
            tool for tool in self.TOOL_KEYWORDS
            if re.search(rf'(?<!\w){re.escape(tool)}(?!\w)', lowered)
        ]

        # Simple heuristic: capitalized words in backticks or quotes. Must run
        # on the original-case content, and on the body too, since that is
        # where backticked names usually live.
        matches = self._PROJECT_PATTERN.findall(original)
        mentioned_projects = list(dict.fromkeys(
            backticked or quoted for backticked, quoted in matches
        ))

        return {
            "github_repos": github_repos,
            "tools": mentioned_tools,
            "projects": mentioned_projects,
        }

    def determine_relevance(self, title: str, text: str, subreddit: str) -> Optional[str]:
        """Determine which of my projects this content supports.

        Returns the first project in PROJECT_KEYWORDS (declaration order) with
        at least one keyword occurring, case-insensitively, as a substring of
        the title, body, or subreddit name; None when nothing matches.
        """
        combined = f"{title} {text} {subreddit}".lower()
        for project, keywords in self.PROJECT_KEYWORDS.items():
            if any(kw in combined for kw in keywords):
                return project
        return None

    def process_saved_content(self, limit: int = 100) -> "List[ProcessedContent]":
        """
        Process saved posts to extract knowledge, not store posts.

        For each saved post, we extract:
        - External URLs (the actual resources)
        - Named entities (projects, tools mentioned)
        - Relevance to my active projects

        We do NOT store:
        - Reddit post content
        - Reddit comments
        - Vote counts
        - User information

        Args:
            limit: Passed through to the saved-items listing.

        Returns:
            One ProcessedContent per saved submission; saved items that are
            not submissions are skipped.
        """
        processed_items = []
        me = self.reddit.user.me()
        for item in me.saved(limit=limit):
            if not isinstance(item, praw.models.Submission):
                continue

            body = item.selftext or ""
            subreddit = item.subreddit.display_name

            # The post's own link first (unless it points back at Reddit),
            # then every URL found in the body.
            urls = []
            if item.url and not item.url.startswith('https://www.reddit.com'):
                urls.append(item.url)
            urls.extend(self.extract_urls(body))

            entities = self.extract_entities(body, item.title)
            relevant_project = self.determine_relevance(item.title, body, subreddit)

            # Create processed content (knowledge extracted, not post archived)
            processed_items.append(ProcessedContent(
                source_id=item.id,  # Attribution only
                source_url=f"https://reddit.com{item.permalink}",  # Attribution only
                # dict.fromkeys de-duplicates while keeping first-seen order;
                # a set would reorder the URLs differently on every run.
                external_urls=list(dict.fromkeys(urls)),
                mentioned_projects=entities["projects"],
                mentioned_tools=entities["tools"],
                topics=[subreddit],
                relevance_notes=f"Extracted from r/{subreddit}",
                # Timezone-aware local time so the timestamp is unambiguous.
                processed_at=datetime.now().astimezone().isoformat(),
                relevant_project=relevant_project,
            ))
        return processed_items

    def export_to_project_knowledge_base(self, processed_items: "List[ProcessedContent]"):
        """
        Export extracted knowledge to project-specific files.

        Organizes by project rather than by Reddit post.
        Each project gets a JSON file with relevant URLs and references.

        Currently only prints a per-project summary; nothing is written to
        disk. Items without a relevant project go under "uncategorized".
        """
        # Group by project
        by_project: Dict[str, List[Dict]] = {p: [] for p in self.MY_PROJECTS}
        by_project["uncategorized"] = []
        for item in processed_items:
            project = item.relevant_project or "uncategorized"
            entry = {
                "urls": item.external_urls,
                "tools": item.mentioned_tools,
                "projects": item.mentioned_projects,
                "topics": item.topics,
                "source": item.source_url,  # Attribution
                "extracted_at": item.processed_at,
            }
            # setdefault: a project that is not listed in MY_PROJECTS must not
            # abort the export with a KeyError.
            by_project.setdefault(project, []).append(entry)

        # Export to project files
        for project, entries in by_project.items():
            if not entries:
                continue
            filename = f"project_kb/{project}_references.json"
            print(f"Exporting {len(entries)} references to {filename}")
            # In real implementation, would write to file
            # For now, just print summary
            all_urls = [url for e in entries for url in e["urls"]]
            print(f" Total URLs: {len(all_urls)}")
            print(f" Sample: {all_urls[:3] if all_urls else 'None'}")
def main():
    """Example usage: process saved posts and print an extraction summary.

    Reads Reddit credentials from the REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
    REDDIT_USERNAME and REDDIT_PASSWORD environment variables.

    Returns:
        0 on success, 1 when any credential is missing (the error is written
        to stderr), so the script's exit status reflects the failure.
    """
    import os
    import sys

    # Load credentials from environment
    client_id = os.environ.get('REDDIT_CLIENT_ID')
    client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
    username = os.environ.get('REDDIT_USERNAME')
    password = os.environ.get('REDDIT_PASSWORD')
    if not all([client_id, client_secret, username, password]):
        print(
            "Error: Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD",
            file=sys.stderr,
        )
        return 1

    # Initialize processor
    processor = RedditContentProcessor(client_id, client_secret, username, password)

    # Process saved content
    print("Processing saved content to extract knowledge...")
    processed = processor.process_saved_content(limit=50)
    print(f"\nProcessed {len(processed)} items")

    # Show what was extracted (not what was saved)
    total_urls = sum(len(p.external_urls) for p in processed)
    total_tools = sum(len(p.mentioned_tools) for p in processed)
    print("Extracted:")
    print(f" - {total_urls} external URLs")
    print(f" - {total_tools} tool mentions")
    print(f" - Organized by {len(processor.MY_PROJECTS)} active projects")

    # Export to project knowledge bases
    processor.export_to_project_knowledge_base(processed)

    print("\n" + "="*70)
    print("This script extracts knowledge from saved content.")
    print("It does NOT archive Reddit posts.")
    print("Output: Project-specific reference lists with URLs and tools.")
    print("="*70)
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment