rothnic · April 8, 2026 20:11
diff --git a/reddit_content_processor.py b/reddit_content_processor.py
 #!/usr/bin/env python3
 """
 Reddit Content Processor for Project Research

 This script processes saved Reddit content to extract knowledge for 
 active projects - not to archive Reddit data, but to extract useful
 references, URLs, and entities from content I've explicitly saved.

 Processing Pipeline:
 1. Fetch saved posts (content I've explicitly bookmarked)
 2. Extract URLs from post content
 3. Parse named entities (projects, tools, people mentioned)
 4. Categorize by topic/project relevance
 5. Add to project-specific knowledge graphs

 This is similar to:
 - Saving a research paper to Zotero and extracting citations
 - Bookmarking a tutorial and noting the tools used
 - Saving a discussion and extracting key recommendations

 NOT a data archive - a research workflow tool.
 """

 import praw
 import json
 import re
 from datetime import datetime
 from typing import List, Dict, Optional, Set
 from dataclasses import dataclass
 from urllib.parse import urlparse


 @dataclass
 class ProcessedContent:
    """Knowledge distilled from one saved Reddit post (never the post body).

    Fields are positional in the order declared below: attribution first,
    then the extracted knowledge, then bookkeeping metadata.
    """

    # --- Attribution only: where the knowledge came from ---
    # Reddit post ID
    source_id: str
    # Permalink to the post
    source_url: str

    # --- Extracted knowledge: the part that is actually kept ---
    # URLs found in the post content
    external_urls: List[str]
    # Project names mentioned
    mentioned_projects: List[str]
    # Tools/software mentioned
    mentioned_tools: List[str]
    # Inferred topics/tags
    topics: List[str]
    # Why this is relevant to my projects
    relevance_notes: str

    # --- Metadata ---
    # Timestamp string recorded when the post was processed
    processed_at: str
    # Which of my projects this supports (None when no project matched)
    relevant_project: Optional[str]


 class RedditContentProcessor:
    """
    Processes saved Reddit content to extract actionable knowledge.

    This is NOT an archive. We don't store Reddit posts.
    We extract URLs, references, and entities from posts I've saved,
    then integrate those into project knowledge bases.

    Typical flow: construct with Reddit API credentials, call
    process_saved_content() to build ProcessedContent records, then hand
    them to export_to_project_knowledge_base() for a per-project summary.
    """

    # Projects I'm actively working on
    MY_PROJECTS = [
        "knowledge-management-system",
        "ai-agent-orchestration",
        "family-automation",
        "llm-evaluation",
        "devops-tooling"
    ]

    # Keywords that indicate relevance to my projects. determine_relevance()
    # matches them as lowercase substrings; the first project (in the order
    # listed here) with any hit wins.
    PROJECT_KEYWORDS = {
        "knowledge-management-system": [
            "knowledge graph", "semantic web", "ontology", "rdf", "sparql",
            "note taking", "zettelkasten", "obsidian", "personal wiki"
        ],
        "ai-agent-orchestration": [
            "langchain", "autogen", "crewai", "agent framework",
            "llm orchestration", "multi-agent", "agent workflow"
        ],
        "family-automation": [
            "home automation", "smart home", "calendar sync",
            "school tracker", "family organizer"
        ],
        "llm-evaluation": [
            "lm eval", "benchmark", "mmlu", "gsm8k", "human eval",
            "model evaluation", "llm metrics"
        ],
        "devops-tooling": [
            "kubernetes", "docker", "terraform", "ansible",
            "ci/cd", "github actions", "deployment"
        ]
    }

    # Canonical tool names reported by extract_entities(), in output order.
    _TOOL_KEYWORDS = [
        "docker", "kubernetes", "terraform", "ansible", "prometheus",
        "grafana", "nginx", "redis", "postgres", "mongodb",
        "react", "vue", "angular", "svelte", "nextjs",
        "python", "rust", "go", "typescript", "javascript"
    ]

    # Longer spellings that must keep counting as the canonical tool now that
    # tools are matched as whole words instead of substrings.
    _TOOL_ALIASES = {
        "golang": "go",
        "postgresql": "postgres",
        "reactjs": "react",
        "vuejs": "vue",
        "angularjs": "angular",
    }

    _URL_RE = re.compile(r'https?://[^\s<>"\']{3,}')
    _GITHUB_RE = re.compile(r'github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)')
    _PROJECT_RE = re.compile(r'`([A-Z][a-zA-Z0-9_-]+)`|"([A-Z][a-zA-Z0-9_-]+)"')
    # Whole-word tool matcher; callers lowercase the text first. The
    # look-arounds stop "go" from firing inside "google" or "rust" inside
    # "trust". Plain-English uses of a word such as "go" still match.
    _TOOL_RE = re.compile(
        r'(?<![a-z0-9])(?:'
        + '|'.join(map(re.escape, _TOOL_KEYWORDS + list(_TOOL_ALIASES)))
        + r')(?![a-z0-9])'
    )

    def __init__(self, client_id: str, client_secret: str, username: str, password: str):
        """Initialize with Reddit API credentials.

        Args:
            client_id: Reddit API application client ID.
            client_secret: Reddit API application secret.
            username: Reddit account whose saved items are read.
            password: Password for that account.
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent="ContentProcessor/1.0 (Research Workflow Tool)",
            username=username,
            password=password
        )

    @staticmethod
    def _trim_url(url: str) -> str:
        """Drop trailing sentence punctuation and unbalanced ')' from a URL match."""
        while url:
            last = url[-1]
            if last in '.,;:!?':
                url = url[:-1]
            elif last == ')' and url.count(')') > url.count('('):
                # Only strip a ')' that has no partner inside the URL, so
                # links like .../Rust_(programming_language) stay intact
                # while "(see https://x.y/z)" and markdown links are cleaned.
                url = url[:-1]
            else:
                break
        return url

    @staticmethod
    def _is_reddit_url(url: str) -> bool:
        """Return True when the URL's host is reddit.com or one of its subdomains."""
        try:
            host = urlparse(url).hostname or ""
        except ValueError:
            # Malformed netloc (e.g. a broken IPv6 literal): not a Reddit link.
            return False
        return host == "reddit.com" or host.endswith(".reddit.com")

    def extract_urls(self, text: str) -> List[str]:
        """Extract all http(s) URLs from text content.

        Args:
            text: Free-form text (e.g. a post body). None or "" yields [].

        Returns:
            Unique URLs in order of first appearance, with trailing sentence
            punctuation and unbalanced closing parentheses removed.
        """
        unique: Dict[str, None] = {}
        for raw in self._URL_RE.findall(text or ""):
            url = self._trim_url(raw)
            # Skip matches that are only a scheme once trimmed ("https://...").
            if url.split('://', 1)[-1]:
                unique.setdefault(url, None)
        return list(unique)

    def extract_entities(self, text: str, title: str) -> Dict[str, List[str]]:
        """Extract named entities (projects, tools) from content.

        Args:
            text: Post body; None is treated as "".
            title: Post title; None is treated as "".

        Returns:
            Dict with three lists:
            "github_repos" - lowercased "user/repo" strings found after
                "github.com/", de-duplicated, trailing periods removed;
            "tools" - canonical names from the tool list that occur as whole
                words (or via a known alias), in tool-list order;
            "projects" - capitalized words wrapped in backticks or double
                quotes, de-duplicated in order of appearance.
        """
        original = f"{title or ''} {text or ''}"
        combined = original.lower()

        # GitHub repos (pattern: user/repo). The character class allows '.',
        # so a sentence-ending period would otherwise stick to the repo name.
        repos: Dict[str, None] = {}
        for repo in self._GITHUB_RE.findall(combined):
            repo = repo.rstrip('.')
            if repo and not repo.endswith('/'):
                repos.setdefault(repo, None)

        # Tool mentions, matched as whole words and mapped to canonical names.
        found = {self._TOOL_ALIASES.get(hit, hit) for hit in self._TOOL_RE.findall(combined)}
        mentioned_tools = [tool for tool in self._TOOL_KEYWORDS if tool in found]

        # Project mentions: capitalized words in backticks or quotes. Search
        # the original-case title AND body; backticks are markdown and show
        # up almost exclusively in the body.
        projects: Dict[str, None] = {}
        for ticked, quoted in self._PROJECT_RE.findall(original):
            projects.setdefault(ticked or quoted, None)

        return {
            "github_repos": list(repos),
            "tools": mentioned_tools,
            "projects": list(projects)
        }

    def determine_relevance(self, title: str, text: str, subreddit: str) -> Optional[str]:
        """Determine which of my projects this content supports.

        Args:
            title: Post title.
            text: Post body.
            subreddit: Subreddit display name.

        Returns:
            The first project in PROJECT_KEYWORDS with a keyword that occurs
            (as a lowercase substring) in the combined inputs, or None when
            nothing matches.
        """
        combined = f"{title} {text} {subreddit}".lower()

        for project, keywords in self.PROJECT_KEYWORDS.items():
            if any(kw in combined for kw in keywords):
                return project

        return None

    def process_saved_content(self, limit: int = 100) -> "List[ProcessedContent]":
        """
        Process saved posts to extract knowledge, not store posts.

        For each saved post, we extract:
        - External URLs (the actual resources)
        - Named entities (projects, tools mentioned)
        - Relevance to my active projects

        We do NOT store:
        - Reddit post content
        - Reddit comments
        - Vote counts
        - User information

        Args:
            limit: Maximum number of saved items requested from Reddit.

        Returns:
            One ProcessedContent per saved submission; saved items that are
            not submissions are skipped.
        """
        processed_items = []
        me = self.reddit.user.me()

        for item in me.saved(limit=limit):
            if not isinstance(item, praw.models.Submission):
                continue

            selftext = item.selftext or ""
            subreddit_name = item.subreddit.display_name

            # The link target counts only when it leaves Reddit; self posts
            # point back at reddit.com (any subdomain, not just www).
            urls = []
            if item.url and not self._is_reddit_url(item.url):
                urls.append(item.url)
            urls.extend(self.extract_urls(selftext))

            entities = self.extract_entities(selftext, item.title)
            relevant_project = self.determine_relevance(
                item.title, selftext, subreddit_name
            )

            # Create processed content (knowledge extracted, not post archived)
            processed = ProcessedContent(
                source_id=item.id,  # Attribution only
                source_url=f"https://reddit.com{item.permalink}",  # Attribution only
                # dict.fromkeys de-duplicates while keeping first-seen order;
                # set() made the order differ from run to run.
                external_urls=list(dict.fromkeys(urls)),
                mentioned_projects=entities["projects"],
                mentioned_tools=entities["tools"],
                topics=[item.subreddit.display_name],
                relevance_notes=f"Extracted from r/{item.subreddit.display_name}",
                processed_at=datetime.now().isoformat(),
                relevant_project=relevant_project
            )

            processed_items.append(processed)

        return processed_items

    def export_to_project_knowledge_base(self, processed_items: "List[ProcessedContent]"):
        """
        Summarize extracted knowledge per project.

        Organizes by project rather than by Reddit post. Nothing is written
        to disk yet: for every project with at least one entry, the intended
        JSON filename, the URL count and up to three sample URLs are printed.

        Args:
            processed_items: Records produced by process_saved_content().
                Items without a relevant_project go to "uncategorized";
                projects missing from MY_PROJECTS get their own group
                instead of raising KeyError.
        """
        # Group by project
        by_project: Dict[str, List[Dict]] = {p: [] for p in self.MY_PROJECTS}
        by_project["uncategorized"] = []

        for item in processed_items:
            project = item.relevant_project or "uncategorized"

            entry = {
                "urls": item.external_urls,
                "tools": item.mentioned_tools,
                "projects": item.mentioned_projects,
                "topics": item.topics,
                "source": item.source_url,  # Attribution
                "extracted_at": item.processed_at
            }

            by_project.setdefault(project, []).append(entry)

        # Export to project files
        for project, entries in by_project.items():
            if entries:
                filename = f"project_kb/{project}_references.json"
                print(f"Exporting {len(entries)} references to {filename}")
                # In real implementation, would write to file
                # For now, just print summary
                all_urls = [url for e in entries for url in e["urls"]]
                print(f"  Total URLs: {len(all_urls)}")
                print(f"  Sample: {all_urls[:3] if all_urls else 'None'}")


 def main():
    """Run the example workflow: read credentials, process, summarize, export.

    Credentials come from the REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
    REDDIT_USERNAME and REDDIT_PASSWORD environment variables; if any of
    them is missing or empty an error is printed and nothing else happens.
    """
    import os

    # Same order as the RedditContentProcessor constructor parameters.
    env_names = (
        'REDDIT_CLIENT_ID',
        'REDDIT_CLIENT_SECRET',
        'REDDIT_USERNAME',
        'REDDIT_PASSWORD',
    )
    credentials = [os.environ.get(name) for name in env_names]

    if not all(credentials):
        print("Error: Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD")
        return

    processor = RedditContentProcessor(*credentials)

    # Pull knowledge out of the 50 most recent saved items.
    print("Processing saved content to extract knowledge...")
    processed = processor.process_saved_content(limit=50)

    print(f"\nProcessed {len(processed)} items")

    # Report what was extracted (not what was saved).
    total_urls = 0
    total_tools = 0
    for record in processed:
        total_urls += len(record.external_urls)
        total_tools += len(record.mentioned_tools)

    print(f"Extracted:")
    print(f"  - {total_urls} external URLs")
    print(f"  - {total_tools} tool mentions")
    print(f"  - Organized by {len(processor.MY_PROJECTS)} active projects")

    # Per-project summary.
    processor.export_to_project_knowledge_base(processed)

    rule = "="*70
    print("\n" + rule)
    print("This script extracts knowledge from saved content.")
    print("It does NOT archive Reddit posts.")
    print("Output: Project-specific reference lists with URLs and tools.")
    print(rule)


 # Run the example workflow only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Reddit Content Processor for Project Research

	This script processes saved Reddit content to extract knowledge for
	active projects - not to archive Reddit data, but to extract useful
	references, URLs, and entities from content I've explicitly saved.

	Processing Pipeline:
	1. Fetch saved posts (content I've explicitly bookmarked)
	2. Extract URLs from post content
	3. Parse named entities (projects, tools, people mentioned)
	4. Categorize by topic/project relevance
	5. Add to project-specific knowledge graphs

	This is similar to:
	- Saving a research paper to Zotero and extracting citations
	- Bookmarking a tutorial and noting the tools used
	- Saving a discussion and extracting key recommendations

	NOT a data archive - a research workflow tool.
	"""

	import praw
	import json
	import re
	from datetime import datetime
	from typing import List, Dict, Optional, Set
	from dataclasses import dataclass
	from urllib.parse import urlparse


	@dataclass
	class ProcessedContent:
	    """Extracted knowledge from a Reddit post, not the post itself.

	    Fields are positional in the order declared below: attribution first,
	    then the extracted knowledge, then bookkeeping metadata.
	    """

	    source_id: str  # Reddit post ID (for attribution only)
	    source_url: str  # Permalink (for attribution only)

	    # Extracted knowledge (the actual value)
	    external_urls: List[str]  # URLs found in post content
	    mentioned_projects: List[str]  # Project names mentioned
	    mentioned_tools: List[str]  # Tools/software mentioned
	    topics: List[str]  # Inferred topics/tags
	    relevance_notes: str  # Why this is relevant to my projects

	    # Metadata
	    processed_at: str  # Timestamp string recorded at processing time
	    relevant_project: Optional[str]  # Which of my projects this supports


	class RedditContentProcessor:
	    """
	    Processes saved Reddit content to extract actionable knowledge.

	    This is NOT an archive. We don't store Reddit posts.
	    We extract URLs, references, and entities from posts I've saved,
	    then integrate those into project knowledge bases.

	    Typical flow: construct with Reddit API credentials, call
	    process_saved_content() to build ProcessedContent records, then hand
	    them to export_to_project_knowledge_base() for a per-project summary.
	    """

	    # Projects I'm actively working on
	    MY_PROJECTS = [
	        "knowledge-management-system",
	        "ai-agent-orchestration",
	        "family-automation",
	        "llm-evaluation",
	        "devops-tooling"
	    ]

	    # Keywords that indicate relevance to my projects. determine_relevance()
	    # matches them as lowercase substrings; the first project (in the order
	    # listed here) with any hit wins.
	    PROJECT_KEYWORDS = {
	        "knowledge-management-system": [
	            "knowledge graph", "semantic web", "ontology", "rdf", "sparql",
	            "note taking", "zettelkasten", "obsidian", "personal wiki"
	        ],
	        "ai-agent-orchestration": [
	            "langchain", "autogen", "crewai", "agent framework",
	            "llm orchestration", "multi-agent", "agent workflow"
	        ],
	        "family-automation": [
	            "home automation", "smart home", "calendar sync",
	            "school tracker", "family organizer"
	        ],
	        "llm-evaluation": [
	            "lm eval", "benchmark", "mmlu", "gsm8k", "human eval",
	            "model evaluation", "llm metrics"
	        ],
	        "devops-tooling": [
	            "kubernetes", "docker", "terraform", "ansible",
	            "ci/cd", "github actions", "deployment"
	        ]
	    }

	    # Canonical tool names reported by extract_entities(), in output order.
	    _TOOL_KEYWORDS = [
	        "docker", "kubernetes", "terraform", "ansible", "prometheus",
	        "grafana", "nginx", "redis", "postgres", "mongodb",
	        "react", "vue", "angular", "svelte", "nextjs",
	        "python", "rust", "go", "typescript", "javascript"
	    ]

	    # Longer spellings that must keep counting as the canonical tool now that
	    # tools are matched as whole words instead of substrings.
	    _TOOL_ALIASES = {
	        "golang": "go",
	        "postgresql": "postgres",
	        "reactjs": "react",
	        "vuejs": "vue",
	        "angularjs": "angular",
	    }

	    _URL_RE = re.compile(r'https?://[^\s<>"\']{3,}')
	    _GITHUB_RE = re.compile(r'github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)')
	    # Plain '|' alternation: the escaped '\|' in the pasted source matched a
	    # literal pipe, so neither the backtick nor the quote form could match
	    # on its own.
	    _PROJECT_RE = re.compile(r'`([A-Z][a-zA-Z0-9_-]+)`|"([A-Z][a-zA-Z0-9_-]+)"')
	    # Whole-word tool matcher; callers lowercase the text first. The
	    # look-arounds stop "go" from firing inside "google" or "rust" inside
	    # "trust". Plain-English uses of a word such as "go" still match.
	    _TOOL_RE = re.compile(
	        r'(?<![a-z0-9])(?:'
	        + '|'.join(map(re.escape, _TOOL_KEYWORDS + list(_TOOL_ALIASES)))
	        + r')(?![a-z0-9])'
	    )

	    def __init__(self, client_id: str, client_secret: str, username: str, password: str):
	        """Initialize with Reddit API credentials.

	        Args:
	            client_id: Reddit API application client ID.
	            client_secret: Reddit API application secret.
	            username: Reddit account whose saved items are read.
	            password: Password for that account.
	        """
	        self.reddit = praw.Reddit(
	            client_id=client_id,
	            client_secret=client_secret,
	            user_agent="ContentProcessor/1.0 (Research Workflow Tool)",
	            username=username,
	            password=password
	        )

	    @staticmethod
	    def _trim_url(url: str) -> str:
	        """Drop trailing sentence punctuation and unbalanced ')' from a URL match."""
	        while url:
	            last = url[-1]
	            if last in '.,;:!?':
	                url = url[:-1]
	            elif last == ')' and url.count(')') > url.count('('):
	                # Only strip a ')' that has no partner inside the URL, so
	                # links like .../Rust_(programming_language) stay intact
	                # while "(see https://x.y/z)" and markdown links are cleaned.
	                url = url[:-1]
	            else:
	                break
	        return url

	    @staticmethod
	    def _is_reddit_url(url: str) -> bool:
	        """Return True when the URL's host is reddit.com or one of its subdomains."""
	        try:
	            host = urlparse(url).hostname or ""
	        except ValueError:
	            # Malformed netloc (e.g. a broken IPv6 literal): not a Reddit link.
	            return False
	        return host == "reddit.com" or host.endswith(".reddit.com")

	    def extract_urls(self, text: str) -> List[str]:
	        """Extract all http(s) URLs from text content.

	        Args:
	            text: Free-form text (e.g. a post body). None or "" yields [].

	        Returns:
	            Unique URLs in order of first appearance, with trailing sentence
	            punctuation and unbalanced closing parentheses removed.
	        """
	        unique: Dict[str, None] = {}
	        for raw in self._URL_RE.findall(text or ""):
	            url = self._trim_url(raw)
	            # Skip matches that are only a scheme once trimmed ("https://...").
	            if url.split('://', 1)[-1]:
	                unique.setdefault(url, None)
	        return list(unique)

	    def extract_entities(self, text: str, title: str) -> Dict[str, List[str]]:
	        """Extract named entities (projects, tools) from content.

	        Args:
	            text: Post body; None is treated as "".
	            title: Post title; None is treated as "".

	        Returns:
	            Dict with three lists:
	            "github_repos" - lowercased "user/repo" strings found after
	                "github.com/", de-duplicated, trailing periods removed;
	            "tools" - canonical names from the tool list that occur as whole
	                words (or via a known alias), in tool-list order;
	            "projects" - capitalized words wrapped in backticks or double
	                quotes, de-duplicated in order of appearance.
	        """
	        original = f"{title or ''} {text or ''}"
	        combined = original.lower()

	        # GitHub repos (pattern: user/repo). The character class allows '.',
	        # so a sentence-ending period would otherwise stick to the repo name.
	        repos: Dict[str, None] = {}
	        for repo in self._GITHUB_RE.findall(combined):
	            repo = repo.rstrip('.')
	            if repo and not repo.endswith('/'):
	                repos.setdefault(repo, None)

	        # Tool mentions, matched as whole words and mapped to canonical names.
	        found = {self._TOOL_ALIASES.get(hit, hit) for hit in self._TOOL_RE.findall(combined)}
	        mentioned_tools = [tool for tool in self._TOOL_KEYWORDS if tool in found]

	        # Project mentions: capitalized words in backticks or quotes. Search
	        # the original-case title AND body; backticks are markdown and show
	        # up almost exclusively in the body.
	        projects: Dict[str, None] = {}
	        for ticked, quoted in self._PROJECT_RE.findall(original):
	            projects.setdefault(ticked or quoted, None)

	        return {
	            "github_repos": list(repos),
	            "tools": mentioned_tools,
	            "projects": list(projects)
	        }

	    def determine_relevance(self, title: str, text: str, subreddit: str) -> Optional[str]:
	        """Determine which of my projects this content supports.

	        Args:
	            title: Post title.
	            text: Post body.
	            subreddit: Subreddit display name.

	        Returns:
	            The first project in PROJECT_KEYWORDS with a keyword that occurs
	            (as a lowercase substring) in the combined inputs, or None when
	            nothing matches.
	        """
	        combined = f"{title} {text} {subreddit}".lower()

	        for project, keywords in self.PROJECT_KEYWORDS.items():
	            if any(kw in combined for kw in keywords):
	                return project

	        return None

	    def process_saved_content(self, limit: int = 100) -> "List[ProcessedContent]":
	        """
	        Process saved posts to extract knowledge, not store posts.

	        For each saved post, we extract:
	        - External URLs (the actual resources)
	        - Named entities (projects, tools mentioned)
	        - Relevance to my active projects

	        We do NOT store:
	        - Reddit post content
	        - Reddit comments
	        - Vote counts
	        - User information

	        Args:
	            limit: Maximum number of saved items requested from Reddit.

	        Returns:
	            One ProcessedContent per saved submission; saved items that are
	            not submissions are skipped.
	        """
	        processed_items = []
	        me = self.reddit.user.me()

	        for item in me.saved(limit=limit):
	            if not isinstance(item, praw.models.Submission):
	                continue

	            selftext = item.selftext or ""
	            subreddit_name = item.subreddit.display_name

	            # The link target counts only when it leaves Reddit; self posts
	            # point back at reddit.com (any subdomain, not just www).
	            urls = []
	            if item.url and not self._is_reddit_url(item.url):
	                urls.append(item.url)
	            urls.extend(self.extract_urls(selftext))

	            entities = self.extract_entities(selftext, item.title)
	            relevant_project = self.determine_relevance(
	                item.title, selftext, subreddit_name
	            )

	            # Create processed content (knowledge extracted, not post archived)
	            processed = ProcessedContent(
	                source_id=item.id,  # Attribution only
	                source_url=f"https://reddit.com{item.permalink}",  # Attribution only
	                # dict.fromkeys de-duplicates while keeping first-seen order;
	                # set() made the order differ from run to run.
	                external_urls=list(dict.fromkeys(urls)),
	                mentioned_projects=entities["projects"],
	                mentioned_tools=entities["tools"],
	                topics=[item.subreddit.display_name],
	                relevance_notes=f"Extracted from r/{item.subreddit.display_name}",
	                processed_at=datetime.now().isoformat(),
	                relevant_project=relevant_project
	            )

	            processed_items.append(processed)

	        return processed_items

	    def export_to_project_knowledge_base(self, processed_items: "List[ProcessedContent]"):
	        """
	        Summarize extracted knowledge per project.

	        Organizes by project rather than by Reddit post. Nothing is written
	        to disk yet: for every project with at least one entry, the intended
	        JSON filename, the URL count and up to three sample URLs are printed.

	        Args:
	            processed_items: Records produced by process_saved_content().
	                Items without a relevant_project go to "uncategorized";
	                projects missing from MY_PROJECTS get their own group
	                instead of raising KeyError.
	        """
	        # Group by project
	        by_project: Dict[str, List[Dict]] = {p: [] for p in self.MY_PROJECTS}
	        by_project["uncategorized"] = []

	        for item in processed_items:
	            project = item.relevant_project or "uncategorized"

	            entry = {
	                "urls": item.external_urls,
	                "tools": item.mentioned_tools,
	                "projects": item.mentioned_projects,
	                "topics": item.topics,
	                "source": item.source_url,  # Attribution
	                "extracted_at": item.processed_at
	            }

	            by_project.setdefault(project, []).append(entry)

	        # Export to project files
	        for project, entries in by_project.items():
	            if entries:
	                filename = f"project_kb/{project}_references.json"
	                print(f"Exporting {len(entries)} references to {filename}")
	                # In real implementation, would write to file
	                # For now, just print summary
	                all_urls = [url for e in entries for url in e["urls"]]
	                print(f" Total URLs: {len(all_urls)}")
	                print(f" Sample: {all_urls[:3] if all_urls else 'None'}")


	def main():
	    """Example usage: read credentials, process saved posts, print a summary.

	    Credentials come from the REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
	    REDDIT_USERNAME and REDDIT_PASSWORD environment variables; if any of
	    them is missing or empty an error is printed and nothing else happens.
	    """
	    import os

	    # Load credentials from environment
	    client_id = os.environ.get('REDDIT_CLIENT_ID')
	    client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
	    username = os.environ.get('REDDIT_USERNAME')
	    password = os.environ.get('REDDIT_PASSWORD')

	    if not all([client_id, client_secret, username, password]):
	        print("Error: Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD")
	        return

	    # Initialize processor
	    processor = RedditContentProcessor(client_id, client_secret, username, password)

	    # Process saved content
	    print("Processing saved content to extract knowledge...")
	    processed = processor.process_saved_content(limit=50)

	    print(f"\nProcessed {len(processed)} items")

	    # Show what was extracted (not what was saved)
	    total_urls = sum(len(p.external_urls) for p in processed)
	    total_tools = sum(len(p.mentioned_tools) for p in processed)

	    print(f"Extracted:")
	    print(f" - {total_urls} external URLs")
	    print(f" - {total_tools} tool mentions")
	    print(f" - Organized by {len(processor.MY_PROJECTS)} active projects")

	    # Export to project knowledge bases
	    processor.export_to_project_knowledge_base(processed)

	    print("\n" + "="*70)
	    print("This script extracts knowledge from saved content.")
	    print("It does NOT archive Reddit posts.")
	    print("Output: Project-specific reference lists with URLs and tools.")
	    print("="*70)


	# Run the example workflow only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found