Last active
March 16, 2026 19:16
-
-
Save nahamsec/6f2aac6288568c9d4e78d1bd216e861e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| HackerOne AI/LLM Timeline Scanner | |
| =================================== | |
| Uses the official HackerOne Hacker API to find when public bug bounty programs | |
| added AI / Chatbot / LLM to their scope. Produces a month-by-month trend. | |
| Created by @NahamSec using Claude AI | |
| Youtube.com/NahamSec | |
| Checks BOTH: | |
| - Structured scopes (target list) — with created_at dates | |
| - Policy text (free-form description) — catches programs like HubSpot | |
| Usage: | |
| pip install requests | |
| python bb_ai_changelog_scraper.py --username nahamsec --token YOUR_TOKEN | |
| # Or set as environment variables | |
| export H1_USERNAME=nahamsec | |
| export H1_TOKEN=your_token | |
| python bb_ai_changelog_scraper.py | |
| # Test with a few programs first | |
| python bb_ai_changelog_scraper.py --max-programs 20 | |
| # Fast mode: also pull from bounty-targets-data for comparison | |
| python bb_ai_changelog_scraper.py --include-bounty-targets | |
| Output: | |
| - ai_bounty_programs.csv (one row per AI scope item, with dates) | |
| - ai_bounty_programs.json (full structured data) | |
| - ai_timeline.csv (month-by-month trend data) | |
| - Console: program list + trend chart | |
| """ | |
| import requests | |
| import json | |
| import re | |
| import time | |
| import csv | |
| import sys | |
| import os | |
| import argparse | |
| from datetime import datetime | |
| from collections import defaultdict, OrderedDict | |
| # ============================================================================= | |
| # AI KEYWORD DETECTION | |
| # ============================================================================= | |
| # --- SCOPE ITEM PATTERNS --- | |
| # Applied to: asset_identifier, asset_type, instruction text | |
| # These indicate a real AI feature/product in the program's target list. | |
| SCOPE_KEYWORD_PATTERNS = [ | |
| # AI product/feature labels | |
| r'\bai[\s/]+ml\s+feature', # AI/ML Features | |
| r'\bai\s+feature', # AI Features | |
| r'\bml\s+feature', # ML Features | |
| r'\bai\s+scope\b', # AI Scope | |
| r'\bai[\s-]?powered\b', # AI-Powered / AI Powered | |
| r'\bgenai\b', # GenAI | |
| r'\bgenerative\s+ai\b', # Generative AI | |
| # AI product types | |
| r'\bai\s*chatbot\b', # AI ChatBot | |
| r'\bchatbot\b', # Chatbot | |
| r'\bchat[\s-]?bot\b', # Chat Bot / Chat-Bot | |
| r'\bcopilot\b', # Copilot | |
| r'\bco-pilot\b', # Co-pilot | |
| r'\bai\s+assistant\b', # AI Assistant | |
| r'\bai\s+agent\b', # AI Agent | |
| # LLM references | |
| r'\bllm\b', # LLM | |
| r'\blarge\s+language\s+model\b', # Large Language Model | |
| # Specific AI products | |
| r'\bchatgpt\b', # ChatGPT | |
| r'\bopenai\b', # OpenAI | |
| r'\bgpt[-\s]?\d\b', # GPT-4, GPT 4 | |
| # AI security | |
| r'\bprompt\s+injection\b', # Prompt Injection | |
| ] | |
| SCOPE_PATTERN = re.compile('|'.join(SCOPE_KEYWORD_PATTERNS), re.IGNORECASE) | |
| # --- DOMAIN PATTERNS --- | |
| # Applied to: asset_identifier (URLs/domains) | |
| # Catches domains like company.ai, ai.company.com, chat-ai.company.com | |
| DOMAIN_AI_PATTERNS = [ | |
| r'\.ai$', # ends in .ai (e.g. company.ai) | |
| r'\.ai[:/]', # .ai followed by : or / (e.g. company.ai/path) | |
| r'[./]ai\.', # .ai. or /ai. subdomain (e.g. ai.company.com) | |
| r'[-.]ai[-.]', # -ai- or -ai. in domain (e.g. chat-ai.company.com) | |
| ] | |
| DOMAIN_PATTERN = re.compile('|'.join(DOMAIN_AI_PATTERNS), re.IGNORECASE) | |
| # --- POLICY TEXT PATTERNS --- | |
| # Applied to: free-form policy/description text | |
| # Same core keywords but used with false-positive filtering | |
| POLICY_KEYWORD_PATTERNS = [ | |
| r'\bai[\s/]+ml\s+feature', # AI/ML Features | |
| r'\bai\s+feature', # AI Features | |
| r'\bml\s+feature', # ML Features | |
| r'\bai\s+scope\b', # AI Scope | |
| r'\bai[\s-]?powered\b', # AI-Powered | |
| r'\bgenai\b', # GenAI | |
| r'\bgenerative\s+ai\b', # Generative AI | |
| r'\bai\s*chatbot\b', # AI ChatBot | |
| r'\bchatbot\b', # Chatbot | |
| r'\bchat[\s-]?bot\b', # Chat Bot | |
| r'\bcopilot\b', # Copilot | |
| r'\bco-pilot\b', # Co-pilot | |
| r'\bai\s+assistant\b', # AI Assistant | |
| r'\bai\s+agent\b', # AI Agent | |
| r'\bllm\b', # LLM | |
| r'\blarge\s+language\s+model\b', # Large Language Model | |
| r'\bprompt\s+injection\b', # Prompt Injection | |
| ] | |
| POLICY_PATTERN = re.compile('|'.join(POLICY_KEYWORD_PATTERNS), re.IGNORECASE) | |
| # --- FALSE POSITIVE BLOCKLIST --- | |
| # Phrases where AI keywords appear in context of reporting rules, tool usage | |
| # policies, or quality warnings — NOT about AI features being in scope. | |
| FALSE_POSITIVE_PHRASES = [ | |
| # Report quality / AI slop warnings | |
| r'ai\s+slop', | |
| r'ai\s+generated\s+report', | |
| r'auto[\s-]?generated', | |
| r'generated\s+(largely\s+)?by\s+(llm|ai|chatgpt)', | |
| r'ai\s+hallucinate', | |
| r'hallucinated', | |
| # Policy warnings about using AI tools for reporting | |
| r'don.t\s+leak.*ai\s+service', | |
| r'leak.*to\s+(any\s+)?(saas|ai\s+service|chatgpt|llm)', | |
| r'(do\s+not|don.t|never)\s+(use|share|upload|submit|leak).*\b(ai|chatgpt|llm|gpt)\b', | |
| r'ai\s+services?\s+like\s+chatgpt', | |
| r'services?\s+such\s+as\s+(large\s+language\s+model|ai|llm|chatgpt)', | |
| r'(run|use)\s+(locally|on\s+your\s+own)', | |
| r'browser\s+plugin.*translation', | |
| r'large\s+language\s+model.{0,30}(leak|local|own\s+hardware)', | |
| # Report generation disclaimers | |
| r'(report|submission).{0,40}(generated|written|created)\s+(by|using|with)\s+(ai|llm|chatgpt|gpt)', | |
| r'(ai|llm|chatgpt).{0,30}(generated|written|created)\s+(report|submission)', | |
| r'without\s+careful\s+review', | |
| r'additional\s+work\s+on\s+our\s+side', | |
| r'invalid\s+report', | |
| # General tool usage disclaimers | |
| r'(use|using)\s+(of\s+)?(ai|llm).{0,30}(prohibited|not\s+allowed|forbidden|banned)', | |
| r'(ai|llm).{0,20}(tool|service|platform).{0,30}(prohibited|not\s+allowed|forbidden|banned)', | |
| ] | |
| FALSE_POSITIVE_PATTERN = re.compile('|'.join(FALSE_POSITIVE_PHRASES), re.IGNORECASE) | |
def check_scope_keywords(text):
    """
    Scan scope-item text (asset_identifier + asset_type + instruction)
    for AI/LLM keywords.

    Returns a sorted list of distinct, lower-cased matched keywords
    (empty list for empty/None input).
    """
    if not text:
        return []
    hits = {m.group().strip().lower() for m in SCOPE_PATTERN.finditer(text)}
    return sorted(hits)
def check_domain_ai(domain):
    """
    Decide whether a domain/URL looks AI-related (.ai TLD, "ai." label, etc.).

    Returns the matched fragment (whitespace-stripped) or None when the
    input is empty or contains no AI-looking pattern.
    """
    if not domain:
        return None
    hit = DOMAIN_PATTERN.search(domain)
    return hit.group().strip() if hit else None
def check_policy_keywords(text):
    """
    Scan free-form policy text for AI/LLM keywords.

    Returns a sorted list of distinct, lower-cased matched keywords
    (empty list for empty/None input). Callers are expected to filter
    false positives separately via is_false_positive_context().
    """
    if not text:
        return []
    hits = {m.group().strip().lower() for m in POLICY_PATTERN.finditer(text)}
    return sorted(hits)
def is_false_positive_context(text):
    """
    Tell whether *text* mentions AI only in a false-positive context —
    reporting rules, tool-usage policies, or report-quality warnings —
    rather than AI features actually being in scope.

    Empty/None input is never a false positive (returns False).
    """
    if not text:
        return False
    return FALSE_POSITIVE_PATTERN.search(text) is not None
| # ============================================================================= | |
| # HACKERONE API CLIENT | |
| # ============================================================================= | |
class HackerOneAPI:
    """Official HackerOne Hacker API v1 client.

    Thin wrapper over `requests` using HTTP basic auth (username, API token),
    a fixed inter-request sleep as a crude rate limiter, and Retry-After-aware
    handling of 429 responses. `request_count` tracks every HTTP request
    issued (including retries) so the caller can report usage at the end.
    """
    BASE_URL = "https://api.hackerone.com/v1/hackers"
    def __init__(self, username, token, delay=0.1):
        # Basic-auth credential pair, passed to requests on every call.
        self.auth = (username, token)
        # Seconds slept before each request (simple client-side throttling).
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'application/json',
        })
        # Total HTTP requests made, including rate-limit retries.
        self.request_count = 0
    def _get(self, endpoint, params=None):
        """Make authenticated GET request with rate limiting.

        Returns the decoded JSON body on HTTP 200, or None on any other
        failure. 429 responses are retried (recursively) after honoring the
        Retry-After header; 401 aborts the whole program via sys.exit since
        nothing can proceed without valid credentials.
        """
        time.sleep(self.delay)
        self.request_count += 1
        url = f"{self.BASE_URL}{endpoint}"
        try:
            resp = self.session.get(url, auth=self.auth, params=params, timeout=30)
            if resp.status_code == 429:
                # NOTE(review): assumes Retry-After is numeric seconds — an
                # HTTP-date value would crash int(); confirm against the API.
                wait = int(resp.headers.get('Retry-After', 60))
                print(f"\n [Rate limited] Waiting {wait}s...")
                time.sleep(wait)
                # Recursive retry; each retry also sleeps and bumps the count.
                return self._get(endpoint, params)
            if resp.status_code == 401:
                print(f"\n [!] Authentication failed. Check your username and token.")
                sys.exit(1)
            if resp.status_code != 200:
                return None
            return resp.json()
        except requests.exceptions.RequestException as e:
            print(f"\n [Error] {e}")
            return None
    def _get_paginated(self, endpoint, params=None):
        """Fetch all pages from a paginated endpoint.

        Follows JSON:API-style `links.next` URLs until exhausted. Query
        params are only sent with the first request — the `next` URL already
        embeds them. Returns the concatenated `data` arrays; on any error
        mid-way the pages collected so far are returned (best-effort).
        """
        all_data = []
        url = f"{self.BASE_URL}{endpoint}"
        page_params = params or {}
        while url:
            time.sleep(self.delay)
            self.request_count += 1
            try:
                resp = self.session.get(url, auth=self.auth, params=page_params, timeout=30)
                if resp.status_code == 429:
                    wait = int(resp.headers.get('Retry-After', 60))
                    print(f"\n [Rate limited] Waiting {wait}s...")
                    time.sleep(wait)
                    continue  # retry the same page
                if resp.status_code != 200:
                    break  # give up; return what we collected so far
                data = resp.json()
                items = data.get('data', [])
                all_data.extend(items)
                # Follow pagination
                next_url = data.get('links', {}).get('next')
                if next_url:
                    url = next_url
                    page_params = {}  # URL already has params
                else:
                    break
            except requests.exceptions.RequestException:
                break
        return all_data
    def get_programs(self):
        """Fetch all public bounty programs with policy text."""
        print(" Fetching programs via API...", flush=True)
        programs = self._get_paginated('/programs', params={
            'page[size]': 100,
        })
        print(f" Got {len(programs)} programs", flush=True)
        return programs
    def get_structured_scopes(self, handle):
        """Fetch structured scopes for a program (includes created_at dates)."""
        scopes = self._get_paginated(f'/programs/{handle}/structured_scopes', params={
            'page[size]': 100,
        })
        return scopes
| # ============================================================================= | |
| # BOUNTY TARGETS DATA (optional supplement) | |
| # ============================================================================= | |
# Canonical raw-GitHub URL for arkadiyt/bounty-targets-data's HackerOne dump.
# (Fixed: the previous value pointed at "raw.githubusercontent.com", a mangled
# mirror-proxy hostname instead of raw.githubusercontent.com.)
BOUNTY_TARGETS_URL = "https://raw.githubusercontent.com/arkadiyt/bounty-targets-data/main/data/hackerone_data.json"
def fetch_bounty_targets():
    """Fetch HackerOne data from the bounty-targets-data repo.

    Returns the parsed JSON list on HTTP 200, or None on any failure —
    this source is strictly best-effort supplemental data, so all errors
    (network, timeout, bad JSON) are reported and swallowed.
    """
    print(" Downloading bounty-targets-data...")
    try:
        resp = requests.get(BOUNTY_TARGETS_URL, timeout=120)
        if resp.status_code == 200:
            data = resp.json()
            print(f" Got {len(data)} programs from bounty-targets-data")
            return data
    except Exception as e:
        print(f" [!] Failed: {e}")
    return None
| # ============================================================================= | |
| # SCANNER | |
| # ============================================================================= | |
def scan_programs(api, max_programs=None, include_bounty_targets=False):
    """
    Scan all HackerOne programs for AI/LLM in scope and policy.

    Args:
        api: HackerOneAPI client (uses get_programs / get_structured_scopes).
        max_programs: optional cap on how many programs to scan (testing aid).
        include_bounty_targets: also consult the bounty-targets-data dump for
            programs whose official API data showed no AI signal.

    Returns:
        List of per-program finding dicts (only programs with >= 1 AI match),
        each carrying policy matches, scope matches, and the earliest AI
        scope creation date when known.
    """
    results = []

    # Phase 1: fetch every program via the API (policy text comes back inline).
    programs = api.get_programs()
    if not programs:
        print(" [!] No programs returned from API")
        return results
    if max_programs:
        programs = programs[:max_programs]

    # Phase 1b: optionally index the bounty-targets-data dump by handle.
    bt_data = {}
    if include_bounty_targets:
        bt_raw = fetch_bounty_targets()
        if bt_raw:
            for p in bt_raw:
                handle = p.get('handle', '')
                if handle:
                    bt_data[handle] = p

    total = len(programs)
    print(f"\n Scanning {total} programs for AI/LLM keywords...", flush=True)
    print(f" (checking policy text + structured scopes with dates)\n", flush=True)

    for i, program_data in enumerate(programs):
        attrs = program_data.get('attributes', {})
        handle = attrs.get('handle', '')
        name = attrs.get('name', handle)
        policy = attrs.get('policy', '') or ''
        state = attrs.get('state', '')
        offers_bounties = attrs.get('offers_bounties', False)
        if not handle:
            continue

        # HackerOne: "soft_launched" = private/invite-only, anything else = public.
        is_public = state != 'soft_launched'

        # Progress line every 50 programs; private handles are redacted.
        if (i + 1) % 50 == 0 or i == 0:
            progress_name = handle if is_public else '[REDACTED]'
            print(f" [{i+1}/{total}] {progress_name} ({'public' if is_public else 'private'})...", flush=True)

        ai_findings = {
            'program': name,
            'handle': handle,
            'url': f"https://hackerone.com/{handle}",
            'offers_bounties': offers_bounties,
            'state': state,
            'visibility': 'public' if is_public else 'private',
            'policy_matches': [],
            'scope_matches': [],
            'earliest_ai_date': None,
        }
        has_ai = False

        # --- Check policy text (fast — already in memory) ---
        policy_matches = check_policy_keywords(policy)
        if policy_matches:
            # Grab an ~160-char context snippet per keyword, skipping
            # false-positive contexts (reporting rules, AI-slop warnings).
            snippets = []
            real_matches = []
            for kw in policy_matches:
                kw_pattern = re.compile(re.escape(kw), re.IGNORECASE)
                for m in kw_pattern.finditer(policy):
                    start = max(0, m.start() - 80)
                    end = min(len(policy), m.end() + 80)
                    snippet = policy[start:end].replace('\n', ' ').strip()
                    if is_false_positive_context(snippet):
                        continue  # Skip — reporting rules, not AI in scope
                    if snippet not in [s.strip('.') for s in snippets]:
                        snippets.append(f"...{snippet}...")
                        real_matches.append(kw)
                        break  # one snippet per keyword is enough
            if real_matches:
                has_ai = True
                ai_findings['policy_matches'] = sorted(set(real_matches))
                ai_findings['policy_snippets'] = snippets[:5]

        # --- Check structured scopes (API call — with created_at dates) ---
        scopes = api.get_structured_scopes(handle)
        for scope_data in scopes:
            scope_attrs = scope_data.get('attributes', {})
            asset_id = scope_attrs.get('asset_identifier', '') or ''
            asset_type = scope_attrs.get('asset_type', '') or ''
            instruction = scope_attrs.get('instruction', '') or ''
            eligible = scope_attrs.get('eligible_for_bounty', False)
            created_at = scope_attrs.get('created_at', '') or ''
            updated_at = scope_attrs.get('updated_at', '') or ''

            # Check 1: keyword match in asset_id + asset_type + instruction
            search_text = f"{asset_id} {asset_type} {instruction}"
            kw_matches = check_scope_keywords(search_text)
            # Check 2: domain pattern match (.ai TLD, ai. subdomain, etc.)
            domain_match = check_domain_ai(asset_id)

            matches = list(kw_matches)
            if domain_match:
                matches.append(f".ai domain ({domain_match})")
            if matches:
                # Instruction text that only talks about reporting rules is noise.
                if is_false_positive_context(instruction):
                    continue
                has_ai = True
                scope_date = created_at[:10] if created_at else ''
                ai_findings['scope_matches'].append({
                    'keywords': matches,
                    'asset_identifier': asset_id,
                    'asset_type': asset_type,
                    'instruction': instruction[:300],
                    'eligible_for_bounty': eligible,
                    'created_at': scope_date,
                    'updated_at': (updated_at[:10] if updated_at else ''),
                })
                # Track earliest AI scope date (ISO date strings sort lexically).
                if scope_date:
                    if not ai_findings['earliest_ai_date'] or scope_date < ai_findings['earliest_ai_date']:
                        ai_findings['earliest_ai_date'] = scope_date

        # --- Fall back to bounty-targets-data when the API showed nothing ---
        if not has_ai and handle in bt_data:
            bt_program = bt_data[handle]
            targets = bt_program.get('targets', {}) or {}
            for target in (targets.get('in_scope', []) or []):
                asset_id = target.get('asset_identifier', '') or ''
                asset_type = target.get('asset_type', '') or ''
                instruction = target.get('instruction', '') or ''
                search_text = f"{asset_id} {asset_type} {instruction}"
                kw_matches = check_scope_keywords(search_text)
                domain_match = check_domain_ai(asset_id)
                matches = list(kw_matches)
                if domain_match:
                    matches.append(f".ai domain ({domain_match})")
                if matches:
                    has_ai = True
                    ai_findings['scope_matches'].append({
                        'keywords': matches,
                        'asset_identifier': asset_id,
                        'asset_type': asset_type,
                        'instruction': instruction[:300],
                        'eligible_for_bounty': target.get('eligible_for_bounty', False),
                        'created_at': '',  # this source carries no dates
                        'updated_at': '',
                        'source': 'bounty-targets-data',
                    })

        if has_ai:
            results.append(ai_findings)
            all_kw = set(ai_findings['policy_matches'])
            for s in ai_findings['scope_matches']:
                all_kw.update(s['keywords'])
            sources = []
            if ai_findings['policy_matches']:
                sources.append('policy')
            if ai_findings['scope_matches']:
                sources.append('scope')
            date_str = ai_findings['earliest_ai_date'] or 'no date'
            vis = ai_findings['visibility'].upper()
            display_name = '[REDACTED]' if not is_public else name
            print(f" ✓ [{vis}] {display_name} [{', '.join(sources)}] "
                  f"keywords: {', '.join(sorted(all_kw))} | date: {date_str}", flush=True)

    # Summary
    public_count = len([r for r in results if r['visibility'] == 'public'])
    private_count = len([r for r in results if r['visibility'] == 'private'])
    print(f"\n Summary: {len(results)} programs with AI/LLM "
          f"({public_count} public, {private_count} private)")
    return results
| # ============================================================================= | |
| # OUTPUT: TREND TIMELINE | |
| # ============================================================================= | |
def print_trend_timeline(results):
    """Render the month-by-month AI adoption trend as a console bar chart.

    Programs without a scope creation date are listed separately at the
    bottom; private program names are redacted everywhere.
    """
    dated = []
    undated = []
    for entry in results:
        when = entry.get('earliest_ai_date')
        if when:
            label = entry['program'] if entry.get('visibility') == 'public' else '[REDACTED]'
            dated.append((label, when))
        else:
            undated.append(entry)
    if not dated:
        print("\n No date data available for timeline.")
        return

    # Bucket program names by "YYYY-MM".
    monthly = defaultdict(list)
    for label, when in dated:
        monthly[when[:7]].append(label)

    peak = max(len(v) for v in monthly.values())
    print("\n" + "=" * 80)
    print(" AI/LLM ADOPTION TIMELINE — HACKERONE BUG BOUNTY PROGRAMS")
    print("=" * 80)
    print(f"\n {'Month':<10} {'New':>4} {'Total':>6} {'Bar':<30} Programs")
    print(f" {'─' * 76}")

    running = 0
    for month in sorted(monthly):
        added = monthly[month]
        running += len(added)
        # Bar scaled to the busiest month, capped at 30 chars.
        bar = '█' * int((len(added) / max(peak, 1)) * 30)
        shown = ', '.join(added[:3])
        if len(added) > 3:
            shown += f", +{len(added) - 3} more"
        print(f" {month:<10} {len(added):>4} {running:>6} {bar:<30} {shown}")

    print(f"\n {'─' * 76}")
    print(f" {'TOTAL':<10} {running:>4}")

    if undated:
        print(f"\n + {len(undated)} programs with AI in scope but no scope creation date")
        for entry in undated[:10]:
            shown = entry['program'] if entry.get('visibility') == 'public' else '[REDACTED]'
            print(f" - {shown} (found in: {'policy' if entry['policy_matches'] else 'scope'})")
        if len(undated) > 10:
            print(f" ... and {len(undated) - 10} more")
def print_program_list(results):
    """Dump every matched program with its keywords, snippets, and scope items.

    Programs are ordered chronologically by earliest AI date (undated ones
    last, ties broken by name); private programs have name/URL redacted.
    """
    print("\n" + "=" * 80)
    print(" ALL HACKERONE PROGRAMS WITH AI/CHATBOT/LLM")
    print("=" * 80)
    if not results:
        print("\n No programs found.\n")
        return
    print(f"\n Total: {len(results)} programs\n")

    def sort_key(entry):
        # 'zzzz' pushes undated programs after any real ISO date.
        return (entry.get('earliest_ai_date') or 'zzzz', entry['program'])

    for entry in sorted(results, key=sort_key):
        keywords = set(entry.get('policy_matches', []))
        for item in entry.get('scope_matches', []):
            keywords.update(item.get('keywords', []))
        hidden = entry.get('visibility', '') != 'public'
        vis_tag = f"[{entry.get('visibility', 'unknown').upper()}]"
        shown = '[REDACTED]' if hidden else entry['program']

        print(f" {'─' * 70}")
        print(f" {vis_tag} {shown}")
        if not hidden:
            print(f" https://hackerone.com/{entry['handle']}")
        if entry.get('earliest_ai_date'):
            print(f" AI scope added: {entry['earliest_ai_date']}")
        print(f" Keywords: {', '.join(sorted(keywords))}")
        if entry.get('policy_matches'):
            print(f" Found in policy: {', '.join(entry['policy_matches'])}")
            for snippet in entry.get('policy_snippets', [])[:2]:
                print(f" \"{snippet}\"")
        if entry.get('scope_matches'):
            print(f" Scope items ({len(entry['scope_matches'])}):")
            for item in entry['scope_matches'][:5]:
                bounty = ' [BOUNTY]' if item.get('eligible_for_bounty') else ''
                date = f" (added: {item['created_at']})" if item.get('created_at') else ''
                print(f" • {item['asset_identifier']} ({item['asset_type']}){bounty}{date}")
                if item.get('instruction'):
                    print(f" {item['instruction'][:100]}")
        print()
def save_csv(results, filename):
    """Save results to CSV — one row per keyword match, with separate rows
    for policy matches vs scope matches.

    Fixes vs previous version:
    - The confirmation message now actually prints the output filename.
    - Policy rows fall back to 'Unknown' when earliest_ai_date is None
      (the key is always present, so a .get() default never fired and
      blank dates were written).
    """
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Program Name', 'Handle', 'Visibility', 'Date AI Was Introduced',
            'Keyword Matched', 'Where (Policy or Scope)'
        ])
        for r in results:
            # One row per policy keyword match.
            for kw in r.get('policy_matches', []):
                writer.writerow([
                    r['program'],
                    r['handle'],
                    r.get('visibility', 'unknown'),
                    r.get('earliest_ai_date') or 'Unknown',
                    kw,
                    'Policy',
                ])
            # One row per scope keyword match; prefer the item's own date.
            for s in r.get('scope_matches', []):
                scope_date = s.get('created_at', '') or r.get('earliest_ai_date', '') or 'Unknown'
                for kw in s.get('keywords', []):
                    writer.writerow([
                        r['program'],
                        r['handle'],
                        r.get('visibility', 'unknown'),
                        scope_date,
                        kw,
                        'Scope',
                    ])
    print(f" Programs CSV: {filename}")
def save_timeline_csv(results, filename):
    """Save the month-by-month adoption timeline to CSV.

    Columns: month (YYYY-MM), new programs that month, cumulative total,
    and a '; '-joined list of program names added. Programs without an
    earliest_ai_date are excluded. (Fixed: the confirmation message now
    prints the actual output filename.)
    """
    dated = [(r['program'], r['earliest_ai_date'])
             for r in results if r.get('earliest_ai_date')]
    monthly = defaultdict(list)
    for name, date_str in dated:
        monthly[date_str[:7]].append(name)
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Month', 'New Programs', 'Cumulative Total', 'Programs Added'])
        cumulative = 0
        for month in sorted(monthly.keys()):
            programs = monthly[month]
            cumulative += len(programs)
            writer.writerow([month, len(programs), cumulative, '; '.join(programs)])
    print(f" Timeline CSV: {filename}")
def save_json(results, filename):
    """Save the full scan (summary + timeline + per-program detail) to JSON.

    Fixed: the confirmation message now prints the actual output filename.
    """
    dated = [(r['program'], r['earliest_ai_date'])
             for r in results if r.get('earliest_ai_date')]
    monthly = defaultdict(list)
    for name, date_str in dated:
        monthly[date_str[:7]].append(name)
    output = {
        'scan_date': datetime.now().isoformat(),
        'platform': 'HackerOne',
        'summary': {
            'total_programs_with_ai': len(results),
            'programs_with_dates': len(dated),
            'programs_without_dates': len(results) - len(dated),
        },
        # Month -> {count, program names}, in chronological key order.
        'timeline': {month: {
            'count': len(programs),
            'programs': programs,
        } for month, programs in sorted(monthly.items())},
        'programs': results,
    }
    with open(filename, 'w', encoding='utf-8') as f:
        # default=str keeps any stray non-JSON value serializable.
        json.dump(output, f, indent=2, ensure_ascii=False, default=str)
    print(f" Full JSON: {filename}")
| # ============================================================================= | |
| # MAIN | |
| # ============================================================================= | |
def main():
    """CLI entry point: parse arguments, verify credentials, scan, report, save.

    Exits with status 1 when credentials are missing or the initial API
    probe fails. Writes <output>.csv, ai_timeline.csv, and <output>.json.
    """
    parser = argparse.ArgumentParser(
        description='Find when HackerOne programs added AI/LLM to scope (uses official API)'
    )
    parser.add_argument('--username', type=str,
                        default=os.environ.get('H1_USERNAME', ''),
                        help='HackerOne API username (or set H1_USERNAME env var)')
    parser.add_argument('--token', type=str,
                        default=os.environ.get('H1_TOKEN', ''),
                        help='HackerOne API token (or set H1_TOKEN env var)')
    parser.add_argument('--max-programs', type=int, default=None,
                        help='Max programs to scan (for testing)')
    parser.add_argument('--delay', type=float, default=0.1,
                        help='Delay between API requests in seconds (default: 0.1)')
    parser.add_argument('--output', type=str, default='ai_bounty_programs',
                        help='Output filename prefix')
    parser.add_argument('--include-bounty-targets', action='store_true',
                        help='Also check bounty-targets-data repo for extra coverage')
    args = parser.parse_args()

    # Credentials are mandatory — bail out early with usage help.
    if not args.username or not args.token:
        print("Error: HackerOne API credentials required.")
        print()
        print(" Option 1: Pass as arguments")
        print(" python bb_ai_changelog_scraper.py --username YOUR_USER --token YOUR_TOKEN")
        print()
        print(" Option 2: Set environment variables")
        print(" export H1_USERNAME=your_username")
        print(" export H1_TOKEN=your_token")
        print(" python bb_ai_changelog_scraper.py")
        print()
        print(" Get your API token at: https://hackerone.com/settings/api_token/edit")
        sys.exit(1)

    print("=" * 80)
    print(" HackerOne AI/LLM Timeline Scanner")
    print(" Using official HackerOne Hacker API (policy text + scope dates)")
    print(f" Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 80)

    api = HackerOneAPI(args.username, args.token, delay=args.delay)

    # Verify credentials with a minimal one-item request before the long scan.
    print("\n Verifying API credentials...")
    test = api._get('/programs', params={'page[size]': 1})
    if test is None:
        print(" [!] API request failed. Check your credentials.")
        sys.exit(1)
    print(" ✓ Credentials valid\n")

    # Scan
    results = scan_programs(
        api,
        max_programs=args.max_programs,
        include_bounty_targets=args.include_bounty_targets,
    )

    # Console report first, then the three output files.
    print_program_list(results)
    print_trend_timeline(results)
    print("\n── Saving files ──────────────────────────────────────────────")
    save_csv(results, f"{args.output}.csv")
    # Timeline filename is fixed (matches the module docstring), not prefixed.
    save_timeline_csv(results, "ai_timeline.csv")
    save_json(results, f"{args.output}.json")

    print(f"\n API requests made: {api.request_count}")
    print(f" Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Total programs with AI/LLM: {len(results)}")
    print()
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment