Created
April 7, 2026 03:44
-
-
Save stanwu/6b30392ec7412b0ee1496c28104a701e to your computer and use it in GitHub Desktop.
Check whether Cloudflare appears to block specific user agents.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import re | |
| import sys | |
| import urllib.error | |
| import urllib.parse | |
| import urllib.request | |
# User-Agent header values to test, keyed by the profile name that the
# --agent command-line option accepts (in addition to "all").
USER_AGENTS = {
    "browser_chrome": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/136.0.0.0 Safari/537.36"
    ),
    "googlebot": (
        "Mozilla/5.0 (compatible; Googlebot/2.1; "
        "+http://www.google.com/bot.html)"
    ),
    "gptbot": "GPTBot/1.2 (+https://openai.com/gptbot)",
    "claudebot": "ClaudeBot/1.0 (+https://www.anthropic.com/claudebot)",
    "perplexitybot": "PerplexityBot/1.0 (+https://www.perplexity.ai/perplexitybot)",
    "curl": "curl/8.7.1",
}
# Response header names that print_result echoes in its report whenever the
# response carries a non-empty value for them.
CF_HEADERS = [
    "server",
    "cf-ray",
    "cf-cache-status",
    "cf-mitigated",
    "location",
    "content-type",
]
def looks_like_cloudflare_block(status, body, headers):
    """Return True when a response looks like a Cloudflare block or challenge.

    Args:
        status: HTTP status code, or None when no response was received.
        body: Decoded (possibly truncated) response body; None is treated
            as an empty body.
        headers: Object with a ``get(name)`` method holding the response
            headers (a plain dict or an ``email.message.Message``).

    Returns:
        bool: True if any of the block heuristics matches.
    """
    server = (headers.get("server") or "").lower()
    body_l = (body or "").lower()
    # A bare 403/429/503 can come from any origin server. Only treat it as
    # a Cloudflare block when the response also carries Cloudflare's own
    # markers; otherwise every non-Cloudflare 403 is a false positive.
    via_cloudflare = "cloudflare" in server or bool(headers.get("cf-ray"))
    return any(
        (
            via_cloudflare and status in (403, 429, 503),
            "cloudflare" in server and "attention required" in body_l,
            "cf-chl" in body_l,
            "challenge-platform" in body_l,
            "sorry, you have been blocked" in body_l,
            bool(headers.get("cf-mitigated")),
        )
    )
def title_from_html(body):
    """Extract the text of the first ``<title>`` element in *body*.

    Runs of whitespace inside the title are collapsed to single spaces and
    the result is trimmed. Returns an empty string when no title is found.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", body, re.IGNORECASE | re.DOTALL)
    if found is None:
        return ""
    raw_title = found.group(1)
    collapsed = re.sub(r"\s+", " ", raw_title)
    return collapsed.strip()
def fetch(url, user_agent, timeout, max_bytes=32768):
    """Request *url* with the given User-Agent and summarize the outcome.

    Never raises for request failures; every outcome is reported through
    the returned dict so callers can print a result per agent.

    Args:
        url: URL to request.
        user_agent: Value sent in the ``User-Agent`` request header.
        timeout: Timeout in seconds passed to the opener.
        max_bytes: Upper bound on the number of body bytes read.

    Returns:
        dict with the keys ``ok`` (bool), ``status`` (int or None),
        ``final_url`` (str), ``headers`` (object with ``get``), ``body``
        (str, decoded as UTF-8 with replacement) and ``error`` (str).
    """
    try:
        # Built inside the try: Request() raises ValueError on a malformed
        # URL, which should be reported like any other failure.
        request = urllib.request.Request(
            url,
            headers={
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            },
        )
        opener = urllib.request.build_opener(urllib.request.HTTPRedirectHandler())
        with opener.open(request, timeout=timeout) as response:
            body_bytes = response.read(max_bytes)
            body = body_bytes.decode("utf-8", errors="replace")
            return {
                "ok": True,
                "status": response.status,
                "final_url": response.geturl(),
                "headers": response.headers,
                "body": body,
                "error": "",
            }
    except urllib.error.HTTPError as exc:
        # Reading the error body can itself fail (timeout, reset). Guard it
        # so the status and headers are still reported, and always release
        # the underlying connection.
        try:
            body = exc.read(max_bytes).decode("utf-8", errors="replace")
        except Exception:  # noqa: BLE001
            body = ""
        finally:
            exc.close()
        return {
            "ok": False,
            "status": exc.code,
            "final_url": exc.geturl(),
            "headers": exc.headers,
            "body": body,
            "error": f"HTTPError {exc.code}",
        }
    except Exception as exc:  # noqa: BLE001
        # Deliberate best-effort: DNS, TLS, timeout and URL errors are all
        # reported in the result instead of aborting the run.
        return {
            "ok": False,
            "status": None,
            "final_url": "",
            "headers": {},
            "body": "",
            "error": repr(exc),
        }
def print_result(name, result):
    """Print a human-readable report for one agent's fetch result.

    Args:
        name: Agent profile name shown in the section heading.
        result: Dict produced by ``fetch`` (keys ``status``, ``final_url``,
            ``headers``, ``body`` and ``error`` are read).
    """
    response_headers = result["headers"]
    body_text = result["body"]
    page_title = title_from_html(body_text)
    blocked = looks_like_cloudflare_block(result["status"], body_text, response_headers)
    verdict = "yes" if blocked else "no"

    report = [
        f"=== {name} ===",
        f"status: {result['status']}",
        f"final_url: {result['final_url'] or '-'}",
        f"cloudflare_block_suspected: {verdict}",
    ]
    if page_title:
        report.append(f"title: {page_title}")
    if result["error"]:
        report.append(f"error: {result['error']}")

    # Only headers that are present with a non-empty value are shown.
    for header_name in CF_HEADERS:
        header_value = response_headers.get(header_name)
        if header_value:
            report.append(f"{header_name}: {header_value}")

    if body_text:
        # Preview: first 240 characters with whitespace runs collapsed.
        preview = re.sub(r"\s+", " ", body_text[:240]).strip()
        report.append(f"body_snippet: {preview}")

    # Trailing empty entry yields the blank separator line after the report.
    report.append("")
    print("\n".join(report))
def main():
    """Parse the command line, then fetch and report once per selected agent.

    Exits with status 2 when ``--url`` does not use the http or https scheme.
    """
    cli = argparse.ArgumentParser(
        description="Check whether Cloudflare appears to block specific user agents."
    )
    cli.add_argument("--url", required=True, help="URL to test")
    cli.add_argument(
        "--agent",
        choices=["all", *USER_AGENTS],
        default="all",
        help="Agent profile to test",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=15.0,
        help="Request timeout in seconds",
    )
    options = cli.parse_args()

    target = options.url
    scheme = urllib.parse.urlparse(target).scheme
    if scheme not in ("http", "https"):
        print("error: --url must start with http:// or https://", file=sys.stderr)
        raise SystemExit(2)

    # "all" runs every known profile; otherwise just the one requested.
    if options.agent == "all":
        selected = list(USER_AGENTS.items())
    else:
        selected = [(options.agent, USER_AGENTS[options.agent])]

    for profile, ua_string in selected:
        outcome = fetch(target, ua_string, options.timeout)
        print_result(profile, outcome)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment