Last active
July 30, 2024 15:05
-
-
Save nickali/1ab1ff7ccacd1df531d4bf89a771f071 to your computer and use it in GitHub Desktop.
Block AI and SEO bots, scrapers, spiders, and crawlers with Cloudflare Pages Functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Cloudflare provides the ability to block some AI crawlers and
scrapers. They also maintain a list of bots approved to scan
sites: https://radar.cloudflare.com/traffic/verified-bots.
There are plenty of bots on the verified list I would like to block,
plus other SEO crawlers.
If you are hosting a site on Cloudflare Pages, here is a function
that will block a more expansive set of services. It checks the
user agent against a list and, if there is a match, it returns a 404.
Otherwise, it delivers whatever file was requested.
Note: this will run on every single request. Cloudflare's free plan
allows for 100,000 free function and worker requests (total) every day.
Create a directory called functions at the root of your repository.
Then create a file called _middleware.js in the functions directory
and put this code in there.
blockedUserAgents is the array of strings the user agent is being
compared against. The list of user agents was gathered from a
whole bunch of different places. Feel free to modify it to suit your needs.
I'm also open to suggestions on how to make this faster. But anecdotally,
I haven't noticed any issues with speed.
*/
// Substrings that are matched, case-sensitively, against the incoming
// User-Agent header. Defined at module scope so the array is created once
// when the module loads rather than on every request.
const blockedUserAgents = ["01h4x.com", "360Spider", "404checker", "404enemy", "80legs", "ADmantX", "AIBOT", "ALittle Client", "ASPSeek", "Abonti", "Aboundex", "Aboundexbot", "Acunetix", "Acunetix Security Scanner", "Acunetix Web Vulnerability Scanner", "AdsBot-Google", "AdsBot-Google-Mobile", "AdsTxtCrawler", "AdsTxtCrawlerTP", "AfD-Verbotsverfahren", "AhrefsBot", "AhrefsSiteBot", "AiHitBot", "Aipbot", "Alexibot", "AllSubmitter", "Alligator", "AlphaBot", "Amazonbot", "Anarchie", "Anarchy", "Anarchy99", "Ankit", "Anthill", "anthropic-ai", "Apexoo", "Applebot-Extended", "Aspiegel", "Asterias", "Attach", "AwarioBot", "AwarioRssBot", "AwarioSmartBot", "Baiduspider-image", "BBBike", "BDCbot", "BDFetch", "BLEXBot", "BackDoorBot", "BackStreet", "BackWeb", "Backlink-Ceck", "BacklinkCrawler", "Badass", "Bandit", "Barkrowler", "BatchFTP", "Battleztar Bazinga", "BetaBot", "Bigfoot", "Bitacle", "Black Hole", "BlackWidow", "Blackboard", "Blow", "BlowFish", "Boardreader", "Bolt", "BotALot", "Botify", "BrandVerity", "Brandprotect", "Brandwatch", "BrightEdge Crawler", "Buck", "Buddy", "BuiltBotTough", "BuiltWith", "Bullseye", "BunnySlippers", "BuzzSumo", "Bytespider", "CATExplorador", "CCBot", "CISPA Webcrawler", "CODE87", "CSHttp", "Calculon", "CazoodleBot", "Cegbfeieh", "CensysInspect", "ChatGPT-User", "CheTeam", "CheeseBot", "CherryPicker", "CherryPickerElite", "CherryPickerSE", "ChinaClaw", "Chlooe", "Cincraw-bot", "Citoid", "Claritybot", "Claude-Web", "ClaudeBot", "Clickagy", "Cliqzbot", "Cloud mapping", "Cocolyzebot", "Cogentbot", "Collector", "ContextAd Bot", "Cookiebot", "Copier", "CopyRightCheck", "Copyscape", "Cosmos", "Craftbot", "Crawling at Home Project", "CrazyWebCrawler", "Crescent", "CriteoBot", "CrunchBot", "Curious", "Custo", "CyotekWebCopy", "DBLBot", "DIIbot", "DSearch", "DTS Agent", "DataCha0s", "DataForSeoBot", "DatabaseDriverMysqli", "Dataprovider.com", "Demon", "Deusu", "Devil", "Diffbot", "Digincore", "DigitalPebble", "Dirbuster", "Disco", "Discobot",
  "Discoverybot", "Dispatch", "DittoSpyder", "DnBCrawler-Analytics", "DnyzBot", "DomCopBot", "DomainAppender", "DomainCrawler", "DomainSigmaCrawler", "DomainStatsBot", "Domains Project", "Dotbot", "Download Ninja", "Download Wonder", "Dragonfly", "Drip", "ECCP/1.0", "EMail Siphon", "EMail Wolf", "EasyDL", "Ebingbong", "Ecxi", "EirGrabber", "EroCrawler", "Evil", "Exabot", "Express WebPictures", "ExtLinksBot", "Extractor", "ExtractorPro", "Extreme Picture Finder", "EyeNetIE", "Ezooms", "FDM", "FHscan", "FacebookBot", "Facebot", "FairAd Client", "FemtosearchBot", "Fimap", "Firefox/7.0", "Flaming AttackBot", "FlashGet", "Flunky", "Foobot", "Freeuploader", "FrontPage", "Fuzz", "FyberSpider", "Fyrebot", "G-i-g-a-b-o-t", "GPTBot", "GT::WWW", "GalaxyBot", "Genieo", "GermCrawler", "GetRight", "GetWeb", "Getintent", "Gigabot", "Go!Zilla", "Go-Ahead-Got-It", "GoZilla", "Google-Adwords-Instant", "Google-Extended", "GoogleOther", "Googlebot-Image", "Gotit", "GrabNet", "Grabber", "Grafula", "GrapeFX", "GrapeshotCrawler", "GridBot", "HEADMasterSEO", "HMView", "HTMLparser", "HTTP::Lite", "HTTrack", "Haansoft", "HaosouSpider", "Harvest", "Havij", "Heritrix", "Hloader", "HonoluluBot", "Humanlinks", "HybridBot", "IDBTE4M", "IDBot", "IRLbot", "Iblog", "Id-search", "IlseBot", "Image Fetch", "Image Sucker", "ImagesiftBot", "IndeedBot", "Indy Library", "InfoNaviRobot", "InfoTekies", "Information Security Team InfraSec Scanner", "InfraSec Scanner", "Intelliseek", "InterGET", "InternetMeasurement", "InternetSeer", "Internet Ninja", "Iria", "Iskanie", "IstellaBot", "JOC Web Spider", "JamesBOT", "Jbrofuzz", "JennyBot", "JetCar", "Jetty", "JikeSpider", "Joomla", "Jorgee", "JustView", "Jyxobot", "Kenjin Spider", "Keybot Translation-Search-Machine", "Keyword Density", "Kinza", "Kozmosbot", "LNSpiderguy", "LWP::Simple", "Lanshanbot", "Larbin", "Leap", "LeechFTP", "LeechGet", "LexiBot", "Lftp", "LibWeb", "Libwhisker", "LieBaoFast", "Lightspeedsystems", "Likse", "LinkCheck by Siteimprove.com",
  "LinkScan", "LinkWalker", "Linkbot", "LinkextractorPro", "LinkpadBot", "LinksManager", "LinqiaMetadataDownloaderBot", "LinqiaRSSBot", "LinqiaScrapeBot", "Lipperhey", "Lipperhey Spider", "Litemage_walker", "Lmspider", "Ltx71", "MFC_Tear_Sample", "MIDown tool", "MIIxpc", "MJ12bot", "MQQBrowser", "MSFrontPage", "MSIECrawler", "MSNBot-Media", "AdIdxBot", "MTRobot", "Mag-Net", "Magnet", "Mail.RU_Bot", "Majestic SEO", "Majestic-SEO", "Majestic12", "MarkMonitor", "MarkWatch", "Mass Downloader", "Masscan", "Mata Hari", "MauiBot", "Mb2345Browser", "MeanPath Bot", "Meanpathbot", "Mediatoolkitbot", "Mediapartners-Google", "MegaIndex.ru", "Metauri", "MicroMessenger", "Microsoft Data Access", "Microsoft URL Control", "Minefield", "Mister PiX", "Moblie Safari", "Mojolicious", "MolokaiBot", "Morfeus Fucking Scanner", "Mozlila", "Mr.4x3", "Msrabot", "Musobot", "NICErsPRO", "NPbot", "Name Intelligence", "Nameprotect", "Navroad", "NearSite", "Needle", "Nessus", "NetAnts", "NetLyzer", "NetMechanic", "NetSpider", "NetZIP", "Net Vampire", "Netcraft", "Nettrack", "Netvibes", "NextGenSearchBot", "Nibbler", "Niki-bot", "Nikto", "NimbleCrawler", "Nimbostratus", "Ninja", "Nmap", "Nuclei", "Nutch", "Octopus", "Offline Explorer", "Offline Navigator", "Omgilibot", "OnCrawl", "OpenLinkProfiler", "OpenVAS", "Openfind", "Openvas", "OrangeBot", "OrangeSpider", "Orthogaffe", "OutclicksBot", "OutfoxBot", "PECL::HTTP", "PHPCrawl", "POE-Component-Client-HTTP", "Page Analyzer", "PageGrabber", "PageScorer", "PageThing.com", "PageAnalyzer", "Pandalytics", "Panscient", "Papa Foto", "Pavuk", "PeoplePal", "PerplexityBot", "PiplBot", "Pi-Monster", "Picscout", "Picsearch", "PictureFinder", "Piepmatz", "Pimonster", "Pixray", "PleaseCrawl", "Pockey", "ProPowerBot", "ProWebWalker", "Probethenet", "Proximic", "Psbot", "Pu_iN", "Pump", "PxBroker", "PyCurl", "QueryN Metasearch", "Quick-Crawler", "RSSingBot", "Rainbot", "RankActive", "RankActiveLinkBot", "RankFlex", "RankingBot", "RankingBot2", "Rankivabot",
  "RankurBot", "Re-re", "ReGet", "RealDownload", "Reaper", "RebelMouse", "Recorder", "RedesScrapy", "RepoMonkey", "RepoMonkey Bait & Tackle", "Riddler", "Ripper", "RocketCrawler", "Rogerbot", "SBIder", "SEOkicks", "SEOkicks-Robot", "SEOlizer", "SEOlyticsCrawler", "SEOprofiler", "SEOstats", "SISTRIX", "SMTBot", "SalesIntelligent", "ScanAlert", "Scanbot", "ScoutJet", "Scrapy", "Screaming", "Screaming Frog SEO Spider", "ScreenerBot", "ScrepyBot", "Searchestate", "SearchmetricsBot", "Seekport", "SeekportBot", "Seekr", "SemanticJuice", "Semrush", "SemrushBot", "SemrushBot-BA", "SemrushBot-BM", "SemrushBot-SA", "SemrushBot-SI", "SemrushBot-SWA", "SemrushBot-CT", "SplitSignalBot", "SemrushBot-COUB", "SentiBot", "SenutoBot", "SeoSiteCheckup", "SeobilityBot", "Seomoz", "Shodan", "SimpleScraper", "Siphon", "SiteAuditBot", "SiteCheckerBotCrawler", "SiteExplorer", "SiteLockSpider", "SiteSnagger", "SiteSucker", "Site Sucker", "Sitebeam", "Siteimprove", "Sitevigil", "SlySearch", "SmartDownload", "Snake", "Snapbot", "Snoopy", "SocialRankIOBot", "Sociscraper", "Sogou", "Sogou inst spider", "Sogou spider2", "Sogou web spider", "Sosospider", "Sottopop", "SpaceBison", "Spammen", "SpankBot", "Spanner", "Spbot", "Spinn3r", "SputnikBot", "Sqlmap", "Sqlworm", "Sqworm", "Steeler", "Storebot-Google", "Stripper", "Sucker", "Sucuri", "SuperBot", "SuperHTTP", "Surfbot", "SurveyBot", "Suzuran", "Swiftbot", "Szukacz", "T0PHackTeam", "Taboolabot", "T8Abot", "TechnicalSEOdotCom", "Teleport", "TeleportPro", "Telesoft", "Telesphoreo", "Telesphorep", "TheNomad", "The Intraformant", "Thumbor", "TightTwatBot", "Timpibot", "TinEye", "TinEye-bot", "TinyTestBot", "Titan", "Toata", "ToutiaoSpider", "Toweyabot", "Traackr.com", "Tracemyfile", "Trendiction", "Trendictionbot", "True_Robot", "Turingos", "Turnitin", "TurnitinBot", "TwengaBot", "Twice", "Typhoeus", "UbiCrawler", "URLy.Warning", "URLy Warning", "UnisterBot", "Upflow", "V-BOT", "VB Project", "VCI", "VCI WebViewer VCI WebViewer Win32", "Vacuum",
  "Vagabondo", "VelenPublicWebCrawler", "VeriCiteCrawler", "VidibleScraper", "Virusdie", "VoidEYE", "Voil", "Voltron", "WASALive-Bot", "WBSearchBot", "WEBDAV", "WISENutbot", "WPScan", "WWW-Collector-E", "WWW-Mechanize", "WWWOFFLE", "Wallpapers", "Wallpapers/3.0", "WallpapersHD", "WeSEE", "Web Image Collector", "WebAuto", "WebBandit", "WebCollage", "WebCopier", "WebDataStats", "WebEnhancer", "WebFetch", "WebFuck", "WebGo IS", "WebImageCollector", "WebLeacher", "WebPix", "WebReaper", "WebSauger", "WebStripper", "WebSucker", "WebWhacker", "WebZIP", "Web Auto", "Web Collage", "Web Enhancer", "Web Fetch", "Web Fuck", "Web Pix", "Web Sauger", "Web Sucker", "Webalta", "WebmasterWorldForumBot", "Webshag", "WebsiteExtractor", "WebsiteQuester", "Website Quester", "Webster", "Whack", "Whacker", "Whatweb", "Who.is Bot", "Widow", "WinHTTrack", "WiseGuys Robot", "Wonderbot", "Woobot", "Wotbox", "Wprecon", "Xaldon WebSpider", "Xaldon_WebSpider", "Xenu", "YaK", "YouBot", "YoudaoBot", "Zade", "Zauba", "Zealbot", "Zermelo", "Zeus", "Zitebot", "ZmEu", "ZoomBot", "ZoominfoBot", "ZumBot", "ZyBorg", "adbeat_bot", "adscanner", "anthropic-ai", "arquivo-web-crawler", "arquivo.pt", "autoemailspider", "awario.com", "backlink-check", "cah.io.community", "check1.exe", "clark-crawler", "coccoc", "coccocbot", "coccocbot-web", "cognitiveseo", "cohere-ai", "com.plumanalytics", "crawl.sogou.com", "crawler.feedback", "crawler4j", "dataforseo.com", "dataforseobot", "demandbase-bot", "domainsproject.org", "eCatch", "evc-batch", "facebookscraper", "gopher", "heritrix", "hypestat", "ia_archiver", "imagesift.com", "instabid", "internetVista monitor", "ips-agent", "isitwp.com", "iubenda-radar", "jpg-newsbot", "linkdexbot", "linkfluence", "lwp-request", "lwp-trivial", "magpie-crawler", "meanpathbot", "mediawords", "mlbot", "moget", "muhstik-scan", "netEstate NE Crawler", "oBot", "omgili", "online-webceo-bot", "openai", "openai.com", "outbrain", "page scorer", "pcBrowser", "peer39_crawler",
  "peer39_crawler/1.0", "plumanalytics", "polaris version", "probe-image-size", "ripz", "s1z.ru", "satoristudio.net", "scalaj-http", "scan.lol", "seobility", "seocompany.store", "seoscanners", "seostar", "serpstatbot", "sexsearcher", "sitechecker.pro", "siteripz", "sogouspider", "sp_auditbot", "spyfu", "sysscan", "tAkeOut", "trendiction.com", "trendiction.de", "ubermetrics-technologies.com", "voyagerx.com", "webgains-bot", "webmeup-crawler", "webpros.com", "webprosbot", "x09Mozilla", "x22Mozilla", "xpymep1.exe", "zauba.io", "zgrab"];

/**
 * Middleware that answers 404 to any request whose User-Agent header contains
 * one of the `blockedUserAgents` substrings, and otherwise serves the
 * requested asset through `context.env.ASSETS`.
 *
 * @param {object} context - Handler context; this function reads
 *   `context.request.headers`, `context.request.url` and
 *   `context.env.ASSETS.fetch`.
 * @returns {Promise<Response>} A 404 "Not found" response for blocked user
 *   agents, otherwise a new Response built from the fetched asset.
 */
const blockRequest = async (context) => {
  // Headers.get() yields null when the header is absent. Fall back to an
  // empty string so such requests pass through instead of throwing on
  // `null.includes(...)`.
  const userAgent = context.request.headers.get("user-agent") || "";

  if (blockedUserAgents.some((blocked) => userAgent.includes(blocked))) {
    return new Response('Not found', { status: 404 });
  }

  const url = new URL(context.request.url);
  const asset = await context.env.ASSETS.fetch(url);
  // Re-wrap the asset, reusing its status, status text and headers.
  return new Response(asset.body, asset);
};
// Handler chain exported under the name `onRequest`; blockRequest is its only
// entry, so it sees every request routed to this middleware file.
export const onRequest = [blockRequest];
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated with more user agents to block.