Script to update robots.txt from darkvisitors.com API, with optional extra robots.txt contents and bunny.net CDN invalidation
update_robots.env
# Template EnvironmentFile suitable for the systemd service
# Path to the HTTP server's local root directory (where robots.txt is written)
HTTP_DIR=
# (Optional) Path to extra robots.txt content to append after the darkvisitors content.
# Can be relative to HTTP_DIR or an absolute path.
EXTRA_ROBOTS=
# darkvisitors.com API bearer token
DARKVISITORS_TOKEN=
# (Optional) bunny.net API auth token
BUNNY_TOKEN=
# (Optional) Public website URL for bunny.net cache purge
BUNNY_URL=
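#
# Example values (all placeholders, for illustration only):
#
#   HTTP_DIR=/var/www/example.org
#   EXTRA_ROBOTS=robots.extra.txt
#   DARKVISITORS_TOKEN=dv-xxxxxxxx
#   BUNNY_TOKEN=aaaaaaaa-bbbb-cccc-dddd-eeeeeeee
#   BUNNY_URL=https://example.org/robots.txt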

update_robots.py
#!/usr/bin/env python3
# Copyright (c) 2024 Angus Gratton
# SPDX-License-Identifier: Apache-2.0 OR MIT
#
# Very simple Python program to:
#
# - Rebuild contents of robots.txt from the darkvisitors.com API, see
# https://darkvisitors.com/docs/robots-txt
#
# - Optionally append some "extra" robots.txt content from a static file each time.
#
# - Optionally trigger a bunny.net CDN invalidation of the cached copy of robots.txt
#
# Uses the Python requests package.
#
# Note there's nothing very fancy here; this could easily be a shell script
# with a few curl commands.
#
# Execute by running (from cron, systemd timer, etc.) with the environment
# variables shown in update_robots.env set. Or you can edit them in below
# if you feel like it; I won't judge.
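#
# For example, a one-shot run by hand might look like this (the path and
# token values are placeholders):
#
#   HTTP_DIR=/var/www/example.org DARKVISITORS_TOKEN=xxxx ./update_robots.py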
import json
import os
import os.path
import requests

# Configuration is read from environment variables (see update_robots.env)
HTTP_DIR = os.environ["HTTP_DIR"]
DARKVISITORS_TOKEN = os.environ["DARKVISITORS_TOKEN"]
BUNNY_TOKEN = os.environ.get("BUNNY_TOKEN", None)
BUNNY_URL = os.environ.get("BUNNY_URL", None)
EXTRA_ROBOTS = os.environ.get("EXTRA_ROBOTS", None)


def fetch_visitor_robots():
    print("Fetching latest AI user agents list...")
    r = requests.post(
        "https://api.darkvisitors.com/robots-txts",
        headers={
            "Authorization": f"Bearer {DARKVISITORS_TOKEN}",
            "Content-Type": "application/json",
        },
        data=json.dumps(
            {
                "agent_types": ["AI Data Scraper", "Undocumented AI Agent"],
                "disallow": "/",
            }
        ),
    )
    r.raise_for_status()
    return r.text
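
# The darkvisitors API responds with plain robots.txt rules, something along
# these lines (illustrative only; the real agent list comes from the API):
#
#   User-agent: SomeAIScraper
#   Disallow: /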


def read_extra_robots():
    if EXTRA_ROBOTS:
        # os.path.join() returns EXTRA_ROBOTS unchanged if it's an absolute
        # path, so relative and absolute values both work here
        path = os.path.join(HTTP_DIR, EXTRA_ROBOTS)
        print(f"Reading extra robots.txt content from {path}...")
        with open(path, "r") as f:
            return "\n\n" + f.read()
    else:
        return ""


def invalidate_bunny_cache():
    print(f"Invalidating bunny.net cache of {BUNNY_URL}...")
    r = requests.post(
        "https://api.bunny.net/purge",
        params={"url": BUNNY_URL},
        headers={"AccessKey": BUNNY_TOKEN},
    )
    r.raise_for_status()


def main():
    visitor_robots = fetch_visitor_robots()
    extra_robots = read_extra_robots()
    path = os.path.join(HTTP_DIR, "robots.txt")
    new_content = visitor_robots + extra_robots
    # Read the existing robots.txt (if any) so unchanged content can be
    # skipped without rewriting the file or purging the CDN cache
    old_content = None
    try:
        with open(path, "r") as f:
            old_content = f.read()
    except FileNotFoundError:
        pass
    if old_content == new_content:
        print("Contents unchanged.")
        return
    print(f"Writing {path}...")
    with open(path, "w") as f:
        f.write(new_content)
    if BUNNY_URL and BUNNY_TOKEN:
        invalidate_bunny_cache()
    print("Done")


if __name__ == "__main__":
    main()

update_robots.service
[Unit]
Description=Update robots.txt AI entries
[Service]
Type=simple
ExecStart=%h/update_robots/update_robots.py
EnvironmentFile=%Y/update_robots.env
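# Note: %Y expands to the directory containing this unit file, so
# update_robots.env is expected to sit alongside the unit; this specifier
# needs a relatively recent systemd version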

update_robots.timer
[Unit]
Description=Update robots.txt AI entries
[Timer]
OnBootSec=15m
OnUnitActiveSec=1h
RandomizedDelaySec=5m
Persistent=true
[Install]
WantedBy=timers.target
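
# Assuming these are installed as systemd user units (the service's ExecStart
# uses %h, the user's home directory), the timer could be enabled with
# something like:
#
#   systemctl --user enable --now update_robots.timer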