Last active
August 1, 2024 10:31
-
-
Save projectgus/5a495c94f70ac3b3783e89c74a1fc07a to your computer and use it in GitHub Desktop.
Script to update robots.txt from darkvisitors.com API, with optional extra robots.txt contents and bunny.net CDN invalidation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
update_robots.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Template EnvironmentFile suitable for the systemd service | |
# (Required) Path to http server local root directory | |
HTTP_DIR= | |
# (Optional) Path to extra robots.txt content to append after the darkvisitors content | |
# Can be relative to HTTP_DIR or absolute path. | |
EXTRA_ROBOTS= | |
# (Required) darkvisitors.com API bearer token | |
DARKVISITORS_TOKEN= | |
# (Optional) bunny.net API auth token | |
BUNNY_TOKEN= | |
# (Optional) Public website URL for bunny.net cache purge | |
BUNNY_URL= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Copyright (c) 2024 Angus Gratton | |
# SPDX-License-Identifier: Apache-2.0 OR MIT | |
# | |
# Very simple Python program to: | |
# | |
# - Rebuild contents of robots.txt from the darkvisitors.com API, see | |
# https://darkvisitors.com/docs/robots-txt | |
# | |
# - Optionally append some "extra" robots.txt content from a static file each time. | |
# | |
# - Optionally trigger a bunny.net CDN invalidation of the cached copy of robots.txt | |
# | |
# Uses the python requests package. | |
# | |
# Note there's nothing very fancy here, this could easily be a shell script with a few curl commands. | |
# | |
# Execute by running (from cron, systemd timer, etc) with the environment | |
# variables shown in update_robots.env set. Or you can edit them in below | |
# if you feel like it, I won't judge. | |
import json | |
import os | |
import os.path | |
import requests | |
# Required settings: indexing os.environ raises KeyError when one is missing.
HTTP_DIR = os.environ["HTTP_DIR"]
DARKVISITORS_TOKEN = os.environ["DARKVISITORS_TOKEN"]

# Optional settings: each is None when the variable is not set.
BUNNY_TOKEN = os.getenv("BUNNY_TOKEN")
BUNNY_URL = os.getenv("BUNNY_URL")
EXTRA_ROBOTS = os.getenv("EXTRA_ROBOTS")
def fetch_visitor_robots():
    """Fetch generated robots.txt content from the darkvisitors.com API.

    Sends a POST request authenticated with DARKVISITORS_TOKEN asking for
    rules that disallow "/" for the listed agent types.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    print("Fetching latest AI user agents list...")
    r = requests.post(
        "https://api.darkvisitors.com/robots-txts",
        headers={
            "Authorization": f"Bearer {DARKVISITORS_TOKEN}",
            "Content-Type": "application/json",
        },
        data=json.dumps(
            {
                "agent_types": ["AI Data Scraper", "Undocumented AI Agent"],
                "disallow": "/",
            }
        ),
        # requests never times out by default; without this a stalled
        # connection would block the (unattended) run indefinitely.
        timeout=30,
    )
    r.raise_for_status()
    return r.text
def read_extra_robots():
    """Return the optional extra robots.txt content, ready to append.

    When EXTRA_ROBOTS is unset or empty, returns an empty string. Otherwise
    the file is read and returned with two leading newlines so that it is
    separated from whatever content precedes it.
    """
    if not EXTRA_ROBOTS:
        return ""

    # os.path.join discards HTTP_DIR when EXTRA_ROBOTS is an absolute path,
    # so both relative and absolute settings work.
    extra_path = os.path.join(HTTP_DIR, EXTRA_ROBOTS)
    print(f"Reading extra robots.txt content from {extra_path}...")
    with open(extra_path, "r") as extra_file:
        extra_text = extra_file.read()
    return "\n\n" + extra_text
def invalidate_bunny_cache():
    """Ask the bunny.net API to purge its cached copy of BUNNY_URL.

    BUNNY_TOKEN is sent as the AccessKey header. Note that main() only
    checks BUNNY_URL before calling this function; BUNNY_TOKEN is passed
    through as-is.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    print(f"Invalidating bunny.net cache of {BUNNY_URL}...")
    r = requests.post(
        "https://api.bunny.net/purge",
        params={"url": BUNNY_URL},
        headers={"AccessKey": BUNNY_TOKEN},
        # requests never times out by default; bound the wait so a stalled
        # connection cannot block the run indefinitely.
        timeout=30,
    )
    r.raise_for_status()
def main():
    """Regenerate robots.txt under HTTP_DIR and purge the CDN copy if it changed.

    The new content is the darkvisitors.com response followed by any extra
    content. If it matches what is already on disk, nothing is written and
    no cache purge is requested.
    """
    generated = fetch_visitor_robots()
    appended = read_extra_robots()
    updated = generated + appended

    robots_path = os.path.join(HTTP_DIR, "robots.txt")

    # A missing robots.txt is treated as "no previous content" (None), which
    # never compares equal to the new text, so the file gets created.
    try:
        with open(robots_path, "r") as existing:
            current = existing.read()
    except FileNotFoundError:
        current = None

    if current == updated:
        print("Contents unchanged.")
        return

    print(f"Writing {robots_path}...")
    with open(robots_path, "w") as out:
        out.write(updated)

    # NOTE(review): if the purge below raises, the file has already been
    # written, so the next run will report "Contents unchanged." and will
    # not retry the purge.
    if BUNNY_URL:
        invalidate_bunny_cache()

    print("Done")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd service unit that runs the update_robots.py script once per start.
[Unit] | |
Description=Update robots.txt AI entries | |
[Service] | |
Type=simple | |
# Path to the script; %h and %Y are systemd specifiers (see systemd.unit(5)).
ExecStart=%h/update_robots/update_robots.py | |
# Supplies HTTP_DIR, DARKVISITORS_TOKEN and the optional variables the script reads.
EnvironmentFile=%Y/update_robots.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd timer unit that schedules the robots.txt update.
[Unit] | |
Description=Update robots.txt AI entries | |
[Timer] | |
# First run 15 minutes after boot, then 1 hour after each activation,
# with up to 5 minutes of random delay added.
OnBootSec=15m | |
OnUnitActiveSec=1h | |
RandomizedDelaySec=5m | |
# NOTE(review): systemd.timer(5) documents Persistent= as only having an
# effect on timers configured with OnCalendar= - confirm whether it does
# anything for this monotonic timer.
Persistent=true | |
[Install] | |
WantedBy=timers.target |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment