A script to download a range of volumes from MIT's The Tech newspaper in PDF format.
"""
Scraper for The Tech. Here are some examples of how to search the downloaded issues with pdfgrep:
$ pdfgrep -ri "commons boycott"
$ pdfgrep -ri "kitchen" . -l > kitchen-files.txt
$ cat kitchen-files.txt | xargs pdfgrep -ri "East Campus" -l > and-kitchen-ec-files.txt
"""
import requests
from bs4 import BeautifulSoup
import os
FIRST_VOLUME = 88
LAST_VOLUME = 106
response = requests.get("https://thetech.com/issues")
soup = BeautifulSoup(response.text, 'html.parser')
current_volume = None
issues_by_volume = {}
# Scrape the list of issues from the index page: each <h2> names a volume,
# and the <a> tags that follow it link to that volume's issues.
for x in soup.main.children:
    if x.name == 'h2':
        current_volume = x.text.split(" ")[-1]
    if x.name == 'a':
        issues_by_volume.setdefault(current_volume, [])
        issues_by_volume[current_volume].append(x.text)

# This URL format is not documented anywhere, but past issues are still served through it.
# What is gone are the pages for individual articles: for older content you have to
# download the whole issue, or use the Wayback Machine.
def get_url(volume, issue):
    return f"https://thetech.com/issues/{volume}/{issue}/pdf"

def tech_to_aws_url(url):
    # The issue page embeds the actual PDF (hosted on AWS) in an <iframe>;
    # return the URL it points at.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.iframe['src']

def write_url_to_file(url, file):
    # https://stackoverflow.com/questions/17953210/can-python-requests-fetch-url-directly-to-file-handle-on-disk-like-curl
    r = requests.get(url, stream=True)
    with open(file, 'wb') as f:
        for block in r.iter_content(1024):
            f.write(block)

# Placeholder src that the site serves when an issue's PDF does not exist
MISSING_URL = "/pdfs/original/missing.png"

def report_missing(volume, issue):
    msg = f"Volume {volume}, issue {issue} is missing"
    print(msg)
    with open("missing.log", "a") as f:
        print(msg, file=f)

def download(volume, issue):
    print(f"Downloading volume {volume}, issue {issue}")
    # Make sure the destination directory exists before writing into it
    os.makedirs(os.path.join("tech", volume), exist_ok=True)
    dest = os.path.join("tech", volume, f"{issue}.pdf")
    if os.path.exists(dest):
        print("Already downloaded, skipping")
        return
    remote_url = tech_to_aws_url(get_url(volume, issue))
    # Some issues are simply missing from the site.
    # (This assumes all of the missing ones point to this placeholder URL
    # in the iframe; update if you find a counterexample.)
    if remote_url == MISSING_URL:
        report_missing(volume, issue)
        return
    write_url_to_file(remote_url, dest)

def download_volume(volume):
    for issue in issues_by_volume[volume]:
        download(volume, issue)

if __name__ == '__main__':
    for v in range(FIRST_VOLUME, LAST_VOLUME + 1):
        download_volume(str(v))
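
The pdfgrep commands in the docstring also translate directly to Python. Below is a minimal sketch of that search, assuming pdfgrep is installed and that the script above has already populated tech/<volume>/<issue>.pdf; the search_issues helper is illustrative and not part of the gist.

import subprocess

def search_issues(pattern, root="tech"):
    # -r: recurse into the volume directories, -i: ignore case, -l: list matching files only
    result = subprocess.run(
        ["pdfgrep", "-ril", pattern, root],
        capture_output=True,
        text=True,
    )
    return result.stdout.splitlines()

# Mirrors the first docstring example: which downloaded issues mention the commons boycott?
print("\n".join(search_issues("commons boycott")))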