A script to download a range of volumes from MIT's The Tech newspaper in PDF format.
""" | |
Scraper for The Tech. Here is an example of how to search among issues once they are downloaded: | |
$ pdfgrep -ri "commons boycott" | |
$ pdfgrep -ri "kitchen" . -l > kitchen-files.txt | |
$ cat kitchen-files.txt | xargs pdfgrep -ri "East Campus" -l > and-kitchen-ec-files.txt | |
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import os | |
FIRST_VOLUME = 88 | |
LAST_VOLUME = 106 | |
response = requests.get("https://thetech.com/issues") | |
soup = BeautifulSoup(response.text, 'html.parser') | |
current_volume = None | |
issues_by_volume = {} | |
for x in soup.main.children: | |
if x.name == 'h2': | |
current_volume = x.text.split(" ")[-1] | |
if x.name == 'a': | |
issues_by_volume.setdefault(current_volume, []) | |
issues_by_volume[current_volume].append(x.text) | |
# I found this new URL format that is not documented anywhere but realized you can just go to past issues and they are still there | |
# What is gone is the pages with individual articles. For old stuff you have to get the whole issue, or go to Wayback machine | |
def get_url(volume, issue): | |
return f"https://thetech.com/issues/{volume}/{issue}/pdf" | |
def tech_to_aws_url(url): | |
response = requests.get(url) | |
soup = BeautifulSoup(response.text, 'html.parser') | |
return soup.iframe['src'] | |
def write_url_to_file(url, file): | |
# https://stackoverflow.com/questions/17953210/can-python-requests-fetch-url-directly-to-file-handle-on-disk-like-curl | |
r = requests.get(url, stream=True) | |
with open(file, 'wb') as f: | |
for block in r.iter_content(1024): | |
f.write(block) | |
MISSING_URL = "/pdfs/original/missing.png" | |
def report_missing(volume, issue): | |
msg = f"Volume {volume}, issue {issue} is missing" | |
print(msg) | |
with open("missing.log", "a") as f: | |
print(msg, file=f) | |
def download(volume, issue): | |
print(f"Downloading volume {volume}, issue {issue}") | |
if not os.path.exists(volume): | |
os.mkdir(volume) | |
dest = os.path.join("tech", volume, f"{issue}.pdf") | |
if os.path.exists(dest): | |
print("Already downloaded, skipping") | |
return | |
remote_url = tech_to_aws_url(get_url(volume, issue)) | |
# BRUH some of them are missing | |
# (The assumption that all of the missing ones have this URL in the iframe, | |
# update if you have a counterexample) | |
if remote_url == MISSING_URL: | |
report_missing(volume, issue) | |
return | |
write_url_to_file(remote_url, dest) | |
def download_volume(volume): | |
for issue in issues_by_volume[volume]: | |
download(volume, issue) | |
if __name__ == '__main__': | |
for v in range(FIRST_VOLUME, LAST_VOLUME + 1): | |
download_volume(str(v)) |
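The functions can also be imported and driven from a Python shell, for example to grab a single volume without editing FIRST_VOLUME and LAST_VOLUME. A minimal sketch, assuming the script was saved as tech_scraper.py (the filename is an assumption, not part of the gist):

# Hypothetical usage; "tech_scraper" is just an assumed filename for the script above.
import tech_scraper

# Importing runs the module-level request, so issues_by_volume is already built.
# Download every issue listed for volume 100 into tech/100/.
tech_scraper.download_volume("100")

Missing issues are still reported to missing.log, exactly as in a full run.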