A script to download a range of volumes from MIT's The Tech newspaper in PDF format.
""" | |
Scraper for The Tech. Here is an example of how to search among issues once they are downloaded: | |
$ pdfgrep -ri "commons boycott" | |
$ pdfgrep -ri "kitchen" . -l > kitchen-files.txt | |
$ cat kitchen-files.txt | xargs pdfgrep -ri "East Campus" -l > and-kitchen-ec-files.txt | |
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import os | |
FIRST_VOLUME = 88 | |
LAST_VOLUME = 106 | |
response = requests.get("https://thetech.com/issues") | |
soup = BeautifulSoup(response.text, 'html.parser') | |
current_volume = None | |
issues_by_volume = {} | |
for x in soup.main.children: | |
if x.name == 'h2': | |
current_volume = x.text.split(" ")[-1] | |
if x.name == 'a': | |
issues_by_volume.setdefault(current_volume, []) | |
issues_by_volume[current_volume].append(x.text) | |
# I found this new URL format that is not documented anywhere but realized you can just go to past issues and they are still there | |
# What is gone is the pages with individual articles. For old stuff you have to get the whole issue, or go to Wayback machine | |
def get_url(volume, issue): | |
return f"https://thetech.com/issues/{volume}/{issue}/pdf" | |
def tech_to_aws_url(url): | |
response = requests.get(url) | |
soup = BeautifulSoup(response.text, 'html.parser') | |
return soup.iframe['src'] | |
def write_url_to_file(url, file): | |
# https://stackoverflow.com/questions/17953210/can-python-requests-fetch-url-directly-to-file-handle-on-disk-like-curl | |
r = requests.get(url, stream=True) | |
with open(file, 'wb') as f: | |
for block in r.iter_content(1024): | |
f.write(block) | |
MISSING_URL = "/pdfs/original/missing.png" | |
def report_missing(volume, issue): | |
msg = f"Volume {volume}, issue {issue} is missing" | |
print(msg) | |
with open("missing.log", "a") as f: | |
print(msg, file=f) | |
def download(volume, issue): | |
print(f"Downloading volume {volume}, issue {issue}") | |
if not os.path.exists(volume): | |
os.mkdir(volume) | |
dest = os.path.join("tech", volume, f"{issue}.pdf") | |
if os.path.exists(dest): | |
print("Already downloaded, skipping") | |
return | |
remote_url = tech_to_aws_url(get_url(volume, issue)) | |
# BRUH some of them are missing | |
# (The assumption that all of the missing ones have this URL in the iframe, | |
# update if you have a counterexample) | |
if remote_url == MISSING_URL: | |
report_missing(volume, issue) | |
return | |
write_url_to_file(remote_url, dest) | |
def download_volume(volume): | |
for issue in issues_by_volume[volume]: | |
download(volume, issue) | |
if __name__ == '__main__': | |
for v in range(FIRST_VOLUME, LAST_VOLUME + 1): | |
download_volume(str(v)) |
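The functions can also be imported and driven from a Python shell, for example to grab a single volume without editing FIRST_VOLUME and LAST_VOLUME. A minimal sketch, assuming the script was saved as tech_scraper.py (the filename is an assumption, not part of the gist):

# Hypothetical usage; "tech_scraper" is just an assumed filename for the script above.
import tech_scraper

# Importing runs the module-level request, so issues_by_volume is already built.
# Download every issue listed for volume 100 into tech/100/.
tech_scraper.download_volume("100")

Missing issues are still reported to missing.log, exactly as in a full run.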