
@mentix02
Last active August 5, 2025 09:31
Scrapes Earth911. Run `pip install httpx bs4` before running the script.
#!/usr/bin/env python3
"""
File: earth911.py
Author: mentix02 (Manan [email protected])

Task: Scrape search results from search.earth911.com and extract the data into a CSV file.

We make a request to search.earth911.com, spoofing the headers to make it look like the request is coming
from a browser. The response is plain HTML - no JSON API to reverse engineer, unfortunately. Parsing this
HTML, we get a list of hrefs over which we loop (asynchronously, of course) to fetch the details of each
individual centre (again as HTML, from which we extract a dictionary of properties). These are, in turn,
written to an output CSV file (either stdout or a file provided via sys.argv).

Design Model: sources and sinks
Sources: list of hrefs + original link with pagination
Sinks: consumer of hrefs that fetches centre details + CSV writer writing the final output

1. Fetches all hrefs from `n` pages, where n is an int provided by the user (default: 3) (max: 20)
2. Loops over all these hrefs and makes a request to each (concurrently)
3. Extracts data from the HTML of each individual centre page
4. Writes the data to outfile (a CSV file or stdout) (default: stdout)
"""
import sys
import csv
import random
import asyncio
import argparse
from urllib.parse import urlencode, urlparse
from typing import List, Dict, Union, Optional, TypedDict, Sequence, TextIO

import httpx
from bs4 import BeautifulSoup

__version__ = "0.0.1"
__author__ = "mentix02 (Manan [email protected])"


# Types
class RecyclingCentre(TypedDict):
    url: str
    business_name: str
    street_address: str
    last_update_date: str
    materials_accepted: str


class FetchError(Exception):
    """
    Exception raised when there is an error fetching (or parsing) data from a URL.
    """

    def __init__(self, message: str, url: str):
        super().__init__(message)
        self.url = url


# Config Singleton
class Config:
    HTTP_CLIENT: httpx.AsyncClient
    BASE_URL: str = "https://search.earth911.com"
    USER_AGENTS: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    ]
    URL_PARAMS: Dict[str, str] = {
        "what": "Electronics",
        "where": "10001",
        "list_filter": "all",
        "max_distance": "100",
    }
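

# Note on Config.URL_PARAMS above: the hard-coded search asks for "Electronics" recycling near
# ZIP 10001 (Manhattan, New York), across all listing types, within a max_distance of 100
# (presumably miles, going by the site's search form) - edit URL_PARAMS to scrape a different query.

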
# Private functions
def _create_headers() -> Dict[str, str]:
    """
    Randomize the User-Agent to avoid detection.
    """
    return {"User-Agent": random.choice(Config.USER_AGENTS)}


def _create_argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="earth911",
        description="Fetches data from earth911.com",
        epilog=f"Created by {__author__}",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    parser.add_argument("-n", "--number", type=int, help="Number of pages to scrape", default=3)
    parser.add_argument(
        "outfile",
        nargs="?",
        type=argparse.FileType("w+"),
        default=sys.stdout,
        help="CSV file to write output to",
    )
    return parser


def _create_writer(outfile: TextIO) -> csv.DictWriter:
    writer = csv.DictWriter(
        outfile,
        fieldnames=[
            "business_name",
            "street_address",
            "last_update_date",
            "materials_accepted",
            "url",
        ],
    )
    writer.writeheader()
    return writer
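

# The writer above emits this header row first (columns follow the fieldnames order):
#   business_name,street_address,last_update_date,materials_accepted,url

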
# Async Iterators
class CentreURLsIterator:
    """
    To be used as:
    >>> async for centre_links in CentreURLsIterator(1): ...
    >>> async for centre_links in CentreURLsIterator(1, 15): ...
    Where centre_links is a list of strings, each string being the URL to the
    "detail" page of a centre: from which we can find / extract more info by
    making a request and parsing the response of each link.
    Note: the `start` & `end` params expect the `page` argument from the URL's query string,
    so the behaviour for passing 0 to `start` is undefined.
    """

    def __init__(self, start: int, end: Optional[int] = None):
        self.current = start
        self.end = end if end is not None else start

    @staticmethod
    def _make_search_url(page: int = 1) -> str:
        query_string = urlencode(Config.URL_PARAMS | {"page": page})
        return f"{Config.BASE_URL}?{query_string}"

    @staticmethod
    def _make_detail_url(href: str) -> str:
        parsed_url = urlparse(Config.BASE_URL + href)
        return parsed_url._replace(query="").geturl()
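
    # Sketch of the transformation above; the href shape is hypothetical (only the query-string
    # stripping is guaranteed by the code):
    #   "/some/detail/path/?what=Electronics" -> "https://search.earth911.com/some/detail/path/"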

    def __aiter__(self):
        return self

    async def __anext__(self) -> Sequence[str]:
        if self.current > self.end:
            raise StopAsyncIteration
        url = self._make_search_url(self.current)
        try:
            headers = _create_headers()
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            # Extract hrefs from the response HTML
            hrefs: List[str] = []
            soup = BeautifulSoup(response.text, "html.parser")
            for result_item in soup.find_all("li", {"class": "result-item"}):
                href: str = result_item.find("h2", {"class": "title"}).find("a")["href"]  # type: ignore
                hrefs.append(self._make_detail_url(href))
            self.current += 1
            return hrefs
        except Exception as e:
            raise FetchError(f"Error fetching hrefs from URL '{url}': {e}", url) from e


class CentreDetailsIterator:
    """
    To be used as:
    >>> async for centre_details in CentreDetailsIterator(centre_links): ...
    Where centre_details is a RecyclingCentre object with the details of a centre,
    and centre_links is a list of strings where each string is the URL to the
    "detail" page of a centre: which you can get from CentreURLsIterator.
    """

    def __init__(self, centre_links: Sequence[str]):
        self.centre_links = centre_links
        self._idx = 0
        self._fetched = False
        self._items: Sequence[Union[RecyclingCentre, BaseException]] = []

    @staticmethod
    async def _get_centre_details(url: str) -> RecyclingCentre:
        headers = _create_headers()
        try:
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            detail_body = soup.find("div", {"id": "search-details"})
            if detail_body is None:
                raise ValueError(f"Could not find details for URL: {url}")
            header = detail_body.find("h1")  # type: ignore
            centre_name: str = list(header.children)[0].text  # type: ignore
            last_update_date: str = header.find("span", {"class": "last-verified"}).text.strip().split()[-1]  # type: ignore
            materials_accepted = detail_body.find_all("span", {"class": "material"})  # type: ignore
            materials_accepted = ", ".join(material.text.strip() for material in materials_accepted)
            address = " ".join(
                sub_address.text.strip()
                for sub_address in detail_body.find("div", {"class": "contact"}).find_all("p", {"class": "addr"})  # type: ignore
            )
            return {
                "url": url,
                "street_address": address,
                "business_name": centre_name,
                "last_update_date": last_update_date,
                "materials_accepted": materials_accepted,
            }
        except Exception as e:
            raise FetchError(f"Error fetching details from '{url}': {e}", url) from e
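
    # Rough shape of the detail-page HTML the selectors above expect - reconstructed from the
    # code, not verified against the live site:
    #   <div id="search-details">
    #     <h1>Business Name <span class="last-verified">Last updated 2025-01-01</span></h1>
    #     <div class="contact"><p class="addr">123 Main St</p><p class="addr">New York, NY</p></div>
    #     <span class="material">Computers</span> <span class="material">Batteries</span> ...
    #   </div>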

    async def get_all_details(self) -> Sequence[Union[RecyclingCentre, BaseException]]:
        """
        Fetches all details of the centres in the centre_links list.
        """
        if self._fetched:
            return self._items
        tasks = [self._get_centre_details(link) for link in self.centre_links]
        self._items = await asyncio.gather(*tasks, return_exceptions=True)
        self._fetched = True
        return self._items
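
    # Because gather() runs with return_exceptions=True, a failed fetch does not abort the batch:
    # it lands in self._items as a FetchError, which __anext__ below turns into a CSV row whose
    # business_name column carries the error message.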

    def __aiter__(self):
        return self

    async def __anext__(self) -> RecyclingCentre:
        if not self._fetched:
            await self.get_all_details()
        try:
            centre = self._items[self._idx]
            self._idx += 1
            if isinstance(centre, BaseException):
                assert isinstance(centre, FetchError), "Iterators should only throw FetchErrors"
                return {
                    "url": centre.url,
                    "street_address": "",
                    "last_update_date": "",
                    "materials_accepted": "",
                    "business_name": str(centre),
                }
            return centre
        except IndexError:
            raise StopAsyncIteration


async def main() -> int:
    parser = _create_argparser()
    args = parser.parse_args()

    outfile = args.outfile
    writer = _create_writer(outfile)
    number_of_pages: int = args.number

    async with httpx.AsyncClient(timeout=None) as client:
        Config.HTTP_CLIENT = client
        async for centre_links in CentreURLsIterator(1, number_of_pages):
            async for centre_details in CentreDetailsIterator(centre_links):
                writer.writerow(centre_details)
                outfile.flush()

    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))