Scrapes Earth911. Install the dependencies with `pip install httpx bs4` before running the script.
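For example (a usage sketch based on the script's own argparse setup; `output.csv` is just an illustrative filename): `python earth911.py -n 3 output.csv` scrapes 3 result pages and writes the rows to `output.csv`, while plain `python earth911.py` prints the CSV to stdout.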
#!/usr/bin/env python3
"""
File: earth911.py
Author: mentix02 (Manan [email protected])
Task: Scrape search results of search.earth911.com and extract data into a CSV file.

Making a request to search.earth911.com by spoofing the headers to make it look like you're coming from a
browser. We are returned plain HTML - no JSON API to reverse engineer, unfortunately. Parsing this HTML,
we get a list of hrefs over which we loop (asynchronously, of course) and fetch details of the individual
centre (in HTML - from which we extract a dictionary of properties).
These are, in turn, written to an output CSV file (either stdout or a file provided via sys.argv).

Design Model: sources and sinks
Sources: list of hrefs + original link with pagination
Sinks: consumer of href to fetch centre details + CSV writer writing final output

1. Fetch all hrefs from `n` pages, where `n` is an int provided by the user (default: 3, max: 20)
2. Loop over all these hrefs and make requests to each (concurrently)
3. Extract data from the HTML of each individual centre page
4. Write data to the outfile (CSV file or stdout; default: stdout)
"""

import sys
import csv
import random
import asyncio
import argparse
from urllib.parse import urlencode, urlparse
from typing import List, Dict, Union, Optional, TypedDict, Sequence, TextIO

import httpx
from bs4 import BeautifulSoup

__version__ = "0.0.1"
__author__ = "mentix02 (Manan [email protected])"


# Types
class RecyclingCentre(TypedDict):
    url: str
    business_name: str
    street_address: str
    last_update_date: str
    materials_accepted: str


class FetchError(Exception):
    """
    Exception raised when there is an error fetching (or parsing) data from a URL.
    """

    def __init__(self, message: str, url: str):
        super().__init__(message)
        self.url = url


# Config Singleton
class Config:
    HTTP_CLIENT: httpx.AsyncClient
    BASE_URL: str = "https://search.earth911.com"
    USER_AGENTS: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    ]
    URL_PARAMS: Dict[str, str] = {
        "what": "Electronics",
        "where": "10001",
        "list_filter": "all",
        "max_distance": "100",
    }


# Private functions
def _create_headers() -> Dict[str, str]:
    """
    Randomize the User-Agent to avoid detection.
    """
    return {"User-Agent": random.choice(Config.USER_AGENTS)}


def _create_argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="earth911",
        description="Fetches data from earth911.com",
        epilog=f"Created by {__author__}",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    parser.add_argument("-n", "--number", type=int, help="Number of pages to scrape", default=3)
    parser.add_argument(
        "outfile",
        nargs="?",
        type=argparse.FileType("w+"),
        default=sys.stdout,
        help="CSV file to write output to",
    )
    return parser


def _create_writer(outfile: TextIO) -> csv.DictWriter:
    writer = csv.DictWriter(
        outfile,
        fieldnames=[
            "business_name",
            "street_address",
            "last_update_date",
            "materials_accepted",
            "url",
        ],
    )
    writer.writeheader()
    return writer


# Async Iterators
class CentreURLsIterator:
    """
    To be used as:
    >>> async for centre_links in CentreURLsIterator(1): ...
    >>> async for centre_links in CentreURLsIterator(1, 15): ...
    Where centre_links is a list of strings where each string is the URL to the
    "detail" page of a centre: from which we can find / extract more info: by
    making a request and parsing the response of each link.
    Note: the `start` & `end` params expect the `page` argument from the URL's query string,
    so the behaviour for passing 0 to `start` is undefined.
    """

    def __init__(self, start: int, end: Optional[int] = None):
        self.current = start
        self.end = end if end is not None else start

    @staticmethod
    def _make_search_url(page: int = 1) -> str:
        query_string = urlencode(Config.URL_PARAMS | {"page": page})
        return f"{Config.BASE_URL}?{query_string}"

    @staticmethod
    def _make_detail_url(href: str) -> str:
        parsed_url = urlparse(Config.BASE_URL + href)
        return parsed_url._replace(query="").geturl()

    def __aiter__(self):
        return self

    async def __anext__(self) -> Sequence[str]:
        if self.current > self.end:
            raise StopAsyncIteration
        url = self._make_search_url(self.current)
        try:
            headers = _create_headers()
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            # Extract hrefs from the response HTML
            hrefs: List[str] = []
            soup = BeautifulSoup(response.text, "html.parser")
            for result_item in soup.find_all("li", {"class": "result-item"}):
                href: str = result_item.find("h2", {"class": "title"}).find("a")["href"]  # type: ignore
                hrefs.append(self._make_detail_url(href))
            self.current += 1
            return hrefs
        except Exception as e:
            raise FetchError(f"Error fetching hrefs from URL '{url}': {e}", url) from e


class CentreDetailsIterator:
    """
    To be used as:
    >>> async for centre_details in CentreDetailsIterator(centre_links): ...
    Where centre_details is a RecyclingCentre object with the details of a centre,
    and centre_links is a list of strings where each string is the URL to the
    "detail" page of a centre: which you can get from CentreURLsIterator.
    """

    def __init__(self, centre_links: Sequence[str]):
        self.centre_links = centre_links
        self._idx = 0
        self._fetched = False
        self._items: Sequence[Union[RecyclingCentre, BaseException]] = []

    @staticmethod
    async def _get_centre_details(url: str) -> RecyclingCentre:
        headers = _create_headers()
        try:
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            detail_body = soup.find("div", {"id": "search-details"})
            if detail_body is None:
                raise ValueError(f"Could not find details for URL: {url}")
            header = detail_body.find("h1")  # type: ignore
            centre_name: str = list(header.children)[0].text  # type: ignore
            last_update_date: str = header.find("span", {"class": "last-verified"}).text.strip().split()[-1]  # type: ignore
            materials_accepted = detail_body.find_all("span", {"class": "material"})  # type: ignore
            materials_accepted = ", ".join(material.text.strip() for material in materials_accepted)
            address = " ".join(
                sub_address.text.strip()
                for sub_address in detail_body.find("div", {"class": "contact"}).find_all("p", {"class": "addr"})  # type: ignore
            )
            return {
                "url": url,
                "street_address": address,
                "business_name": centre_name,
                "last_update_date": last_update_date,
                "materials_accepted": materials_accepted,
            }
        except Exception as e:
            raise FetchError(f"Error fetching details from '{url}': {e}", url) from e

    async def get_all_details(self) -> Sequence[Union[RecyclingCentre, BaseException]]:
        """
        Fetches all details of the centres in the centre_links list.
        """
        if self._fetched:
            return self._items
        tasks = [self._get_centre_details(link) for link in self.centre_links]
        self._items = await asyncio.gather(*tasks, return_exceptions=True)
        self._fetched = True
        return self._items

    def __aiter__(self):
        return self

    async def __anext__(self) -> RecyclingCentre:
        if not self._fetched:
            await self.get_all_details()
        try:
            centre = self._items[self._idx]
            self._idx += 1
            if isinstance(centre, BaseException):
                assert isinstance(centre, FetchError), "Iterators should only throw FetchErrors"
                # Represent a failed fetch as a mostly empty row carrying the error message
                return {
                    "url": centre.url,
                    "street_address": "",
                    "last_update_date": "",
                    "materials_accepted": "",
                    "business_name": str(centre),
                }
            return centre
        except IndexError:
            raise StopAsyncIteration


async def main() -> int:
    parser = _create_argparser()
    args = parser.parse_args()

    outfile = args.outfile
    writer = _create_writer(outfile)
    number_of_pages: int = args.number

    async with httpx.AsyncClient(timeout=None) as client:
        Config.HTTP_CLIENT = client
        async for centre_links in CentreURLsIterator(1, number_of_pages):
            async for centre_details in CentreDetailsIterator(centre_links):
                writer.writerow(centre_details)
                outfile.flush()

    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
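
If the output was written to a file (say `centres.csv`, an illustrative name), the rows can be read back with the standard library's `csv` module. A minimal sketch, assuming that file exists:

```python
import csv

# Column names match the fieldnames used by _create_writer above.
with open("centres.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["business_name"], "-", row["materials_accepted"])
```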