Scrapes Earth911. Install the dependencies with `pip install httpx bs4` before running the script.
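For example (a usage sketch based on the script's own argparse setup; `output.csv` is just an illustrative filename): `python earth911.py -n 3 output.csv` scrapes 3 result pages and writes the rows to `output.csv`, while plain `python earth911.py` prints the CSV to stdout.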
#!/usr/bin/env python3
"""
File: earth911.py
Author: mentix02 (Manan [email protected])
Task: Scrape search results of search.earth911.com and extract data into a CSV file.

Making a request to search.earth911.com by spoofing the headers to make it look like you're coming from a
browser. We are returned plain HTML - no JSON API to reverse engineer, unfortunately. Parsing this HTML,
we get a list of hrefs over which we loop (asynchronously, of course) and fetch details of the individual
centre (in HTML - from which we extract a dictionary of properties).
These are, in turn, written to an output CSV file (either stdout or a file provided via sys.argv).

Design Model: sources and sinks
Sources: list of hrefs + original link with pagination
Sinks: consumer of href to fetch centre details + CSV writer writing final output

1. Fetch all hrefs from `n` pages, where `n` is an int provided by the user (default: 3, max: 20)
2. Loop over all these hrefs and make requests to each (concurrently)
3. Extract data from the HTML of each individual centre page
4. Write data to the outfile (CSV file or stdout; default: stdout)
"""

import sys
import csv
import random
import asyncio
import argparse
from urllib.parse import urlencode, urlparse
from typing import List, Dict, Union, Optional, TypedDict, Sequence, TextIO

import httpx
from bs4 import BeautifulSoup

__version__ = "0.0.1"
__author__ = "mentix02 (Manan [email protected])"


# Types
class RecyclingCentre(TypedDict):
    url: str
    business_name: str
    street_address: str
    last_update_date: str
    materials_accepted: str


class FetchError(Exception):
    """
    Exception raised when there is an error fetching (or parsing) data from a URL.
    """

    def __init__(self, message: str, url: str):
        super().__init__(message)
        self.url = url


# Config Singleton
class Config:
    HTTP_CLIENT: httpx.AsyncClient
    BASE_URL: str = "https://search.earth911.com"
    USER_AGENTS: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    ]
    URL_PARAMS: Dict[str, str] = {
        "what": "Electronics",
        "where": "10001",
        "list_filter": "all",
        "max_distance": "100",
    }


# Private functions
def _create_headers() -> Dict[str, str]:
    """
    Randomize the User-Agent to avoid detection.
    """
    return {"User-Agent": random.choice(Config.USER_AGENTS)}


def _create_argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="earth911",
        description="Fetches data from earth911.com",
        epilog=f"Created by {__author__}",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    parser.add_argument("-n", "--number", type=int, help="Number of pages to scrape", default=3)
    parser.add_argument(
        "outfile",
        nargs="?",
        type=argparse.FileType("w+"),
        default=sys.stdout,
        help="CSV file to write output to",
    )
    return parser


def _create_writer(outfile: TextIO) -> csv.DictWriter:
    writer = csv.DictWriter(
        outfile,
        fieldnames=[
            "business_name",
            "street_address",
            "last_update_date",
            "materials_accepted",
            "url",
        ],
    )
    writer.writeheader()
    return writer


# Async Iterators
class CentreURLsIterator:
    """
    To be used as:
    >>> async for centre_links in CentreURLsIterator(1): ...
    >>> async for centre_links in CentreURLsIterator(1, 15): ...
    Where centre_links is a list of strings where each string is the URL to the
    "detail" page of a centre: from which we can find / extract more info: by
    making a request and parsing the response of each link.
    Note: the `start` & `end` params expect the `page` argument from the URL's query string,
    so the behaviour for passing 0 to `start` is undefined.
    """

    def __init__(self, start: int, end: Optional[int] = None):
        self.current = start
        self.end = end if end is not None else start

    @staticmethod
    def _make_search_url(page: int = 1) -> str:
        query_string = urlencode(Config.URL_PARAMS | {"page": page})
        return f"{Config.BASE_URL}?{query_string}"

    @staticmethod
    def _make_detail_url(href: str) -> str:
        parsed_url = urlparse(Config.BASE_URL + href)
        return parsed_url._replace(query="").geturl()

    def __aiter__(self):
        return self

    async def __anext__(self) -> Sequence[str]:
        if self.current > self.end:
            raise StopAsyncIteration
        url = self._make_search_url(self.current)
        try:
            headers = _create_headers()
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            # Extract hrefs from the response HTML
            hrefs: List[str] = []
            soup = BeautifulSoup(response.text, "html.parser")
            for result_item in soup.find_all("li", {"class": "result-item"}):
                href: str = result_item.find("h2", {"class": "title"}).find("a")["href"]  # type: ignore
                hrefs.append(self._make_detail_url(href))
            self.current += 1
            return hrefs
        except Exception as e:
            raise FetchError(f"Error fetching hrefs from URL '{url}': {e}", url) from e


class CentreDetailsIterator:
    """
    To be used as:
    >>> async for centre_details in CentreDetailsIterator(centre_links): ...
    Where centre_details is a RecyclingCentre object with the details of a centre,
    and centre_links is a list of strings where each string is the URL to the
    "detail" page of a centre: which you can get from CentreURLsIterator.
    """

    def __init__(self, centre_links: Sequence[str]):
        self.centre_links = centre_links
        self._idx = 0
        self._fetched = False
        self._items: Sequence[Union[RecyclingCentre, BaseException]] = []

    @staticmethod
    async def _get_centre_details(url: str) -> RecyclingCentre:
        headers = _create_headers()
        try:
            response = await Config.HTTP_CLIENT.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            detail_body = soup.find("div", {"id": "search-details"})
            if detail_body is None:
                raise ValueError(f"Could not find details for URL: {url}")
            header = detail_body.find("h1")  # type: ignore
            centre_name: str = list(header.children)[0].text  # type: ignore
            last_update_date: str = header.find("span", {"class": "last-verified"}).text.strip().split()[-1]  # type: ignore
            materials_accepted = detail_body.find_all("span", {"class": "material"})  # type: ignore
            materials_accepted = ", ".join(material.text.strip() for material in materials_accepted)
            address = " ".join(
                sub_address.text.strip()
                for sub_address in detail_body.find("div", {"class": "contact"}).find_all("p", {"class": "addr"})  # type: ignore
            )
            return {
                "url": url,
                "street_address": address,
                "business_name": centre_name,
                "last_update_date": last_update_date,
                "materials_accepted": materials_accepted,
            }
        except Exception as e:
            raise FetchError(f"Error fetching details from '{url}': {e}", url) from e

    async def get_all_details(self) -> Sequence[Union[RecyclingCentre, BaseException]]:
        """
        Fetches all details of the centres in the centre_links list.
        """
        if self._fetched:
            return self._items
        tasks = [self._get_centre_details(link) for link in self.centre_links]
        self._items = await asyncio.gather(*tasks, return_exceptions=True)
        self._fetched = True
        return self._items

    def __aiter__(self):
        return self

    async def __anext__(self) -> RecyclingCentre:
        if not self._fetched:
            await self.get_all_details()
        try:
            centre = self._items[self._idx]
            self._idx += 1
            if isinstance(centre, BaseException):
                assert isinstance(centre, FetchError), "Iterators should only throw FetchErrors"
                # Represent a failed fetch as a mostly empty row carrying the error message
                return {
                    "url": centre.url,
                    "street_address": "",
                    "last_update_date": "",
                    "materials_accepted": "",
                    "business_name": str(centre),
                }
            return centre
        except IndexError:
            raise StopAsyncIteration


async def main() -> int:
    parser = _create_argparser()
    args = parser.parse_args()

    outfile = args.outfile
    writer = _create_writer(outfile)
    number_of_pages: int = args.number

    async with httpx.AsyncClient(timeout=None) as client:
        Config.HTTP_CLIENT = client
        async for centre_links in CentreURLsIterator(1, number_of_pages):
            async for centre_details in CentreDetailsIterator(centre_links):
                writer.writerow(centre_details)
                outfile.flush()

    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
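
If the output was written to a file (say `centres.csv`, an illustrative name), the rows can be read back with the standard library's `csv` module. A minimal sketch, assuming that file exists:

```python
import csv

# Column names match the fieldnames used by _create_writer above.
with open("centres.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["business_name"], "-", row["materials_accepted"])
```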