Simple scraper for looking through a bunch of saved Craigslist listings for a phone number
# Conda environment file -- these packages are required for this script to work
name: craigslist
dependencies:
  - lxml
  - pip
  - python=3.4*
  - requests
  - pip:
    - aiohttp
    - pyquery
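The file above is the gist's conda environment file. Assuming it is saved as environment.yml (the filename isn't shown here), the environment can be created with "conda env create -f environment.yml" and then activated with "source activate craigslist" ("conda activate craigslist" on newer conda releases) before running the scraper.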
""" | |
This script lets you scan all of the listings for bikes on a collection of | |
Craiglist postings to look for matching phone numbers. Note, this should be | |
used at your own risk, et cetra, and so on. | |
To use this, create a directory that contains saved files from Craigslist's | |
main listing (or search results) for the bike category. The default is to | |
use ~/Desktop/bikes/. | |
Once you've gathered the necessary saved data, you can run this script with | |
the following command: | |
PHONE_NUMBER=5553334444 python scraper.py | |
Replace the value for PHONE_NUMBER with whatever you are searching for. I had | |
a situation where I knew a stolen bike had been listed with a given phone | |
number, so that's what this searches. The script could be modified to look at | |
other values as well. | |
""" | |

import aiohttp
import asyncio
import os
import sys

from pyquery import PyQuery as pq

# Only allow two HTTP requests to be in flight at once.
sem = asyncio.Semaphore(2)
""" | |
## Configuration | |
These values can be configured via environment variables. It's required that | |
you include PHONE_NUMBER, but other values include: | |
* CURRENT_CITY -- the name of the Craigslist city you want to search | |
* HTML_DIR -- the location of the saved HTML files | |
""" | |
SEARCH_NUMBER = os.getenv("PHONE_NUMBER", False)
if SEARCH_NUMBER is False:
    sys.stderr.write("Unable to search without a phone number.\n\n")
    sys.stderr.write("Please re-run like this:\n")
    sys.stderr.write(" PHONE_NUMBER=5553334444 python scraper.py\n\n")
    sys.exit(-1)

CURRENT_CITY = os.getenv("CURRENT_CITY", "losangeles")
BASE_CRAIGSLIST = "http://{city}.craigslist.org".format(city=CURRENT_CITY)
BIKE_TEMPLATE = "{url}/search/sgv/bik".format(url=BASE_CRAIGSLIST)
DIRECTORY = os.getenv("HTML_DIR", os.path.expanduser("~/Desktop/bikes"))

@asyncio.coroutine
def get(url):
    response = yield from aiohttp.request('GET', url)
    return (yield from response.read_and_close())
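
# Note: aiohttp.request() / read_and_close() is the old aiohttp 0.x API,
# which matches the python=3.4 pin in the environment file. On Python 3.5+
# with a current aiohttp release, a rough (untested) equivalent would be:
#
#     async def get(url):
#         async with aiohttp.ClientSession() as session:
#             async with session.get(url) as response:
#                 return await response.read()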

def out(s):
    sys.stdout.write(s)
    sys.stdout.flush()


def find_bikes():
    files = os.listdir(DIRECTORY)
    print("Checking %d files" % len(files))
    for file in files:
        with open(os.path.join(DIRECTORY, file)) as f:
            doc = pq(f.read())
            # Listing titles on a saved search-results page use the hdrlnk class.
            possible_links = doc.find(".content .row a.hdrlnk")
            for link in possible_links:
                del link.attrib["class"]
                link.attrib["name"] = link.text.strip()
                yield link.attrib

@asyncio.coroutine
def lookup_reply(bike):
    url = "{base}/reply/lax/bik/{id}".format(base=BASE_CRAIGSLIST,
                                             id=bike["data-id"])
    # Hold the semaphore only while the request is in flight.
    with (yield from sem):
        page = yield from get(url)
    doc = pq(page)
    try:
        phone = doc.find(".reply_options > ul")[2].find("li")
    except IndexError:
        out("-")
        return False
    if phone.text is None:
        out("-")
        return False
    phone = (phone.text[1:]          # Strip off the leading telephone symbol
             .strip()                # Clear whitespace
             .replace("-", ""))      # Make it a solid number
    out(".")
    if phone == SEARCH_NUMBER:
        out("!X!")
        return bike
    return False

@asyncio.coroutine
def process_bikes(bikes):
    matches = []
    coroutines = [lookup_reply(bike) for bike in bikes]
    for coroutine in asyncio.as_completed(coroutines):
        result = yield from coroutine
        if result is False:
            continue
        matches.append(result)

    print()
    print("Found %d matches" % len(matches))
    print("-" * 80)
    for match in matches:
        print(match)

if __name__ == "__main__":
    bikes = find_bikes()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(process_bikes(bikes))
    except KeyboardInterrupt:
        print("Ctrl+C caught, stopping")
        sys.exit(0)
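
While the script runs, each "." printed is a listing whose reply page was checked, each "-" is a listing where no phone number could be extracted, and "!X!" marks a hit; matching listings are printed at the end as their link attributes (the data-id, the name taken from the link text, and whatever else was on the anchor tag).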