Simple scraper for looking through a bunch of saved Craigslist listings for a phone number
# Conda environment file -- these packages are required for this script to work
name: craigslist
dependencies:
  - lxml
  - pip
  - python=3.4*
  - requests
  - pip:
    - aiohttp
    - pyquery
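The file above is the gist's conda environment file. Assuming it is saved as environment.yml (the filename isn't shown here), the environment can be created with "conda env create -f environment.yml" and then activated with "source activate craigslist" ("conda activate craigslist" on newer conda releases) before running the scraper.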
""" | |
This script lets you scan all of the listings for bikes on a collection of | |
Craiglist postings to look for matching phone numbers. Note, this should be | |
used at your own risk, et cetra, and so on. | |
To use this, create a directory that contains saved files from Craigslist's | |
main listing (or search results) for the bike category. The default is to | |
use ~/Desktop/bikes/. | |
Once you've gathered the necessary saved data, you can run this script with | |
the following command: | |
PHONE_NUMBER=5553334444 python scraper.py | |
Replace the value for PHONE_NUMBER with whatever you are searching for. I had | |
a situation where I knew a stolen bike had been listed with a given phone | |
number, so that's what this searches. The script could be modified to look at | |
other values as well. | |
""" | |

import aiohttp
import asyncio
import os
import sys

from pyquery import PyQuery as pq

# Only allow two HTTP requests to be in flight at once.
sem = asyncio.Semaphore(2)
""" | |
## Configuration | |
These values can be configured via environment variables. It's required that | |
you include PHONE_NUMBER, but other values include: | |
* CURRENT_CITY -- the name of the Craigslist city you want to search | |
* HTML_DIR -- the location of the saved HTML files | |
""" | |
SEARCH_NUMBER = os.getenv("PHONE_NUMBER", False)
if SEARCH_NUMBER is False:
    sys.stderr.write("Unable to search without a phone number.\n\n")
    sys.stderr.write("Please re-run like this:\n")
    sys.stderr.write(" PHONE_NUMBER=5553334444 python scraper.py\n\n")
    sys.exit(-1)

CURRENT_CITY = os.getenv("CURRENT_CITY", "losangeles")
BASE_CRAIGSLIST = "http://{city}.craigslist.org".format(city=CURRENT_CITY)
BIKE_TEMPLATE = "{url}/search/sgv/bik".format(url=BASE_CRAIGSLIST)
DIRECTORY = os.getenv("HTML_DIR", os.path.expanduser("~/Desktop/bikes"))

@asyncio.coroutine
def get(url):
    response = yield from aiohttp.request('GET', url)
    return (yield from response.read_and_close())
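
# Note: aiohttp.request() / read_and_close() is the old aiohttp 0.x API,
# which matches the python=3.4 pin in the environment file. On Python 3.5+
# with a current aiohttp release, a rough (untested) equivalent would be:
#
#     async def get(url):
#         async with aiohttp.ClientSession() as session:
#             async with session.get(url) as response:
#                 return await response.read()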

def out(s):
    sys.stdout.write(s)
    sys.stdout.flush()


def find_bikes():
    files = os.listdir(DIRECTORY)
    print("Checking %d files" % len(files))
    for file in files:
        with open(os.path.join(DIRECTORY, file)) as f:
            doc = pq(f.read())
            # Listing titles on a saved search-results page use the hdrlnk class.
            possible_links = doc.find(".content .row a.hdrlnk")
            for link in possible_links:
                del link.attrib["class"]
                link.attrib["name"] = link.text.strip()
                yield link.attrib

@asyncio.coroutine
def lookup_reply(bike):
    url = "{base}/reply/lax/bik/{id}".format(base=BASE_CRAIGSLIST,
                                             id=bike["data-id"])
    # Hold the semaphore only while the request is in flight.
    with (yield from sem):
        page = yield from get(url)
    doc = pq(page)
    try:
        phone = doc.find(".reply_options > ul")[2].find("li")
    except IndexError:
        out("-")
        return False
    if phone.text is None:
        out("-")
        return False
    phone = (phone.text[1:]          # Strip off the leading telephone symbol
             .strip()                # Clear whitespace
             .replace("-", ""))      # Make it a solid number
    out(".")
    if phone == SEARCH_NUMBER:
        out("!X!")
        return bike
    return False

@asyncio.coroutine
def process_bikes(bikes):
    matches = []
    coroutines = [lookup_reply(bike) for bike in bikes]
    for coroutine in asyncio.as_completed(coroutines):
        result = yield from coroutine
        if result is False:
            continue
        matches.append(result)

    print()
    print("Found %d matches" % len(matches))
    print("-" * 80)
    for match in matches:
        print(match)

if __name__ == "__main__":
    bikes = find_bikes()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(process_bikes(bikes))
    except KeyboardInterrupt:
        print("Ctrl+C caught, stopping")
        sys.exit(0)
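
While the script runs, each "." printed is a listing whose reply page was checked, each "-" is a listing where no phone number could be extracted, and "!X!" marks a hit; matching listings are printed at the end as their link attributes (the data-id, the name taken from the link text, and whatever else was on the anchor tag).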