|
from lxml import html |
|
from sys import argv |
|
import requests |
|
|
|
# Search-results page that get_listings() downloads and scrapes.
# NOTE(review): the query string contains a second "?" ("bedrooms=2?format=rss"),
# so "?format=rss" travels as part of the bedrooms value instead of being a
# separate parameter -- confirm the intent before changing it, since the
# scraper parses the response as HTML.
WATCH_URL = (
    "http://sfbay.craigslist.org/search/apa"
    "?search_distance=1"
    "&postal=94720"
    "&max_price=3300"
    "&bedrooms=2?format=rss"
)

# Prefix joined to each listing's href to form its full link.
BASE_URL = "http://sfbay.craigslist.org"
|
|
|
class Listing:
    """Plain record holding the fields scraped for one search result.

    All constructor arguments are stored unchanged on attributes of the
    same name. Values are expected to be strings, but any of them may be
    ``None`` when the scraper could not find the corresponding element.
    """

    def __init__(self, _id, date, title, price, housing, link):
        """Store the given field values on the instance.

        Args:
            _id: Identifier of the listing.
            date: Posting date text.
            title: Listing title text.
            price: Price text.
            housing: Housing description text.
            link: URL of the listing.
        """
        self._id = _id
        self.link = link
        self.date = date
        self.title = title
        self.price = price
        self.housing = housing

    def __repr__(self):
        """Return ``Listing(<_id>, <date>, <title>, <price>, <housing>, <link>)``."""
        # str.format() instead of "+" concatenation: a None field is rendered
        # as "None" rather than raising TypeError, while string fields produce
        # exactly the same text as before.
        return "Listing({}, {}, {}, {}, {}, {})".format(
            self._id, self.date, self.title, self.price, self.housing, self.link
        )
|
|
|
def get_listings(watch_url, base_url):
    """Download the page at ``watch_url`` and scrape it into ``Listing`` objects.

    Args:
        watch_url: URL of the search-results page to download.
        base_url: Prefix prepended to each listing's ``href`` to build its link.

    Returns:
        A list of ``Listing`` objects, one for every child of the results
        container except its first and last child, which are skipped.

    Raises:
        IndexError: If the results container is missing from the page, or a
            row lacks one of the child elements that are looked up by position.
        requests.RequestException: If the download fails or times out.
    """
    # Fetch the URL the caller asked for (previously the module-level constant
    # was used and the parameter ignored). The timeout keeps a stalled server
    # from blocking the script forever.
    page = requests.get(watch_url, timeout=30)
    tree = html.fromstring(page.content)

    containers = tree.xpath('//*[@id="searchform"]/div[4]')
    if not containers:
        # Same exception type the bare "[0]" lookup used to raise, but with a
        # message that says what actually went wrong.
        raise IndexError(
            "search results container not found in page at {!r}".format(watch_url)
        )

    # Drop the first and last child of the container; only the children in
    # between are treated as listing rows.
    rows = containers[0][1:-1]

    listings = []
    for row in rows:
        # Fields are located purely by child position, so this depends on the
        # exact markup of the page -- presumably it matched the site layout
        # when written; verify against the live page.
        details = row[1]
        header = details[1]
        meta = details[2]

        _id = row.get("data-pid")
        date = header[0].get("title")
        title = header[1][0].text
        price = meta[0].text
        housing = meta[1].text
        link = base_url + row[0].get("href")

        listings.append(Listing(_id, date, title, price, housing, link))

    return listings
|
|
|
def write_listings(of):
    """Fetch the watched listings and write one text record per listing to ``of``.

    Each record is five lines -- title, link, date, price, housing -- followed
    by one blank line. A field whose value is ``None`` is written as an empty
    line so the record keeps its five-line shape.

    Args:
        of: Writable text file-like object; only its ``write`` method is used.
    """
    for listing in get_listings(WATCH_URL, BASE_URL):
        fields = (
            listing.title,
            listing.link,
            listing.date,
            listing.price,
            listing.housing,
        )
        # Assemble the whole record before writing: a None field used to raise
        # TypeError after earlier lines had already been written, leaving a
        # truncated record in the output.
        record = "".join(
            ("" if value is None else value) + "\n" for value in fields
        )
        of.write(record + "\n")
|
|
|
# Script entry: append the listings to the file named by the first
# command-line argument, or, when no argument is given, report that and print
# the listings to stdout instead.
# The try body covers only the argv lookup, so an IndexError raised while
# scraping is no longer misreported as a missing argument.
try:
    output_file = argv[1]
except IndexError:
    print("Missing argument: output file name")
    print(get_listings(WATCH_URL, BASE_URL))
else:
    # An empty-string argument is ignored, as before.
    if output_file:
        # "with" closes the file even if fetching or writing raises.
        with open(output_file, 'a') as of:
            write_listings(of)