Last active
April 20, 2019 02:51
-
-
Save Omar-Salem/e936754e19d8060771770433f63582d2 to your computer and use it in GitHub Desktop.
flatmates crawler/filter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup
from haversine import haversine
def read_data(path='data.json'):
    """Return the raw text of the listings dump.

    Args:
        path: File to read. Defaults to ``data.json`` in the current working
            directory, the location this script has always used.

    Returns:
        The complete file contents as a string; no parsing is done here.
    """
    with open(path, 'r') as source:
        return source.read()
def filter_data(data, center=(-37.806914398397005, 144.9416857585693),
                max_distance=7):
    """Collect the links of listings that lie close enough to ``center``.

    Args:
        data: JSON text whose top-level object has a ``"matches"`` list. Each
            match is read for ``"latitude"``, ``"longitude"`` and
            ``"listing_link"``.
        center: ``(latitude, longitude)`` reference point. The default is the
            point the script was originally hard-coded to.
        max_distance: Largest accepted distance (inclusive), expressed in
            whatever unit ``haversine`` returns for a two-argument call.

    Returns:
        A list with the ``"listing_link"`` value of every match within
        ``max_distance`` of ``center``, in input order.
    """
    nearby_links = []
    for match in json.loads(data)["matches"]:
        latitude = match.get("latitude")
        longitude = match.get("longitude")
        if latitude is None or longitude is None:
            # Without a position the distance test cannot be applied; skip
            # the match instead of letting haversine() fail on None.
            continue
        if haversine(center, (latitude, longitude)) <= max_distance:
            nearby_links.append(match["listing_link"])
    return nearby_links
def crawl_listings(listings, max_roommates=2):
    """Fetch each listing page and keep the ones that pass both checks.

    A listing is kept when its page contains the text ``Free to message`` and
    the third feature ``div`` (which this script treats as the roommate
    count) parses to an integer no greater than ``max_roommates``.

    Args:
        listings: Iterable of listing paths; each one is appended to
            ``https://flatmates.com.au`` to form the URL to fetch.
        max_roommates: Highest accepted roommate count (inclusive).

    Returns:
        A list of the full URLs that passed, in input order. Each accepted
        URL is also printed, followed by a separator line.

    Errors raised while opening a URL are not handled here and propagate to
    the caller.
    """
    feature_class = "styles__value___V-EvS styles__iconPresent___2jaGH"
    accepted = []
    for link in listings:
        url = "https://flatmates.com.au" + link
        # urllib2 responses are not context managers, so closing() is what
        # guarantees the connection is released once the page is parsed.
        with closing(urllib2.urlopen(url)) as page:
            soup = BeautifulSoup(page, 'html.parser')
        if 'Free to message' not in str(soup):
            continue
        features = soup.findAll("div", {"class": feature_class})
        try:
            roommates_count = int(features[2].getText())
        except (IndexError, ValueError):
            # Fewer feature divs than expected, or a non-numeric value: the
            # roommate check cannot be applied, so skip this listing rather
            # than abort the whole crawl.
            continue
        if roommates_count > max_roommates:
            continue
        print(url)
        print("----------")
        accepted.append(url)
    return accepted
def write_listings(listings, path='output.txt'):
    """Write the given listing URLs to a text file, one per line.

    Args:
        listings: Iterable of strings to write.
        path: Destination file. Defaults to ``output.txt`` in the current
            working directory, the location this script has always used.
            Any existing content is replaced.

    Returns:
        None.
    """
    # Plain 'w' is enough: the file is only written here, never read back.
    with open(path, 'w') as output:
        output.writelines(item + '\n' for item in listings)
# Pipeline: load the dump, keep the listings that pass the distance filter,
# crawl those pages, then save the URLs that passed the crawl checks.
data = read_data()
arr_filtered = filter_data(data)
listings = crawl_listings(arr_filtered)
write_listings(listings)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment