Created
August 16, 2019 09:26
-
-
Save ttresslar/650f0768b514446711e9b2c766c27e96 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import itertools
import re
import urllib.parse
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
def get_datelist(days=14):
    """Return the upcoming calendar dates as ``YYYY-MM-DD`` strings.

    The list starts with tomorrow (today itself is excluded) and contains
    one entry per consecutive day.

    Args:
        days: How many dates to return. Defaults to 14, the window that
            was previously hard-coded.

    Returns:
        A list of ``days`` date strings in ascending order; empty when
        ``days`` is 0 or negative.
    """
    today = datetime.date.today()
    return [
        (today + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(1, days + 1)
    ]
# Compute the search dates once; every later step of the script reads
# this module-level list.
print("getting date list")
date_list = get_datelist()
def get_soup(origin, destination, date):
    """Download one BuuPass search-results page and parse it.

    Args:
        origin: Value for the ``from`` query parameter. It is inserted
            into the URL verbatim, so it must already be URL-safe.
        destination: Value for the ``to`` query parameter (also verbatim).
        date: Value for the ``departure_date`` query parameter.

    Returns:
        A ``BeautifulSoup`` document built from the response body with
        the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not complete within the timeout.
    """
    base_url = "https://buupass.com/Booking/search?from={}&to={}&departure_date={}"
    url = base_url.format(origin, destination, date)
    # The second positional argument of requests.get is `params`; the
    # previous "html.parse" string was therefore appended to the query
    # string rather than selecting a parser, so it is dropped here.
    # The timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=30)
    return bs(response.content, features="lxml")
def get_ticket_info(soup):
    """Extract one record per ``<article>`` element of a parsed results page.

    Each record is built from a single article, so the fields of different
    listings can never be paired with each other by accident.

    Args:
        soup: Parsed search-results page (as returned by ``get_soup``).

    Returns:
        A list of dicts, one per article, with the keys ``bus_co``,
        ``origin``, ``destination``, ``price`` (digit string),
        ``seats_remaining`` (int), ``created_at`` (``datetime``), followed
        by one key per label found in the article's ``time`` section.

    Raises:
        ValueError: If an article heading does not have at least three
            " - "-separated parts.
        AttributeError: If an article lacks one of the expected elements.
        IndexError: If the price text contains no digits.
    """
    digits = re.compile(r"\d+")
    records = []
    for article in soup.findAll("article"):
        # The heading is read as "<origin> - <destination> - <bus company>".
        # maxsplit=2 keeps any further " - " inside the company name; the
        # previous maxsplit=3 could yield four parts and break the unpacking.
        origin, dest, bus = article.h4.get_text().split(" - ", 2)

        price_text = article.find("span", {"class": "price listprice"}).get_text()
        price = digits.findall(price_text)[0]

        action_text = article.find("div", {"class": "action"}).get_text()
        action_text = action_text.strip().replace("\n", "").replace("SOLD OUT", "0")
        seat_digits = digits.findall(action_text)
        # No number in the action box is treated as zero seats left.
        seats = int(seat_digits[0]) if seat_digits else 0

        time_box = article.find("div", {"class": "time"})
        labels = [
            span.get_text().strip()
            for span in time_box.findAll("span", {"class": "skin-color"})
        ]
        values = [
            span.get_text().strip()
            for span in time_box.findAll("span", {"class": "search_data_values"})
        ]

        record = {
            "bus_co": bus,
            "origin": origin,
            "destination": dest,
            "price": price,
            "seats_remaining": seats,
            "created_at": datetime.datetime.today(),
        }
        # Label/value pairs come last so they win on a key clash, exactly
        # as in the former {**route, **meta, **finale} merge.
        record.update(zip(labels, values))
        records.append(record)
    return records
# Seed the result frame with a single known route (Nairobi -> Arusha) on
# the first search date; later results are added to this frame.
print("First Iteration")
df = pd.DataFrame(get_ticket_info(get_soup("Nairobi", "Arusha", date_list[0])))
# Record which search produced these rows.
for _column, _value in (
    ("search_origin", "Nairobi"),
    ("search_destination", "Arusha"),
    ("search_date", date_list[0]),
):
    df[_column] = _value
def get_places(soup):
    """Build the origin/destination pairs offered by the page's search form.

    The ``value`` attributes of the ``<option>`` elements in the first
    ``<select>`` are used as origins and those in the second ``<select>``
    as destinations. Empty or missing values are dropped and the rest are
    URL-quoted so they can be placed directly into a query string.

    Args:
        soup: Parsed page containing at least two ``<select>`` elements.

    Returns:
        A list of ``[origin, destination]`` lists covering every
        combination in which origin and destination differ.

    Raises:
        IndexError: If the page has fewer than two ``<select>`` elements.
    """
    selects = soup.find_all("select")

    def quoted_values(select):
        # .get() lets an <option> without a value attribute be skipped
        # instead of raising KeyError.
        raw_values = (option.get("value") for option in select.findAll("option"))
        return [urllib.parse.quote(value) for value in raw_values if value]

    origins = quoted_values(selects[0])
    destinations = quoted_values(selects[1])

    # Every origin is combined with every destination. The previous zip()
    # only paired entries sharing a list position, and its `is not` test
    # compared object identity, so same-city pairs were not filtered out.
    return [
        [origin, destination]
        for origin, destination in itertools.product(origins, destinations)
        if origin != destination
    ]
# Read the available origin/destination pairs from the search form.
_from_to = get_places(get_soup("Nairobi", "Arusha", date_list[0]))

# One [origin, destination, date] job per route and search date, grouped
# by date. get_places returns a plain list, so the jobs are built with a
# comprehension; the previous DataFrame-style code (temp['date'],
# temp.values) raised TypeError on it.
_from_to_date = [
    [origin, destination, date]
    for date in date_list
    for origin, destination in _from_to
]

# Output file is named after the day the script runs.
csv_name = "./scraped"+datetime.datetime.today().strftime("%Y-%m-%d")+".csv"
startTime = datetime.datetime.now()
def df_loops(_list):
    """Scrape one search and return its tickets as a DataFrame.

    Args:
        _list: A three-item sequence ``(origin, destination, date)``.

    Returns:
        A DataFrame of the tickets found for that search, with the
        ``search_origin``, ``search_destination`` and ``search_date``
        columns set to the values that were searched for.
    """
    origin, destination, date = _list
    tickets = get_ticket_info(get_soup(origin, destination, date))
    frame = pd.DataFrame(tickets)
    # Tag every row with the query that produced it.
    for column, value in (
        ("search_origin", origin),
        ("search_destination", destination),
        ("search_date", date),
    ):
        frame[column] = value
    return frame
print("Starting loop at " + str(startTime))
# The jobs are network-bound, so a thread pool overlaps the waiting time.
# pool.map returns only once every job has finished.
with ThreadPool(10) as pool:
    results = pool.map(df_loops, _from_to_date)

# Combine everything in one concat: DataFrame.append was removed in
# pandas 2.0, and appending frame by frame re-copied the data each time.
df = pd.concat([df, *results], ignore_index=True, sort=True)
# Keep a raw snapshot on disk before the clean-up steps below.
df.to_csv(csv_name)
print("It took "+str(datetime.datetime.now() - startTime)+" to run this script")
df.drop_duplicates(keep="first", inplace=True)
#make columns lowercase so that I can easily put them in a database if needed.
df.columns = [column.lower() for column in df.columns]
df.to_csv(csv_name)
print("Final write of csv")
#df.to_sql('buupass', engine, if_exists='append', index=False)
#engine.dispose()
#print("Saved to DB")
print("Finished")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I was searching for web scraping and found this gem. I will study it more deeply. I'm new to Python, and I love scraping everything. Multithreading is a good idea!