-
-
Save SivaArwin/44ea4687b71abd11e16e021ce38735f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _leg_texts(xpath):
    """Return the text of every element matching *xpath*, split by leg.

    The result is an ``(outbound, inbound)`` pair: matches at even positions
    go to the first list and matches at odd positions to the second, because
    the page interleaves the two legs of each result.
    """
    # 'xpath' is the locator-strategy string behind selenium's By.XPATH.
    # find_elements(by, value) is used instead of find_elements_by_xpath
    # because the latter was removed in Selenium 4.3.
    texts = [element.text for element in driver.find_elements('xpath', xpath)]
    return texts[::2], texts[1::2]


def _split_sections(section_texts):
    """Split each section text into ``(durations, city_names)`` lists.

    The first two whitespace-separated tokens are joined to form the duration
    and the next three are joined to form the cities string.
    """
    durations = []
    names = []
    for text in section_texts:
        tokens = text.split()
        durations.append(''.join(tokens[0:2]))
        names.append(''.join(tokens[2:5]))
    return durations, names


def page_scrape():
    """Scrape the currently loaded results page into a DataFrame.

    Reads durations/cities, dates, prices, stops, stop cities, departure and
    arrival times and airlines for both legs from the module-level ``driver``.
    The letter A is used for the outbound flight and B for the inbound one.

    Returns:
        A ``pandas.DataFrame`` with one row per result, the ``Out ...`` /
        ``Return ...`` / ``Price`` columns, plus a ``timestamp`` column
        holding the scrape time formatted as ``%Y%m%d-%H%M``.

    Raises:
        SystemExit: if no duration sections are found on the page.
        ValueError: if a price is not an integer once ``$`` and ``,`` are
            removed, or if the scraped columns have different lengths.
        IndexError: if a date, stops or schedule text has fewer parts than
            expected.
    """
    section_a_list, section_b_list = _leg_texts('//*[@class="section duration"]')

    # if you run into a reCaptcha, you might want to do something about it
    # you will know there's a problem if the lists above are empty
    # this check lets you exit the bot or do something else
    # you can add a sleep here, to let you solve the captcha and continue scraping
    # SystemExit is used so that everything can be tested from the start
    if not section_a_list:
        raise SystemExit

    # Separate the time from the cities
    a_duration, a_section_names = _split_sections(section_a_list)
    b_duration, b_section_names = _split_sections(section_b_list)

    a_date_list, b_date_list = _leg_texts('//div[@class="section date"]')
    # Each date text is split on whitespace: the first token fills the
    # "Day" column and the second token fills the "Weekday" column.
    a_date_parts = [value.split() for value in a_date_list]
    b_date_parts = [value.split() for value in b_date_list]
    a_day = [parts[0] for parts in a_date_parts]
    a_weekday = [parts[1] for parts in a_date_parts]
    b_day = [parts[0] for parts in b_date_parts]
    b_weekday = [parts[1] for parts in b_date_parts]

    # getting the prices
    xp_prices = '//a[@class="booking-link"]/span[@class="price option-text"]'
    price_texts = [price.text for price in driver.find_elements('xpath', xp_prices)]
    # Drop '$' and also ',' so that a price with a thousands separator
    # (e.g. "$1,234") converts instead of making int() raise ValueError.
    prices_list = [
        int(text.replace('$', '').replace(',', ''))
        for text in price_texts
        if text != ''
    ]

    # the stops are a big list with one leg on the even index and second leg on odd index
    a_stop_texts, b_stop_texts = _leg_texts('//div[@class="section stops"]/div[1]')
    # Keep only the first character; a leading 'n' is recorded as '0'.
    a_stop_list = [text[0].replace('n', '0') for text in a_stop_texts]
    b_stop_list = [text[0].replace('n', '0') for text in b_stop_texts]

    a_stop_name_list, b_stop_name_list = _leg_texts(
        '//div[@class="section stops"]/div[2]'
    )

    # this part gets the airline company and the departure and arrival times, for both legs
    a_schedules, b_schedules = _leg_texts('//div[@class="section times"]')
    # First line of each schedule text is the hours, second line the carrier.
    a_lines = [text.split('\n') for text in a_schedules]
    b_lines = [text.split('\n') for text in b_schedules]
    a_hours = [lines[0] for lines in a_lines]
    a_carrier = [lines[1] for lines in a_lines]
    b_hours = [lines[0] for lines in b_lines]
    b_carrier = [lines[1] for lines in b_lines]

    cols = [
        'Out Day', 'Out Time', 'Out Weekday', 'Out Airline', 'Out Cities',
        'Out Duration', 'Out Stops', 'Out Stop Cities',
        'Return Day', 'Return Time', 'Return Weekday', 'Return Airline',
        'Return Cities', 'Return Duration', 'Return Stops', 'Return Stop Cities',
        'Price',
    ]

    data = {
        'Out Day': a_day,
        'Out Weekday': a_weekday,
        'Out Duration': a_duration,
        'Out Cities': a_section_names,
        'Return Day': b_day,
        'Return Weekday': b_weekday,
        'Return Duration': b_duration,
        'Return Cities': b_section_names,
        'Out Stops': a_stop_list,
        'Out Stop Cities': a_stop_name_list,
        'Return Stops': b_stop_list,
        'Return Stop Cities': b_stop_name_list,
        'Out Time': a_hours,
        'Out Airline': a_carrier,
        'Return Time': b_hours,
        'Return Airline': b_carrier,
        'Price': prices_list,
    }
    # columns= fixes the column order at construction time.
    flights_df = pd.DataFrame(data, columns=cols)

    flights_df['timestamp'] = strftime("%Y%m%d-%H%M")  # so we can know when it was scraped
    return flights_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment