Created June 14, 2020 20:54
from bs4 import BeautifulSoup
from selenium import webdriver
import random
import time

file1 = open("Hotel_List.txt", "w")

headers = {
    'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'en-US,en;q=0.5',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
    'Pragma': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
    'X-Requested-With': 'XMLHttpRequest'
}

UAS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'
)

# pick a random User-Agent for the request headers
ua = UAS[random.randrange(len(UAS))]
headers.update({'User-Agent': ua})

driver = webdriver.Chrome('/Users/bhushan/Downloads/chromedriver')
all_url = []


def get_hotel_name(soup):
    for a in soup.findAll('h1', {'class': 'detail-baseinfo_name'}):
        name = a.get_text().strip()
        print("Hotel Name: ", name)
        file1.write("Hotel Name: \t%s\n" % name)
    print("~" * 50)


def get_room_details(soup):
    """
    Method to get all details about a specific hotel.
    :return:
    """
    for a in soup.findAll('div', {'class': 'roomlist-baseroom-card'}):
        for b in a.findAll('div', {'class': 'roomname'}):
            print("Room Name: ", b.get_text().strip())
            file1.write("Room Name: \t%s\n" % b.get_text().strip())
        for b in a.findAll('div', {'class': 'roomcard'}):
            for c in b.findAll('div', {'class': 'salecard-flex'}):
                for d in c.findAll('div', {'class': 'salecard-bedfacility'}):
                    for e in d.findAll('div', {'class': 'facility'}):
                        for f in e.findAll('span', {'class': 'desc-text underline'}):
                            print("Amenities: ", f.get_text().strip())
                            file1.write("Amenities: \t%s\n" % f.get_text().strip())
                for d in c.findAll('div', {'class': 'bed'}):
                    for e in d.findAll('div', {'class': 'bed-content'}):
                        for f in e.findAll('span', {'class': 'underline'}):
                            print("Bed Type: ", f.get_text().strip())
                            file1.write("Bed Type: \t%s\n" % f.get_text().strip())
                for d in c.findAll('div', {'class': 'salecard-price'}):
                    for e in d.findAll('div', {'class': 'salecard-price-panel'}):
                        for f in e.findAll('div', {'class': 'note'}):
                            print("Price: ", f.get_text().strip())
                            file1.write("Price: \t%s\n" % f.get_text().strip())
    print('~' * 100)
    file1.write('\n')


def get_data():
    """
    Method to get the data from trip.com given checkin_date, checkout_date, city, no_of_people.
    :return:
    """
    url = 'https://www.trip.com/hotels/list?city=1&countryId=1&checkin=2020/11/18&checkout=2020/11/24&optionId=1' \
          '&optionType=City&display=Beijing&crn=1&adult=2&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl' \
          '&domestic=1'
    driver.get(url)
    time.sleep(2)
    # click every hotel card on the result page; each click opens the hotel in a new tab,
    # so remember the newest window handle
    window = []
    for i in driver.find_elements_by_class_name('list-card-title'):
        i.click()
        window.append(driver.window_handles[-1])
    # switch to each opened tab, record its url, and scrape the hotel name and room details
    for i in window:
        driver.switch_to.window(i)
        all_url.append(driver.current_url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        get_hotel_name(soup)
        get_room_details(soup)


if __name__ == '__main__':
    get_data()
    # flush the output file and close the browser when done
    file1.close()
    driver.quit()
Download chromedriver from https://sites.google.com/a/chromium.org/chromedriver/downloads and update its path in the script at https://gist.github.com/BHushanRathod/d7942229914b04dfa7fb076efda011fb#file-trip_scrapper-py-L34.
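A minimal sketch of that change, assuming the chromedriver binary has been unpacked somewhere on your machine (the CHROMEDRIVER_PATH environment variable is only an example, not something the script defines):

    import os
    from selenium import webdriver

    # Point Selenium at your local chromedriver binary; adjust the fallback path
    # to wherever you saved it. CHROMEDRIVER_PATH is an optional override.
    chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', '/Users/bhushan/Downloads/chromedriver')
    driver = webdriver.Chrome(executable_path=chromedriver_path)

Reading the location from an environment variable avoids editing the script on every machine where it runs.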