Last active
September 28, 2017 08:04
-
-
Save najibninaba/fd22f805f91f745f980c78ddf739251b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
TripAdvisor hotel-review scraper.

Walks each listing URL in WEB_SITES page by page, pulling the hotel name,
address and up to REVIEWS_PER_PAGE reviews per page, and appends one
comma-separated record per review to TripAdviserReviews.csv.

Created on Fri Jul 28 23:56:40 2017
@author: Dustin Fontaine
"""
import os
import urllib.request

from bs4 import BeautifulSoup

# List the first page of the reviews (ends with "#REVIEWS") - separate the websites with ,
WEB_SITES = [
    "https://www.tripadvisor.com.sg/Hotel_Review-g60763-d93623-Reviews-Trump_International_Hotel_and_Tower_New_York-New_York_City_New_York.html",
]

# TripAdvisor paginates reviews five at a time; this is both the per-page
# loop bound and the offset stride used to build the next page's URL.
REVIEWS_PER_PAGE = 5

CSV_PATH = os.path.expanduser("TripAdviserReviews.csv")
CSV_HEADER = (
    b"Organization,Address,Reviewer,Review Title,Review,Review Count,Location,Rating Date,Rating"
    + b"\n"
)


def _scrape_site(out, first_page_url):
    """Scrape every review page reachable from *first_page_url* into *out*.

    *out* is a binary file object opened for writing.  Follows the
    "next" pagination link until none remains.  Commas, curly quotes and
    newlines are stripped from each field because the record is a naive
    comma-joined line, not RFC-4180 CSV.
    """
    review_offset = 0
    soup = BeautifulSoup(urllib.request.urlopen(first_page_url), "html.parser")
    while True:
        organization = (
            soup.find(attrs={"class": "header heading fr"})
            .text.replace('"', ' ').replace('Review of', ' ').strip()
        )
        address = (
            soup.findAll(attrs={"class": "street-address"})[0]
            .text.replace(',', '').replace('\n', '').strip()
        )
        # Loop through each review on the page.
        for x in range(REVIEWS_PER_PAGE):
            try:
                reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
            except IndexError:
                # Fewer than REVIEWS_PER_PAGE reviews on this page; skip
                # the missing slot rather than emit a partial record.
                continue
            reviewer = (
                reviewer.replace(',', ' ').replace('”', '')
                .replace('“', '').replace('"', '').strip()
            )
            review_count = (
                soup.findAll(attrs={"class": "badgetext"})[x]
                .text.split(' ', 1)[0].strip()
            )
            try:
                location = (
                    soup.findAll(attrs={"class": "location"})[x]
                    .text.replace(',', ' ').strip()
                )
            except IndexError:
                # Reviewers may hide their location; keep the record anyway.
                location = 'Unknown'
            review_title = (
                soup.findAll(attrs={"class": "quote"})[x]
                .text.replace(',', ' ').replace('”', '').replace('“', '')
                .replace('"', '').replace('é', 'e').strip()
            )
            review = (
                soup.findAll(attrs={"class": "entry"})[x]
                .text.replace(',', ' ').replace('\n', ' ').strip()
            )
            rating_date = (
                soup.findAll(attrs={"class": "ratingDate"})[x]
                .text.replace('Reviewed', ' ').replace('NEW', ' ')
                .replace(',', ' ').strip()
            )
            # The score is encoded in the last CSS class of the bubble-rating
            # element (e.g. "bubble_45" for 4.5); write that class token.
            rating_classes = (
                soup.findAll(attrs={"class": "rating reviewItemInline"})[x]
                .find(attrs={'class': 'ui_bubble_rating'})['class']
            )
            record = ",".join([
                organization, address, reviewer, review_title, review,
                review_count, location, rating_date, rating_classes[-1],
            ])
            out.write(bytes(record, encoding="ascii", errors='ignore') + b"\n")
        next_link = soup.find_all(attrs={"class": "nav next taLnk "})
        print(organization)
        if not next_link:
            break
        # Next page URL: splice "or<offset>-" into the first page's URL
        # right after the "Reviews-" segment.
        head, tail = first_page_url.split('Reviews-')
        review_offset += REVIEWS_PER_PAGE
        soup = BeautifulSoup(
            urllib.request.urlopen(head + 'Reviews-or' + str(review_offset) + '-' + tail),
            "html.parser",
        )


def main():
    """Create the CSV file and scrape every configured listing into it."""
    with open(CSV_PATH, "wb") as out:
        out.write(CSV_HEADER)
        for url in WEB_SITES:
            _scrape_site(out, url)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment