Last active
June 1, 2019 14:16
-
-
Save adityajn105/a1a363f0df44571cb10eeecbbc4d3d2a to your computer and use it in GitHub Desktop.
A web scraping tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the Sitejabber review page for dream11.com and collect each review's
# title, author profile URL and body text into a pandas DataFrame named `df`.
from bs4 import BeautifulSoup
import urllib3
import re
import pandas as pd

http = urllib3.PoolManager()
link = "https://www.sitejabber.com/reviews/dream11.com"

# Make the HTTP GET request.
r = http.request('GET', link)
# Fail loudly on an error response: otherwise an error/blocked page would be
# parsed below and silently yield an empty DataFrame.
if r.status >= 400:
    raise RuntimeError(
        "GET {} failed with HTTP status {}".format(link, r.status)
    )

# Parse the full HTML payload (requires the lxml parser to be installed).
soup = BeautifulSoup(r.data, 'lxml')

# Review bodies: every <p> tag whose id starts with "ReviewText".
reviews = [
    tag.text
    for tag in soup.find_all("p", {"id": re.compile(r'^ReviewText')})
]

# Review titles: <div class="review_title"> elements; the first match is
# skipped because it is not needed.
titles = soup.find_all("div", {"class": "review_title"})[1:]
# Take the text of the link inside each title and drop its first and last
# character (presumably surrounding quotes/whitespace -- TODO confirm
# against the live markup).
titles = [tag.a.text[1:-1] for tag in titles]

# Author profile URLs: the href of the link inside each
# <div class="author_name">, prefixed with the site origin.
authors = [
    "https://www.sitejabber.com" + tag.a['href']
    for tag in soup.find_all("div", {"class": 'author_name'})
]

# The three lists are matched up by position, so they must be the same
# length; report the counts instead of relying on pandas' generic error.
if not len(titles) == len(authors) == len(reviews):
    raise ValueError(
        "scraped columns differ in length: {} titles, {} authors, "
        "{} reviews".format(len(titles), len(authors), len(reviews))
    )

# Create the DataFrame, one row per review.
df = pd.DataFrame({'title': titles, 'author': authors, 'review': reviews})
# In a notebook/REPL this displays the first rows; when run as a plain
# script the returned frame is simply discarded.
df.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment