Skip to content

Instantly share code, notes, and snippets.

@adityajn105
Last active June 1, 2019 14:16
Show Gist options
  • Save adityajn105/a1a363f0df44571cb10eeecbbc4d3d2a to your computer and use it in GitHub Desktop.
Save adityajn105/a1a363f0df44571cb10eeecbbc4d3d2a to your computer and use it in GitHub Desktop.
A web scraping tutorial
from bs4 import BeautifulSoup
import urllib3
import re
import pandas as pd
# Scrape the reviews listed on a sitejabber page into a pandas DataFrame
# with one row per review and the columns: title, author (profile URL), review.

# Connection pool used for the HTTP request below.
http = urllib3.PoolManager()
link = "https://www.sitejabber.com/reviews/dream11.com"

# Make the HTTP GET request for the review page.
r = http.request('GET', link)
# Fail loudly on an error response instead of parsing an error page, which
# would otherwise yield empty or misleading results further down.
if not 200 <= r.status < 300:
    raise RuntimeError(
        "GET {} failed with HTTP status {}".format(link, r.status)
    )

# Parse the complete HTML document.
soup = BeautifulSoup(r.data, 'lxml')

# Review bodies: the text of every <p> tag whose id starts with "ReviewText".
# find_all is the current name of the legacy findAll alias.
reviews = [
    tag.text
    for tag in soup.find_all("p", {"id": re.compile(r'^ReviewText')})
]

# Review titles: the first "review_title" div is not needed, so skip it.
titles = soup.find_all("div", {"class": "review_title"})[1:]
# Take the text of the link inside each title div and drop its first and
# last character (presumably surrounding quotes/whitespace -- verify
# against the live page markup).
titles = [tag.a.text[1:-1] for tag in titles]

# Author profile URLs: the href of the link inside each "author_name" div,
# prefixed with the site's base URL.
authors = [
    "https://www.sitejabber.com" + tag.a['href']
    for tag in soup.find_all("div", {"class": 'author_name'})
]

# The three lists are combined column-wise, so they must line up one-to-one.
# Checking here gives a clear message when the page layout has changed,
# rather than a generic error from the DataFrame constructor.
if not len(titles) == len(authors) == len(reviews):
    raise ValueError(
        "scraped columns differ in length: {} titles, {} authors, {} reviews"
        .format(len(titles), len(authors), len(reviews))
    )

# Build the DataFrame, one row per review.
df = pd.DataFrame({'title': titles, 'author': authors, 'review': reviews})
# Preview of the first rows; only displayed when run as the last expression
# of a notebook cell or in an interactive session.
df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment