Skip to content

Instantly share code, notes, and snippets.

@santhalakshminarayana
Created January 6, 2020 05:36
Show Gist options
  • Save santhalakshminarayana/3d5ff7a4f6ae570222fcbdd268416a9e to your computer and use it in GitHub Desktop.
Save santhalakshminarayana/3d5ff7a4f6ae570222fcbdd268416a9e to your computer and use it in GitHub Desktop.
Web Scraping 2 - Medium
def get_quotes(url):
    """Collect the cleaned quote texts from every page under ``url``.

    Pages are fetched as ``url + 'page-<n>/'`` for n = 1, 2, ... . On each
    page, the ``<p>`` elements matched by ``'div p'`` inside the
    ``div.quote_list`` container are scanned and those at index 1, 4, 7, ...
    are kept. Non-breaking spaces are removed and the text is reduced to
    space-joined word tokens (a word may contain one inner apostrophe and may
    carry a trailing comma or period).

    Pagination stops when any of the following holds:
      * the page's last quote equals the last quote already collected (the
        page is treated as a repeat and is not added);
      * the page has no ``div.quote_list`` container;
      * the page yields no quotes at all.

    Args:
        url: Base URL of a quote listing. The page suffix is appended
            directly, so it is expected to end with ``/``.

    Returns:
        List of cleaned quote strings, in page order.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (connection failure or timeout).
    """
    quotes = []
    page = 1
    while True:
        page_url = url + 'page-' + str(page) + '/'
        page += 1
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(page_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html5lib')
        quote_list = soup.find('div', attrs={'class': 'quote_list'})
        if quote_list is None:
            # find() returns None when the container is missing; there is
            # nothing to parse, so treat it as the end of the listing.
            break

        curr_quotes = []
        # Keep paragraphs at index 1, 4, 7, ... only.
        # NOTE(review): presumably the other paragraphs hold author/metadata
        # text -- verify against the page markup.
        for paragraph in quote_list.select('div p')[1::3]:
            text = paragraph.text.replace(u'\xa0', u'')
            # The apostrophe-suffix group is optional as a whole, so
            # one-letter words such as "I" and "a" are kept.
            tokens = re.findall(r"\w+(?:'\w+)?[,.]?", text)
            curr_quotes.append(' '.join(tokens))

        if not curr_quotes:
            # An empty page would otherwise loop forever (or fail on
            # curr_quotes[-1]); stop paginating instead.
            break
        # NOTE(review): assumes the site serves the last page again for
        # out-of-range page numbers -- confirm.
        if quotes and curr_quotes[-1] == quotes[-1]:
            break
        quotes.extend(curr_quotes)
    return quotes
# Scrape the quotes for each entry in quote_hrefs and write them, one per
# line, to quotes.txt in the current working directory. btime / atime hold
# the wall-clock timestamps taken just before and just after the run.
curr_path = os.getcwd()
btime = time.time()
# Explicit UTF-8: cleaned quotes may contain non-ASCII letters that the
# platform's default encoding cannot represent.
with open(os.path.join(curr_path, 'quotes.txt'), 'w', encoding='utf-8') as f:
    for href in quote_hrefs:
        quotes = get_quotes(home_url + href)
        for quote in quotes:
            if not quote:
                # Cleaning can leave an empty string; skip it rather than
                # fail on quote[-1] or write a line holding only a period.
                continue
            # Make every written quote end with a period.
            if not quote.endswith('.'):
                quote += '.'
            f.write(quote + '\n')
atime = time.time()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment