santhalakshminarayana · January 6, 2020 05:36
diff --git a/web_scrapping_quotes_2.py b/web_scrapping_quotes_2.py
 def get_quotes(url):
    """Collect the cleaned quote texts from every page under ``url``.

    Pages are requested as ``url + 'page-N/'`` for N = 1, 2, 3, ...  On each
    page, the ``<p>`` elements matched by ``'div p'`` inside the ``div`` with
    class ``quote_list`` are enumerated and every third one (positions
    1, 4, 7, ...) is kept as a quote.  Non-breaking spaces are removed and
    the text is rebuilt from its word tokens joined by single spaces.

    Paging stops when:
      * the page has no ``quote_list`` container,
      * the page yields no quotes, or
      * the last quote of the page equals the last quote already collected
        (presumably the site serves the final page again for page numbers
        past the end -- confirm against the site).

    Args:
        url: Base URL of a quote category; ``'page-N/'`` is appended to it.

    Returns:
        A list of cleaned quote strings in page order.
    """
    # Optional "'letters" suffix keeps contractions ("don't") intact while
    # still matching one-letter words ("a", "I"); a pattern requiring two
    # \w+ runs would silently drop those words.
    word_pattern = re.compile(r"\w+(?:'\w+)?[,.]?")
    quotes = []
    page = 1
    while True:
        quote_r = requests.get(url + 'page-' + str(page) + '/')
        page += 1
        quote_soup = BeautifulSoup(quote_r.content, 'html5lib')
        quote_list = quote_soup.find('div', attrs={'class': 'quote_list'})
        if quote_list is None:
            # find() returns None when the container is missing; nothing
            # more to scrape from this page or any later one.
            break
        curr_quotes = [
            ' '.join(word_pattern.findall(p.text.replace(u'\xa0', u'')))
            for j, p in enumerate(quote_list.select('div p'))
            if j % 3 == 1
        ]
        if not curr_quotes:
            # An empty page would otherwise raise IndexError below, or loop
            # forever while nothing has been collected yet.
            break
        if quotes and curr_quotes[-1] == quotes[-1]:
            # Same trailing quote as before: this page repeats the last one.
            break
        quotes.extend(curr_quotes)
    return quotes
  
 # Scrape every category listed in quote_hrefs and write one quote per line
 # to quotes.txt in the current working directory.  btime/atime record the
 # wall-clock time just before and just after the scrape.
 curr_path = os.getcwd()
 btime = time.time()
 # Explicit UTF-8 so non-ASCII characters in quotes do not depend on the
 # platform's default encoding.
 with open(curr_path+'/quotes.txt', 'w', encoding='utf-8') as f:
    for href in quote_hrefs:
        quotes = get_quotes(home_url + href)
        for quote in quotes:
            if not quote:
                # Cleaning can leave an empty string; indexing quote[-1]
                # on it would raise IndexError, so skip it.
                continue
            # Make sure every written quote ends with a period.
            if not quote.endswith('.'):
                quote += '.'
            f.write(quote + '\n')
 atime = time.time()
	def get_quotes(url):
	    """Collect the cleaned quote texts from every page under ``url``.

	    Pages are requested as ``url + 'page-N/'`` for N = 1, 2, 3, ...  On
	    each page, the ``<p>`` elements matched by ``'div p'`` inside the
	    ``div`` with class ``quote_list`` are enumerated and every third one
	    (positions 1, 4, 7, ...) is kept as a quote.  Non-breaking spaces are
	    removed and the text is rebuilt from its word tokens joined by single
	    spaces.

	    Paging stops when:
	      * the page has no ``quote_list`` container,
	      * the page yields no quotes, or
	      * the last quote of the page equals the last quote already
	        collected (presumably the site serves the final page again for
	        page numbers past the end -- confirm against the site).

	    Args:
	        url: Base URL of a quote category; ``'page-N/'`` is appended.

	    Returns:
	        A list of cleaned quote strings in page order.
	    """
	    # Optional "'letters" suffix keeps contractions ("don't") intact while
	    # still matching one-letter words ("a", "I"); a pattern requiring two
	    # \w+ runs would silently drop those words.
	    word_pattern = re.compile(r"\w+(?:'\w+)?[,.]?")
	    quotes = []
	    page = 1
	    while True:
	        quote_r = requests.get(url + 'page-' + str(page) + '/')
	        page += 1
	        quote_soup = BeautifulSoup(quote_r.content, 'html5lib')
	        quote_list = quote_soup.find('div', attrs={'class': 'quote_list'})
	        if quote_list is None:
	            # find() returns None when the container is missing; nothing
	            # more to scrape from this page or any later one.
	            break
	        curr_quotes = [
	            ' '.join(word_pattern.findall(p.text.replace(u'\xa0', u'')))
	            for j, p in enumerate(quote_list.select('div p'))
	            if j % 3 == 1
	        ]
	        if not curr_quotes:
	            # An empty page would otherwise raise IndexError below, or
	            # loop forever while nothing has been collected yet.
	            break
	        if quotes and curr_quotes[-1] == quotes[-1]:
	            # Same trailing quote as before: this page repeats the last.
	            break
	        quotes.extend(curr_quotes)
	    return quotes

	# Scrape every category listed in quote_hrefs and write one quote per line
	# to quotes.txt in the current working directory.  btime/atime record the
	# wall-clock time just before and just after the scrape.
	curr_path = os.getcwd()
	btime = time.time()
	# Explicit UTF-8 so non-ASCII characters in quotes do not depend on the
	# platform's default encoding.
	with open(curr_path+'/quotes.txt', 'w', encoding='utf-8') as f:
	    for href in quote_hrefs:
	        quotes = get_quotes(home_url + href)
	        for quote in quotes:
	            if not quote:
	                # Cleaning can leave an empty string; indexing quote[-1]
	                # on it would raise IndexError, so skip it.
	                continue
	            # Make sure every written quote ends with a period.
	            if not quote.endswith('.'):
	                quote += '.'
	            f.write(quote + '\n')
	atime = time.time()