Last active
October 18, 2018 07:20
-
-
Save debboutr/3073a23b734fa05374dd9357f13acfc1 to your computer and use it in GitHub Desktop.
Scraper script built to collect product-page URLs from the Home Depot sitemap.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import random | |
import requests | |
import numpy as np | |
from ssl import SSLError | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
from requests import ConnectionError | |
# Browser-like User-Agent: homedepot.com rejects requests without one.
# BUG FIX: `headers` was referenced below but only defined in commented-out
# code at the bottom of the file, so the script died with a NameError.
headers = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/69.0.3497.100 Safari/537.36')}

# Fetch the top-level sitemap index; each <loc> entry is the URL of a
# sub-sitemap listing product-list ("group"/category) pages.
url = 'https://www.homedepot.com/sitemap/d/plp_sitemap.xml'
response = requests.get(url, headers=headers)
# Explicit parser avoids bs4's GuessedAtParserWarning and makes the result
# independent of which optional parsers happen to be installed.
soup = BeautifulSoup(response.text, 'html.parser')
out = [group.text for group in soup.find_all('loc')]

# Follow every sub-sitemap and collect the group-page URLs it lists.
comp = []
for url in out:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    comp += [link.text for link in soup.find_all('loc')]
group_urls = list(set(comp))  # de-duplicate across sub-sitemaps
# Crawl every group (category) page, following its "Next" pagination link,
# and harvest the product-pod anchor hrefs into `product_urls`.
# NOTE(review): relies on `headers` being defined above — in the original
# file its definition was commented out, which raised a NameError.
product_urls = []
skips = []   # URLs abandoned after a UnicodeDecodeError
count = 0    # progress counter: number of group pages started
for url in group_urls:
    count += 1
    print(count)
    current = url  # keeps the original for indexing where we're at if error
    while url:
        # After the first page, `url` is the <a title="Next"> Tag found at
        # the bottom of the loop; turn it back into an absolute URL string.
        if isinstance(url, Tag):  # idiom fix: was `type(url) == Tag`
            print('~next!')
            url = f"https://www.homedepot.com{url['href']}"
        # Pagination hrefs are site-relative; make them absolute.
        if url[:4] != 'http':
            url = f"https://www.homedepot.com{url}"
        try:
            response = requests.get(url, headers=headers)
        except UnicodeDecodeError:
            # Un-fetchable URL: remember it and move on to the next group.
            skips.append(url)
            break
        except ConnectionError:
            # Likely rate-limited or flaky network: back off ten minutes,
            # then retry once (a second failure propagates).
            print('sleeping...ConnectionError')
            time.sleep(600)
            response = requests.get(url, headers=headers)
        except SSLError:
            print('sleeping...SSLError')
            time.sleep(600)
            response = requests.get(url, headers=headers)
        # Explicit parser: the bare BeautifulSoup(data) call guessed a parser
        # and emitted GuessedAtParserWarning.
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find("div", {"class": "mainContent"})
        if not content:
            break  # not a product-list page (or blocked) — next group
        loads = content.find_all("a", {'data-pod-type': 'pr'})
        product_urls += [load['href'] for load in loads]  # was O(n^2) `a = a + b`
        # A Tag when another page exists, None when done (ends the while loop).
        url = content.find('a', {'class': 'hd-pagination__link', 'title': 'Next'})
print('☻' * 47)
# Persist the de-duplicated result so a crash doesn't lose the crawl.
np.save('check_urls.npy', list(set(product_urls)))
# Some URLs came in as collection pages (~1200) rather than /p/ product
# pages; separate those out for inspection.
# BUG FIX: `total` was never defined anywhere in the file (NameError);
# it is the de-duplicated product-URL list saved just above.
total = list(set(product_urls))
collect = [t for t in total if t[:3] != '/p/']  # non-product (collection) URLs

# Write a .bat file that opens a random sample of 10 links as Chrome tabs
# for manual spot-checking of the scraped URLs.
rand = random.sample(total, 10)
with open('open_home_depot_links_1.bat', 'w') as bat_file:
    for x in rand:
        bat_file.write(f"start chrome.exe https://www.homedepot.com{x}\n")
        print(f"`https://www.homedepot.com{x}`")
        print('*' * 50)
# ============================================================================= | |
# below gets all of the grouped urls from the sitemap w/o the header groups | |
# ============================================================================= | |
#url = 'https://www.homedepot.com/c/site_map' | |
#url = ('https://www.homedepot.com/p/Rust-Oleum-Specialty-29-oz-Countertop-' | |
# 'Coating-Tint-Base-246068/202820906') | |
# | |
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'} | |
# | |
#response = requests.get(url, headers=headers) | |
#response.status_code | |
#data = response.text | |
#soup = BeautifulSoup(data) | |
#soup.findAll('<a>') | |
#soup.find_all('a') | |
#mydivs = soup.findAll("div", {"class": "content"}) | |
#a = mydivs[0] | |
# | |
#col = a.find_all('a') | |
#len(a.find_all('a')) | |
#link = col[0] | |
## find if wrapped w/ <b> tag?? | |
#b = link.findParent() | |
#b.get | |
# | |
#count=0 | |
#grouped_urls = [] | |
#for anchor in mydivs: | |
# for link in anchor.find_all('a'): | |
# grouped = link.findParent().name != 'b' | |
# b_val = 'https://www.homedepot.com/b' in link['href'] | |
# if grouped and b_val: | |
# print(link['href']) | |
# count += 1 | |
# grouped_urls.append(link['href']) | |
#https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class | |
#soup.findAll('div', | |
# {'class': lambda x: x | |
# and 'stylelistrow' in x.split() | |
# } | |
# ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment