Last active
October 18, 2018 07:20
-
-
Save debboutr/3073a23b734fa05374dd9357f13acfc1 to your computer and use it in GitHub Desktop.
Scraper script built to collect product-page URLs from the Home Depot sitemap.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import random | |
import requests | |
import numpy as np | |
from ssl import SSLError | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
from requests import ConnectionError | |
# Browser-like User-Agent: homedepot.com rejects requests without one.
# BUG FIX: `headers` was referenced below but only defined in commented-out
# code at the bottom of the file, so the script died with a NameError.
headers = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/69.0.3497.100 Safari/537.36')}

# Fetch the top-level sitemap index; each <loc> entry is the URL of a
# sub-sitemap listing product-list ("group"/category) pages.
url = 'https://www.homedepot.com/sitemap/d/plp_sitemap.xml'
response = requests.get(url, headers=headers)
# Explicit parser avoids bs4's GuessedAtParserWarning and makes the result
# independent of which optional parsers happen to be installed.
soup = BeautifulSoup(response.text, 'html.parser')
out = [group.text for group in soup.find_all('loc')]

# Follow every sub-sitemap and collect the group-page URLs it lists.
comp = []
for url in out:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    comp += [link.text for link in soup.find_all('loc')]
group_urls = list(set(comp))  # de-duplicate across sub-sitemaps
# Crawl every group (category) page, following its "Next" pagination link,
# and harvest the product-pod anchor hrefs into `product_urls`.
# NOTE(review): relies on `headers` being defined above — in the original
# file its definition was commented out, which raised a NameError.
product_urls = []
skips = []   # URLs abandoned after a UnicodeDecodeError
count = 0    # progress counter: number of group pages started
for url in group_urls:
    count += 1
    print(count)
    current = url  # keeps the original for indexing where we're at if error
    while url:
        # After the first page, `url` is the <a title="Next"> Tag found at
        # the bottom of the loop; turn it back into an absolute URL string.
        if isinstance(url, Tag):  # idiom fix: was `type(url) == Tag`
            print('~next!')
            url = f"https://www.homedepot.com{url['href']}"
        # Pagination hrefs are site-relative; make them absolute.
        if url[:4] != 'http':
            url = f"https://www.homedepot.com{url}"
        try:
            response = requests.get(url, headers=headers)
        except UnicodeDecodeError:
            # Un-fetchable URL: remember it and move on to the next group.
            skips.append(url)
            break
        except ConnectionError:
            # Likely rate-limited or flaky network: back off ten minutes,
            # then retry once (a second failure propagates).
            print('sleeping...ConnectionError')
            time.sleep(600)
            response = requests.get(url, headers=headers)
        except SSLError:
            print('sleeping...SSLError')
            time.sleep(600)
            response = requests.get(url, headers=headers)
        # Explicit parser: the bare BeautifulSoup(data) call guessed a parser
        # and emitted GuessedAtParserWarning.
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find("div", {"class": "mainContent"})
        if not content:
            break  # not a product-list page (or blocked) — next group
        loads = content.find_all("a", {'data-pod-type': 'pr'})
        product_urls += [load['href'] for load in loads]  # was O(n^2) `a = a + b`
        # A Tag when another page exists, None when done (ends the while loop).
        url = content.find('a', {'class': 'hd-pagination__link', 'title': 'Next'})
print('☻' * 47)
# Persist the de-duplicated result so a crash doesn't lose the crawl.
np.save('check_urls.npy', list(set(product_urls)))
# Some URLs came in as collection pages (~1200) rather than /p/ product
# pages; separate those out for inspection.
# BUG FIX: `total` was never defined anywhere in the file (NameError);
# it is the de-duplicated product-URL list saved just above.
total = list(set(product_urls))
collect = [t for t in total if t[:3] != '/p/']  # non-product (collection) URLs

# Write a .bat file that opens a random sample of 10 links as Chrome tabs
# for manual spot-checking of the scraped URLs.
rand = random.sample(total, 10)
with open('open_home_depot_links_1.bat', 'w') as bat_file:
    for x in rand:
        bat_file.write(f"start chrome.exe https://www.homedepot.com{x}\n")
        print(f"`https://www.homedepot.com{x}`")
        print('*' * 50)
# ============================================================================= | |
# below gets all of the grouped urls from the sitemap w/o the header groups | |
# ============================================================================= | |
#url = 'https://www.homedepot.com/c/site_map' | |
#url = ('https://www.homedepot.com/p/Rust-Oleum-Specialty-29-oz-Countertop-' | |
# 'Coating-Tint-Base-246068/202820906') | |
# | |
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'} | |
# | |
#response = requests.get(url, headers=headers) | |
#response.status_code | |
#data = response.text | |
#soup = BeautifulSoup(data) | |
#soup.findAll('<a>') | |
#soup.find_all('a') | |
#mydivs = soup.findAll("div", {"class": "content"}) | |
#a = mydivs[0] | |
# | |
#col = a.find_all('a') | |
#len(a.find_all('a')) | |
#link = col[0] | |
## find if wrapped w/ <b> tag?? | |
#b = link.findParent() | |
#b.get | |
# | |
#count=0 | |
#grouped_urls = [] | |
#for anchor in mydivs: | |
# for link in anchor.find_all('a'): | |
# grouped = link.findParent().name != 'b' | |
# b_val = 'https://www.homedepot.com/b' in link['href'] | |
# if grouped and b_val: | |
# print(link['href']) | |
# count += 1 | |
# grouped_urls.append(link['href']) | |
#https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class | |
#soup.findAll('div', | |
# {'class': lambda x: x | |
# and 'stylelistrow' in x.split() | |
# } | |
# ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment