Last active
December 17, 2019 03:07
-
-
Save calthoff/04ec60021415dc69c796251c9768a0ca to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from urllib.request import Request, urlopen | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import csv | |
class Scraper:
    """Interactive scraper for Macy's product search results.

    Workflow: get_links() prompts for a category, collects product-page
    links from the search results, scrapes up to 6 products, and writes
    them to ``sd.csv`` (one ``name;price;description`` row per product).
    search_file() then prompts for terms and prints matching CSV rows.
    """

    def __init__(self):
        # Product-page URL paths collected from the search results page.
        self.links = []
        # Maps product name -> [price, description].
        self.products = {}

    def get_links(self):
        """Prompt for a category, scrape up to 6 products, write sd.csv.

        Side effects: sets self.site/self.links/self.products, performs
        HTTP requests, writes sd.csv via create_file(), prints products.
        """
        search_terms = input('Enter product category: ').split()
        self.site = (
            'https://www.macys.com/shop/search'
            '?cm_sp=navigation_mew--gn--search-n-n&keyword='
            + "%20".join(search_terms)
        )
        page = requests.get(self.site, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(page.text, "html.parser")
        for tag in soup.find_all('a', {'class': 'productDescLink'}):
            self.links.append(tag['href'])

        # Scrape at most the first 6 product pages.
        for count, link in enumerate(self.links, start=1):
            self.site = 'http://macys.com' + link
            page = requests.get(self.site, headers={'User-Agent': 'Mozilla/5.0'})
            soup = BeautifulSoup(page.text, "html.parser")
            # NOTE(review): soup.find(...) returns None if the markup
            # changes; .text would then raise AttributeError — confirm
            # these selectors still match Macy's product pages.
            price = soup.find('div', {'class': 'price'}).text.strip()
            prod_description = soup.find('p', {'itemprop': 'description'}).text.strip()
            prod_name = soup.find('h1').text.strip()
            self.products[prod_name] = [price, prod_description]
            if count == 6:
                break

        # BUG FIX: original called the module-level global `scrape` here;
        # use self so the method works on any Scraper instance.
        self.create_file()
        print(self.products)

    def create_file(self):
        """Write self.products to sd.csv as name;price;description rows."""
        with open("sd.csv", "w") as outfile:
            writer = csv.writer(outfile, delimiter=';')
            rows = [
                (name, price, description)
                for name, (price, description) in self.products.items()
            ]
            writer.writerows(rows)

    def search_file(self):
        """Prompt for search terms and print the sd.csv rows that match.

        BUG FIX: the original printed 'Not Found' for every non-matching
        (cell, term) pair — even for rows that also matched.  Now each
        matching row is printed once, and 'Not Found' is printed once
        only when no row matched at all.
        """
        search_terms = input('Search file (enter search terms): ').split(" ")
        found = False
        with open("sd.csv", "r") as infile:
            reader = csv.reader(infile, delimiter=';')
            for row in reader:
                if any(term in cell for cell in row for term in search_terms):
                    print(row, '\n')
                    found = True
        if not found:
            print('Not Found')
# Entry point: guard so importing this module does not trigger the
# interactive scrape (original ran network and input() I/O on import).
# The name `scrape` is kept at module level for backward compatibility.
if __name__ == "__main__":
    scrape = Scraper()
    scrape.get_links()
    scrape.search_file()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment