-
-
Save shaungt1/9dba5ebd14cf016779bc545dd7c3ddf2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _safe_scrape(extract, default='NA'):
    """Return ``extract()``, or *default* if it raises any ``Exception``.

    Each scraped field is optional: a missing element, a missing attribute
    or a regex that finds nothing must not abort the whole recipe.  Only
    ``Exception`` is caught, so Ctrl-C / ``SystemExit`` still propagate.
    """
    try:
        return extract()
    except Exception:
        return default


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text* as a ``str``.

    Raises ``IndexError`` when nothing matches and ``TypeError`` when *text*
    is ``None``; callers run this under ``_safe_scrape``.
    """
    return str(re.findall(pattern, text)[0])


def _ascii_only(text):
    """Return *text* with every non-ASCII character dropped.

    The encode/decode round trip yields a text string on both Python 2 and
    Python 3, instead of a ``b'...'`` repr or a raw bytes value.
    """
    return text.encode('ascii', 'ignore').decode('ascii')


def _duration(br, itemprop):
    """Return what follows ``PT`` in the ``datetime`` attribute of a <time> tag.

    *itemprop* selects the tag, e.g. ``"prepTime"``.
    """
    xpath = '//time[@itemprop = "%s"]' % itemprop
    stamp = br.find_element_by_xpath(xpath).get_attribute('datetime')
    return _first_match(r'PT(\w+)', stamp)


def scrape_recipe(br, year, idnumber):
    """Scrape the recipe page loaded in *br* and store it in MongoDB.

    Args:
        br: browser/driver object positioned on a recipe page; must provide
            the ``find_element_by_*`` / ``find_elements_by_class_name``
            lookups used below.
        year: value stored under ``'year'`` in every inserted document.
        idnumber: value stored under ``'idnumber'`` in every inserted
            document.

    Side effects:
        Inserts one document per ingredient into the module-level
        ``collection2`` and one recipe document into the module-level
        ``collection``.

    Every scalar field falls back to the string ``'NA'`` when it cannot be
    read.  The ingredient lookup and the inserts are not guarded, so errors
    raised there propagate to the caller.
    """
    # Recipe title
    rtitle = _safe_scrape(lambda: br.find_element_by_tag_name('h1').text)

    # Star rating
    starrating = _safe_scrape(
        lambda: br.find_element_by_class_name('rating-stars')
        .get_attribute('data-ratingstars'))

    # Number of people who clicked that they "made it"
    madeitcount = _safe_scrape(
        lambda: br.find_element_by_class_name('made-it-count').text)

    # Number of reviews
    reviewcount = _safe_scrape(lambda: _first_match(
        r'(\w+) reviews',
        br.find_element_by_class_name('review-count').text))

    # Calories per serving
    calcount = _safe_scrape(lambda: _first_match(
        r'(\w+) cals',
        br.find_element_by_class_name('calorie-count').text))

    # Prep / cook / total time
    prepTime = _safe_scrape(lambda: _duration(br, 'prepTime'))
    cookTime = _safe_scrape(lambda: _duration(br, 'cookTime'))
    totalTime = _safe_scrape(lambda: _duration(br, 'totalTime'))

    # Find all the ingredient elements.  The last match is skipped, exactly
    # as before; presumably it is a trailing non-ingredient entry on the
    # page -- TODO confirm against the site markup.
    ingred = br.find_elements_by_class_name("checkList__item")
    # Read every text first so that a failure while reading leaves the
    # ingredient collection untouched.
    ingredients = [_ascii_only(item.text) for item in ingred[:-1]]

    # Update MongoDB with one entry per ingredient.
    # NOTE(review): ``insert`` is the legacy PyMongo call; switch to
    # ``insert_one`` once the installed PyMongo version is confirmed.
    for ingr in ingredients:
        collection2.insert(
            {'idnumber': idnumber, 'year': year, 'ingredient': ingr})

    # Update MongoDB with the recipe entry.
    collection.insert({
        'idnumber': idnumber,
        'year': year,
        'recipe_title': _ascii_only(rtitle),
        'star_rating': starrating,
        'made_it_count': madeitcount,
        'review_count': reviewcount,
        'cal_count': calcount,
        'prep_time': prepTime,
        'cook_time': cookTime,
        'total_time': totalTime,
    })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment