Script that downloads votes and tags from AWWWARDS
# -*- coding: utf-8 -*-
"""
AWWWARD Web Scraper
Created on Thu Nov 9 14:32:36 2017
@author: Giulio Gabrieli
"""

###############################################################################
#                                                                             #
#                                  Libraries                                  #
#                                                                             #
###############################################################################

import os #used for reading files and directories
import requests #used to get web pages
from bs4 import BeautifulSoup #used to do scraping on web pages
import re #for regular expressions
import numpy as np
import pickle
###############################################################################
#                                                                             #
#                                 PARAMETERS                                  #
#                                                                             #
###############################################################################
"""
Here you can change the parameters used in this script.
"""

""" PATHS """
basepath = os.path.dirname(os.path.realpath(__file__)) #this gets the base path of the script
winnersURL = "https://www.awwwards.com/websites/sites_of_the_day/?page=" #base URL of the winners' pages
nominationURL = "https://www.awwwards.com/websites/nominees/?page=" #base URL of the nominations' pages
siteURL = "https://www.awwwards.com/sites/" #base URL of the websites

""" Here you can set the number of pages to scrape for each category """
numberOfPages = 30 #number of pages to scrape for each category
###############################################################################
#                                                                             #
#                                     MAIN                                    #
#                                                                             #
###############################################################################
"""
This section handles the MAIN process.
"""
if __name__ == "__main__":
    """ NOMINATIONS """
    """ First we create a list of all the nominated websites """
    nominationsWebsites = []

    for numberOfPage in range(1, numberOfPages + 1):
        print("Page number: ", numberOfPage) #print the current page number
        page = requests.get(nominationURL + str(numberOfPage)) #get a page
        soup = BeautifulSoup(page.text, 'html.parser') #parse the HTML of the page

        """ Here we look for the link to each nomination page """
        for ultag in soup.find_all('ul', {'class': 'list-items list-flex'}): #look for the unordered list containing the nominations
            for litag in ultag.find_all('li'): #look at each element of the list
                links = litag.find_all("a") #get the links
                thisLink = links[0].get("href") #get the URL of the link
                if "/sites/" in thisLink:
                    thisLink = thisLink.replace("/sites/", "")
                    nominationsWebsites.append(thisLink)

    nominationsWebsites = set(nominationsWebsites) #eliminate duplicates
    nominationDatabase = []

    """ Now we parse each website and get basic information plus information about the votes """
    for website in nominationsWebsites:
        print(website)
        try:
            pageURL = siteURL + website
            page = requests.get(pageURL) #get a page
            soup = BeautifulSoup(page.text, 'html.parser') #parse the HTML of the page

            websiteData = {} #initialize a dictionary
            websiteData["Name"] = soup.find_all('h1', {'class': 'heading-medium'})[0].get_text() #get the name of the website
            authorParagraph = soup.find_all('p', {'class': ''})[0].get_text() #get the author of the website
            """ we need to clean it up """
            pattern = r'.*?\n(.*).\n.*'
            match = re.search(pattern, authorParagraph)
            websiteData["Author"] = match.group(1)
            websiteData["Link"] = soup.find_all('a', {'class': 'item-link'})[0].get("href") #get the URL of the website
            websiteData["Tag"] = [li.get_text() for li in soup.find_all("div", {'class': 'list-tags'})[0].find_all('li') if li.get_text() != "\nsites\n"] #get the tags associated with the website

            """ Then we get the users' votes """
            design = []
            usability = []
            creativity = []
            content = []
            #TODO: currently this takes up to 20 voters; needs a fix to load all the votes
            votes = soup.find_all('ul', {'class': 'list-circle-notes js-circle-notes'}) #get the list of voters
            for vote in votes: #for each vote we take the four parameters and store them in lists
                design.append(vote.find_all('li', {'class': 'design'})[0].get('data-note'))
                usability.append(vote.find_all('li', {'class': 'usability'})[0].get('data-note'))
                creativity.append(vote.find_all('li', {'class': 'creativity'})[0].get('data-note'))
                content.append(vote.find_all('li', {'class': 'content'})[0].get('data-note'))

            """ For each parameter, we save the mean, the standard deviation and the list of votes """
            websiteData["Votes"] = {}
            websiteData["Votes"]["Design"] = {"mean": np.mean([float(x) for x in design]), "std": np.std([float(x) for x in design]), "votes": design}
            websiteData["Votes"]["Usability"] = {"mean": np.mean([float(x) for x in usability]), "std": np.std([float(x) for x in usability]), "votes": usability}
            websiteData["Votes"]["Creativity"] = {"mean": np.mean([float(x) for x in creativity]), "std": np.std([float(x) for x in creativity]), "votes": creativity}
            websiteData["Votes"]["Content"] = {"mean": np.mean([float(x) for x in content]), "std": np.std([float(x) for x in content]), "votes": content}

            nominationDatabase.append(websiteData)
        except Exception:
            print("Skipped:", website)

    with open(basepath + '/nominationWebsites.pkl', 'wb') as f:
        pickle.dump(nominationDatabase, f)
""" WINNERS """ | |
""" First we create a List of all the nominations websites """ | |
winnersWebsites = [] | |
for numberOfPage in range(1,numberOfPages+1): | |
print("Page number: ",numberOfPage) #print the current page number | |
page = requests.get(winnersURL + str(numberOfPage)) #get a page | |
soup = BeautifulSoup(page.text, 'html.parser') #get the HTML of the page | |
""" Here we look for the link to each nomination page""" | |
for ultag in soup.find_all('ul', {'class': 'list-items list-flex'}): #look for the unordered list containing the nominations | |
for litag in ultag.find_all('li'): #look for each element of the list | |
links = litag.find_all("a") #get the links | |
thisLink = ((links[0].get("href"))) #get the url of the links | |
if("/sites/" in thisLink): | |
thisLink = thisLink.replace("/sites/","") | |
winnersWebsites.append(thisLink) | |
winnersWebsites = set(winnersWebsites) #eliminate duplicates | |
winnersDatabase = [] | |
""" now we parse each website and we get basic informations and informations on the votes """ | |
for website in winnersWebsites: | |
print(website) | |
try: | |
pageURL = siteURL + website | |
page = requests.get(pageURL) #get a page | |
soup = BeautifulSoup(page.text, 'html.parser') #get the HTML of the page | |
websiteData = {} #initialize a dictionary | |
websiteData["Name"] = soup.find_all('h1', {'class': 'heading-large'})[0].get_text() #get the name of the website | |
authorParagraph = soup.find_all('div', {'class': 'by'})[0].find_all('strong')[0].get_text() #get the author of the website | |
""" we need to clean it up """ | |
pattern = r'.*?\n(.*)\n.*' | |
match = re.search(pattern, authorParagraph) | |
websiteData["Author"] = match.group(1) | |
websiteData["Link"] = soup.find_all('a', {'class': 'item-link'})[0].get("href") #get the URL of the website | |
websiteData["Tag"] = [li.get_text() for li in soup.find_all("div",{'class': 'list-tags'})[0].find_all('li') if li.get_text() != "\nsites\n"] #get the tags associated to the website | |
""" Then we get the users votes """ | |
design = [] | |
usability = [] | |
creativity = [] | |
content = [] | |
#TODO: Currently it takes up to 20 voters. Need a fix to load all the votes | |
votes = soup.find_all('ul',{'class': 'list-circle-notes js-circle-notes'}) #get the list of Voters | |
for vote in votes: #for each vote we take the four paramaters and we store it inot a list | |
design.append(vote.find_all('li',{'class':'design'})[0].get('data-note')) | |
usability.append(vote.find_all('li',{'class':'usability'})[0].get('data-note')) | |
creativity.append(vote.find_all('li',{'class':'creativity'})[0].get('data-note')) | |
content.append(vote.find_all('li',{'class':'content'})[0].get('data-note')) | |
""" for each parameter, we save the mean, std and list """ | |
websiteData["Votes"] = {} | |
websiteData["Votes"]["Design"] = {"mean": np.mean([float(x) for x in design]), "std":np.std([float(x) for x in design]),"votes": design} | |
websiteData["Votes"]["Usability"] = {"mean": np.mean([float(x) for x in usability]), "std":np.std([float(x) for x in design]),"votes": usability} | |
websiteData["Votes"]["Creativity"] = {"mean": np.mean([float(x) for x in creativity]), "std":np.std([float(x) for x in design]),"votes": creativity} | |
websiteData["Votes"]["Content"] = {"mean": np.mean([float(x) for x in content]), "std":np.std([float(x) for x in design]),"votes": content} | |
winnersDatabase.append(websiteData) | |
except: | |
print("Skipped") | |
with open(basepath + '/winnersDatabase.pkl', 'wb') as f: | |
pickle.dump(winnersDatabase, f) |
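
A minimal sketch of how the two pickle files produced by the script could be read back afterwards; the file names and dictionary keys are taken from the script above, everything else (the loader itself) is illustrative:

# load_results.py -- illustrative sketch, not part of the original gist
# assumes it sits in the same folder as the pickles produced by the scraper above
import os
import pickle

basepath = os.path.dirname(os.path.realpath(__file__))

with open(basepath + '/nominationWebsites.pkl', 'rb') as f:
    nominations = pickle.load(f)

with open(basepath + '/winnersDatabase.pkl', 'rb') as f:
    winners = pickle.load(f)

# each entry is a dict with "Name", "Author", "Link", "Tag" and "Votes"
for site in winners[:5]:
    print(site["Name"], site["Votes"]["Design"]["mean"])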