wtznc · July 13, 2019 18:52 · wtznc · Jul 13, 2019
diff --git a/hinton.py b/hinton.py
 import requests
 import urllib.request
 import time
 from bs4 import BeautifulSoup

 # Download the page at URL, collect one [year, authors, title, url] record per
 # row of its first <table>, then normalise the authors and title fields so
 # they contain no whitespace, commas or periods.
 URL = "http://www.cs.toronto.edu/~hinton/papers.html"
 response = requests.get(URL)
 if response.status_code != 200:
     # Nothing to parse without the page; stop with a clear message instead of
     # failing later with a NameError on `soup`.
     raise SystemExit(
         "Could not download {} (HTTP status {})".format(URL, response.status_code)
     )
 print("Correctly downloaded website\nExtracting source code...")
 src = response.text
 # NOTE(review): no parser is named here, so bs4 chooses one itself. Pinning
 # one (e.g. "html.parser") would make runs reproducible, but may change the
 # tree shape that the positional lookups below rely on -- confirm first.
 soup = BeautifulSoup(src)
 print("Finished!")
 print("Extracting papers...\n")
 papers = []
 raw_papers = soup.find('table').find_all('tr')

 for row in raw_papers:
     # The first cell's text is used as the year.
     year = row.td.text
     # The title is the bold element of the second cell, when there is one.
     title_tag = row.select('td')[1].b
     if title_tag is not None:
         # Collapse runs of whitespace (including newlines) to single spaces.
         title = " ".join(title_tag.text.split())
     else:
         title = "title_missing"

     # Authors are read from the first child of the row's third child node.
     authors = row.contents[2].contents[0]
     authors = " ".join(authors.split())
     # Use the first link in the row that carries an href, if any.
     link = row.find('a', href=True)
     paper_url = link.attrs['href'] if link is not None else "missing"

     print("Year: " + str(year) + "; Authors: " + str(authors) + "; " + "Title: " + title + "; URL = " + str(paper_url) + "\n")
     papers.append([str(year), str(authors), str(title), str(paper_url)])
 print("Finished preprocessing articles!")

 # One translation table replaces the chained .replace() calls: spaces become
 # underscores; newlines, carriage returns, commas and periods are dropped.
 cleanup_table = str.maketrans(" ", "_", "\n\r,.")

 print("Removing whitespaces from the name field...")
 for paper in papers:
     paper[1] = paper[1].translate(cleanup_table)
 print("Done!")
 print("Now let's remove some whitespaces from the titles")
 for paper in papers:
     paper[2] = paper[2].translate(cleanup_table)
 print("We're all set!")
 if papers:
     print('\nAn example of a paper from our list:\n', papers[0])
 else:
     # Avoid an IndexError when the table yielded no rows.
     print("\nNo papers were found in the table.")
	import requests
	import urllib.request
	import time
	from bs4 import BeautifulSoup

	# Download the page at URL, collect one [year, authors, title, url] record per
	# row of its first <table>, then normalise the authors and title fields so
	# they contain no whitespace, commas or periods.
	# This copy had lost all nesting; the block structure below is restored from
	# the statement syntax (if/else and for bodies).
	URL = "http://www.cs.toronto.edu/~hinton/papers.html"
	response = requests.get(URL)
	if response.status_code != 200:
		# Nothing to parse without the page; stop with a clear message instead of
		# failing later with a NameError on `soup`.
		raise SystemExit(
			"Could not download {} (HTTP status {})".format(URL, response.status_code)
		)
	print("Correctly downloaded website\nExtracting source code...")
	src = response.text
	# NOTE(review): no parser is named here, so bs4 chooses one itself. Pinning
	# one (e.g. "html.parser") would make runs reproducible, but may change the
	# tree shape that the positional lookups below rely on -- confirm first.
	soup = BeautifulSoup(src)
	print("Finished!")
	print("Extracting papers...\n")
	papers = []
	raw_papers = soup.find('table').find_all('tr')

	for row in raw_papers:
		# The first cell's text is used as the year.
		year = row.td.text
		# The title is the bold element of the second cell, when there is one.
		title_tag = row.select('td')[1].b
		if title_tag is not None:
			# Collapse runs of whitespace (including newlines) to single spaces.
			title = " ".join(title_tag.text.split())
		else:
			title = "title_missing"

		# Authors are read from the first child of the row's third child node.
		authors = row.contents[2].contents[0]
		authors = " ".join(authors.split())
		# Use the first link in the row that carries an href, if any.
		link = row.find('a', href=True)
		paper_url = link.attrs['href'] if link is not None else "missing"

		print("Year: " + str(year) + "; Authors: " + str(authors) + "; " + "Title: " + title + "; URL = " + str(paper_url) + "\n")
		papers.append([str(year), str(authors), str(title), str(paper_url)])
	print("Finished preprocessing articles!")

	# One translation table replaces the chained .replace() calls: spaces become
	# underscores; newlines, carriage returns, commas and periods are dropped.
	cleanup_table = str.maketrans(" ", "_", "\n\r,.")

	print("Removing whitespaces from the name field...")
	for paper in papers:
		paper[1] = paper[1].translate(cleanup_table)
	print("Done!")
	print("Now let's remove some whitespaces from the titles")
	for paper in papers:
		paper[2] = paper[2].translate(cleanup_table)
	print("We're all set!")
	if papers:
		print('\nAn example of a paper from our list:\n', papers[0])
	else:
		# Avoid an IndexError when the table yielded no rows.
		print("\nNo papers were found in the table.")