Last active
August 29, 2015 13:57
-
-
Save brianchesley/9598044 to your computer and use it in GitHub Desktop.
Carleton News Update
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import urllib2 | |
import urllib | |
import re | |
def get_links(url="http://apps.carleton.edu/news/news/"):
    """Fetch the news index page and return its article links.

    Parameters:
        url: the news index to scrape (defaults to the Carleton news page).

    Returns:
        A list of single-element lists, each ``[absolute_article_url]``.
        Each entry is its own list so that a title and an index can be
        appended to it later by append_titles().
    """
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)

    news_hrefs = []
    seen = set()  # set membership is O(1); replaces the O(n^2) list scan
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Article links on this page are relative query strings of the
        # form "?s...".  startswith("?s") is equivalent to the original
        # check (find("?") == 0 and find("s") == 1) and the "href and"
        # guard replaces the old bare except that hid None hrefs.
        if href and href.startswith("?s") and href not in seen:
            seen.add(href)
            news_hrefs.append(href)

    return [[url + href] for href in news_hrefs]
def get_titles(news_list):
    """Download each article page and collect its <title> text.

    Parameters:
        news_list: a list of lists of article URLs, as built by get_links().

    Returns:
        A flat list of page title strings, in the same order.
    """
    collected = []
    for entry in news_list:
        for article_url in entry:
            page = urllib2.urlopen(article_url).read()
            collected.append(BeautifulSoup(page).title.string)
    return collected
def append_titles(news_list, titles):
    """Attach each article's title and position to its news_list entry.

    After this call, news_list[i] is ``[url, titles[i], i]``.

    Parameters:
        news_list: list of single-element ``[url]`` lists from get_links().
        titles: list of title strings, parallel to news_list.

    Returns:
        The same news_list object, mutated in place.
    """
    # The original wrapped this in a pointless "for link in range(1)"
    # inner loop; a single pass over the indices is equivalent.
    for idx in range(len(titles)):
        news_list[idx].append(titles[idx])
        news_list[idx].append(idx)
    return news_list
def get_text(url):
    """Return the body text of one article page.

    Parameters:
        url: absolute URL of the article.

    Returns:
        The text content of the page's <div class="text"> element.
    """
    page = urllib2.urlopen(url).read()
    return BeautifulSoup(page).find("div", class_="text").text
def run(news_list): | |
while True: | |
print "here's the latest from Carleton News: " | |
for links in range(len(news_list)): | |
print "article number " + str(news_list[links][2] + 1) | |
print news_list[links][1] | |
article_num = input("which article would you like to read? ") | |
print news_list[article_num-1][1] | |
print get_text(news_list[article_num-1][0]) | |
more_articles = input("would you like to read more articles? Press 1 if yes ") | |
if more_articles != 1: | |
break | |
run(append_titles(get_links(),get_titles(get_links()))) |
My comments would mostly be to break this up into more functions, in order to give names to the things you're doing at each step. I'd make url a parameter instead of hardcoding it. In lines 30/31, a "set" would work better.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I misunderstood you, looks like you totally can - see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#a-regular-expression