Skip to content

Instantly share code, notes, and snippets.

@dwillis
Created April 20, 2017 02:18
Show Gist options
  • Save dwillis/9478a6c8cd7f01956caff4483a6dbb8b to your computer and use it in GitHub Desktop.
Save dwillis/9478a6c8cd7f01956caff4483a6dbb8b to your computer and use it in GitHub Desktop.
import csv
import requests
from BeautifulSoup import BeautifulSoup
# Result pages on aaspeechesdb.oscars.org to scrape. Add more URLs here to
# include more speeches in the word list.
urls = [
    "http://aaspeechesdb.oscars.org/results.aspx?AC=PREV_RECORD&XC=/results.aspx&BU=http%3A%2F%2Faaspeechesdb.oscars.org%2Findex.aspx&TN=aatrans&SN=AUTO9337&SE=1168&RN=1&MR=0&TR=0&TX=1000&ES=0&CS=0&XP=&RF=WebReportList&EF=&DF=WebReportOscars&RL=0&EL=0&DL=0&NP=255&ID=&MF=oscarsmsg.ini&MQ=&TI=0&DT=&ST=0&IR=0&NR=0&NB=0&SV=0&SS=0&BG=&FG=&QS=&OEX=ISO-8859-1&OEH=utf-8",
    "http://aaspeechesdb.oscars.org/results.aspx?AC=NEXT_RECORD&XC=/results.aspx&BU=http%3A%2F%2Faaspeechesdb.oscars.org%2Findex.aspx&TN=aatrans&SN=AUTO9337&SE=1168&RN=0&MR=0&TR=0&TX=1000&ES=0&CS=0&XP=&RF=WebReportList&EF=&DF=WebReportOscars&RL=0&EL=0&DL=0&NP=255&ID=&MF=oscarsmsg.ini&MQ=&TI=0&DT=&ST=0&IR=0&NR=0&NB=0&SV=0&SS=0&BG=&FG=&QS=&OEX=ISO-8859-1&OEH=utf-8",
]

# Seconds to wait on the server before giving up, so that a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30


def extract_speech_words(speech_text):
    """Return the words of a speech, without the leading speaker label.

    The text is cut at the FIRST colon only, so colons that appear inside
    the speech itself do not truncate it. Text that contains no colon is
    treated as having no speaker label and is kept whole.

    speech_text -- transcript text, e.g. "SPEAKER: Thank you all."
    Returns a list of the whitespace-separated words after the label.
    """
    _speaker, colon, spoken = speech_text.partition(':')
    if not colon:
        # No label to drop: partition() put everything in the first slot.
        spoken = speech_text
    return spoken.split()


# One entry per URL; each entry is the list of words in that page's speech.
word_list = []
for url in urls:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content)
    speech = soup.find("p", {"class": "MInormal"})
    if speech is None:
        # find() returns None when nothing matches; name the page that
        # failed rather than crashing on `None.text` below.
        raise ValueError('no <p class="MInormal"> element found at %s' % url)
    word_list.append(extract_speech_words(speech.text))

# when done appending all the urls you want, "flatten" word_list so that it
# is a single Python list of words, not a list of lists.
word_list_flattened = [word for speech_words in word_list for word in speech_words]

# next, count the words in word_list_flattened. try the solution here using
# collections: http://stackoverflow.com/questions/20510768/python-count-frequency-of-words-in-a-list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment