# Gist bstancil/ab2c22ce382879032e98, created January 4, 2016 19:46.
# (Page note: save the gist to your computer to use it in GitHub Desktop.)
# GitHub warning: this file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review it,
# open the file in an editor that reveals hidden Unicode characters.
import csv
import string
import sys

import nltk
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
# One shared Porter stemmer instance, reused for every token that is stemmed.
stemmer = nltk.PorterStemmer()
# Translation table for ``translate``: every ASCII punctuation code point maps
# to None, which tells ``translate`` to delete that character.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
def stem_tokens(tokens):
    """Return a list holding the stem of every token in ``tokens``.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))
def normalize(text):
    """Remove punctuation, lowercase, tokenize and stem ``text``.

    Args:
        text: The raw document string.

    Returns:
        The list of stemmed tokens produced by ``stem_tokens``.
    """
    # Lowercase first, then delete ASCII punctuation via the translation table.
    cleaned = text.lower().translate(remove_punctuation_map)
    return stem_tokens(nltk.word_tokenize(cleaned))
# TF-IDF vectorizer that tokenizes with ``normalize`` and drops scikit-learn's
# built-in English stop-word list.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    """Return the TF-IDF similarity score between ``text1`` and ``text2``.

    The shared ``vectorizer`` is refit on just these two documents on every
    call, so the score depends only on this pair of texts.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The [0, 1] entry of the document-by-document product matrix, i.e. the
        dot product of the two TF-IDF rows.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # ``.dot`` is always a matrix product, and ``.toarray()`` replaces the
    # ``.A`` shortcut that newer SciPy releases no longer provide.
    pairwise = tfidf.dot(tfidf.T)
    return pairwise.toarray()[0, 1]
def write_to_csv(csv_name, array):
    """Write a two-dimensional sequence to ``csv_name`` as CSV.

    Every row is written with as many cells as the first row has: extra cells
    in longer rows are dropped, and a shorter row raises ``IndexError``.

    Args:
        csv_name: Path of the file to create or overwrite.
        array: Sequence of indexable rows. An empty sequence produces an
            empty file instead of failing on ``array[0]``.

    Raises:
        IndexError: If a row has fewer cells than the first row.
    """
    rows = len(array)
    columns = len(array[0]) if rows else 0
    # The csv module expects a binary file on Python 2, but a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        test_file = open(csv_name, "w", newline="")
    else:
        test_file = open(csv_name, "wb")
    with test_file:
        file_writer = csv.writer(test_file)
        for row in array:
            file_writer.writerow([row[j] for j in range(columns)])
# Scrape the cast index page, then collect one [href, bio_text] entry for each
# linked detail page.
url = "http://abc.go.com/shows/the-bachelor/cast"
r = requests.get(url, timeout=30)
# Fail loudly here: without the index page there is nothing to scrape.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "details-link"})

bios = []
# NOTE(review): the first two "details-link" anchors are skipped, presumably
# because they are not contestants -- confirm against the live page.
for link in links[2:]:
    href = link["href"]
    print(href)
    r = requests.get("http://abc.go.com/" + href, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    answers = soup.find("div", {"class": "expandable-section"})
    if answers is None:
        # No bio section on this page; skip it rather than crash on None.
        print("skipping " + href + ": no expandable-section found")
        continue
    all_text = ""
    for e in answers.findAll("p"):
        tag = e.strong
        if tag:
            # Blank the first <strong> element so its text is left out of
            # e.text (presumably the question label -- confirm).
            tag.string = ""
        all_text = all_text + " " + e.text + " "
    bios.append([href, all_text])

# NOTE(review): the original indentation was lost; this is assumed to print
# once, after all pages have been fetched.
print("done!")
# Reference bio that every contestant bio is compared against. The fragments
# are unicode literals so that ``translate`` with a dict table also works on
# Python 2; the text itself is unchanged.
# NOTE(review): on Python 2 the file also needs a source-encoding declaration
# for the curly quotes below -- confirm one exists at the top of the file.
ben = (
    u"Bachelor Nation was heartbroken when fan-favorite Ben H, the charming software salesman, was sent home by Kaitlyn Bristowe on The Bachelorette. Ben saw a “great life” with Kaitlyn, only to have it disappear before him when he was left without a rose. It wasn’t easy for Ben to open himself up to love on The Bachelorette because he’s been hurt in past relationships. However, now knowing he is capable of being in love and being loved, he is ready to put the heartbreak behind him as he searches for his one true love. "
    u"Ben grew up in the Indiana town of Warsaw in America’s heartland. While Ben currently resides in Denver, Colorado, his heart will always be in Indiana with his family. Ben is the only son of loving, married parents David and Amy, a union that has been going strong for thirty-four years. He wants nothing more than to find the kind of love that his parents have. And he’s ready. While only 26, Ben is wise beyond his years. An avid traveler and a champion of varied cultures, he has worked at a zoo in Peru, traveled through the jungles of Bolivia, and even hiked the spiritual trails of famed Machu Picchu. "
    u"Since appearing on The Bachelorette, Ben’s life has changed dramatically. Adoring fans approach him in restaurants and young women wait outside his house for a chance to meet him. While his life has changed, Ben hasn’t. He’s still the same, big-hearted man looking for someone to share his life with. "
    u"This successful, sensitive and handsome 6’4” bachelor loves basketball, hiking, fishing, and stimulating conversation. He almost has it all. Now he just needs the right woman to love, and for her to love him back. And he truly believes he will find his soul mate on The Bachelor."
)
# Score every scraped bio against Ben's bio, then save the [href, score]
# pairs to bachelor.csv.
matches = [[entry[0], cosine_sim(ben, entry[1])] for entry in bios]
write_to_csv("bachelor.csv", matches)
# (GitHub page footer: sign up for free to join this conversation on GitHub,
# or sign in to comment if you already have an account.)