Last active
June 27, 2018 18:09
-
-
Save dustinknopoff/7973cb3fbf896d9149e82cf13642e44b to your computer and use it in GitHub Desktop.
Attempt to extract the most frequently mentioned apps from the MPU forum's "What is your favorite app ever?" thread by tokenizing its posts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import nltk | |
import requests | |
from bs4 import BeautifulSoup | |
# def isApp(word): | |
# """ | |
# Checks to see if given word is a Mac or iOS app. | |
# :param word: a valid English word. | |
# :return: True if word is the name of an application. | |
# """ | |
# r = requests.get(f"https://www.macupdate.com/find/mac/{word}").content | |
# soup = BeautifulSoup(r, 'html.parser') | |
# for app in soup.find_all("td", class_="td-app-name"): | |
# possible = app.span.get_text() | |
# if word.capitalize() is possible: | |
# return True | |
# return False | |
def tokenize(alltext, limit=100, min_length=6):
    """
    Given a string, find its most common words that are not English
    stopwords and are at least ``min_length`` characters long.

    Words are whitespace-separated tokens, compared case-insensitively
    after periods, commas and apostrophes have been removed.

    Needs the NLTK ``stopwords`` corpus to be available.

    :param alltext: a string of words.
    :param limit: maximum number of entries to return (default 100).
    :param min_length: minimum length of a counted word (default 6,
        i.e. longer than 5 chars).
    :return: list of ``(word, count)`` tuples, most frequent first.
    """
    # One translation table is applied to both the text and the stopwords so
    # that contractions such as "doesn't" / "doesnt" are still recognised.
    # \u2019 is the typographic apostrophe, handled like the ASCII one.
    strip_punctuation = str.maketrans('', '', ".,'\u2019")

    # A set gives O(1) membership tests; keep both the raw and the
    # punctuation-free spelling of every stopword.
    stopwords = set()
    for stopword in nltk.corpus.stopwords.words('english'):
        stopwords.add(stopword)
        stopwords.add(stopword.translate(strip_punctuation))

    # Lowercase BEFORE filtering: the stopword list is all lowercase, so
    # checking the raw token would let "Because", "Should", ... through.
    words = alltext.translate(strip_punctuation).lower().split()
    counter = Counter(
        w for w in words if len(w) >= min_length and w not in stopwords
    )
    return counter.most_common(limit)
# def runner(): | |
# posts = get_all() | |
# alltext = "" | |
# for s in posts: | |
# alltext += s | |
# freqwords = tokenize(alltext) | |
# apps = [] | |
# # for word in freqwords: | |
# # print(f"Working on {word}") | |
# # if isApp(word): | |
# # apps.append(word) | |
# print(freqwords) | |
# # | |
# # print(isApp(freqwords[0])) | |
def get_all():
    """
    Gets all posts from the "What is Your Favorite App Ever" question on
    the MPU Forum.

    Pages are fetched one at a time, starting at page 1, until the forum
    answers with 404, shows its "page doesn't exist" heading, or returns a
    page containing no posts.

    :return: list with the text content of every post found.
    :raises requests.HTTPError: if a page request fails with an HTTP error
        status other than 404.
    """
    allposts = []
    base = "https://talk.macpowerusers.com/t/what-is-your-favorite-app-ever/478?page="
    count = 1
    while True:
        print(count)  # progress indicator: page currently being fetched
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(base + str(count), timeout=30)
        if response.status_code == 404:
            break
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Not every page has an <h1>; guard against None before reading it.
        heading = soup.h1
        if heading is not None and 'Oops! That page doesn’t exist or is private.' in heading.get_text():
            break

        posts = soup.find_all("div", class_="post")
        if not posts:
            # No posts on this page: stop rather than request pages forever.
            break
        allposts.extend(post.get_text() for post in posts)
        count += 1
    return allposts
if __name__ == '__main__':
    # Script entry point: download every post, then print the most common
    # words found across all of them.
    posts = get_all()
    # Join with a newline so the last word of one post and the first word
    # of the next are not fused into a single bogus token.
    alltext = "\n".join(posts)
    freqwords = tokenize(alltext)
    print(freqwords)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment