Last active
October 31, 2016 21:25
-
-
Save soeirosantos/2f442a4b1864fae8f66311b2d86bec9f to your computer and use it in GitHub Desktop.
A very simple, naive word counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import io
import operator
import os
import re
from collections import Counter
from time import gmtime, strftime

import feedparser
from stop_words import get_stop_words
# Path prefix (relative to the working directory) for the result files.
OUTPUT_FOLDER = "out/"
# English stop-word list provided by the stop_words package.
stop_words = get_stop_words("en")
def execute():
    """Count the words used in the Hacker News "bigrss" feed titles.

    Downloads the feed, splits every title on whitespace, normalizes each
    word (lower case, surrounding space and punctuation removed), drops
    English stop words and writes every word seen more than once to a new
    file under OUTPUT_FOLDER, one ``word count`` pair per line, most
    frequent first. The file is named after the current UTC time
    (``YYYYmmdd_HHMMSS``) and is written as UTF-8.

    Raises:
        KeyError: if a feed item has no ``title``.
        IOError/OSError: if the output folder or file cannot be created.
    """
    # Compiled once instead of on every word.
    punctuation = re.compile(r'[-!$%^&*()_+|~=`{}\[\]:";\'<>?,.\/\#]')

    def clean_word(w):
        """Return ``w`` lower-cased, stripped and without punctuation."""
        return punctuation.sub('', w.lower().strip())

    feed = feedparser.parse('https://news.ycombinator.com/bigrss')

    # Stop words get the same normalization as the title words so that,
    # e.g., "don't" matches "dont"; a set makes each lookup O(1).
    cleaned_stop_words = set(clean_word(w) for w in stop_words)

    # Explicit loops rather than map(): map() is lazy on Python 3, so using
    # it only for its side effects would silently count nothing.
    qty_per_word = Counter()
    for item in feed['items']:
        for raw_word in item['title'].split():
            word = clean_word(raw_word)
            # Tokens made only of punctuation clean down to '' and are not
            # words; skip them together with the stop words.
            if word and word not in cleaned_stop_words:
                qty_per_word[word] += 1

    # Create the output folder on first run instead of failing after the
    # feed has already been downloaded and processed.
    if not os.path.isdir(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    output_path = OUTPUT_FOLDER + strftime("%Y%m%d_%H%M%S", gmtime())

    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and lets non-ASCII words be written instead of being dropped on a
    # UnicodeEncodeError; the with-block closes the file even on error.
    with io.open(output_path, 'w', encoding='utf-8') as output:
        # most_common() yields pairs by descending count, so the first
        # word seen only once ends the report.
        for word, qty in qty_per_word.most_common():
            if qty <= 1:
                break
            output.write(u"{0} {1}\n".format(word, qty))
# Run the word counter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    execute()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment