Skip to content

Instantly share code, notes, and snippets.

@soeirosantos
Last active October 31, 2016 21:25
Show Gist options
  • Save soeirosantos/2f442a4b1864fae8f66311b2d86bec9f to your computer and use it in GitHub Desktop.
Save soeirosantos/2f442a4b1864fae8f66311b2d86bec9f to your computer and use it in GitHub Desktop.
A very simple, naive word counter
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import operator
import os
import re
from collections import Counter
from time import gmtime, strftime

import feedparser
from stop_words import get_stop_words
# Folder (relative to the working directory) that receives the report files.
OUTPUT_FOLDER = 'out/'
# English stop words, loaded once at import time; execute() filters them out.
stop_words = get_stop_words('en')
# Punctuation stripped from every token; compiled once instead of per word.
_PUNCTUATION_RE = re.compile(r"""[-!$%^&*()_+|~=`{}\[\]:";'<>?,./#]""")


def _clean_word(word):
    """Return *word* lower-cased, stripped and with punctuation removed."""
    return _PUNCTUATION_RE.sub('', word.lower().strip())


def _count_words(titles, ignored):
    """Count the cleaned words of *titles*, skipping *ignored* and empty ones."""
    counts = Counter()
    for title in titles:
        for token in title.split():
            word = _clean_word(token)
            # A token made only of punctuation cleans down to '' - drop it
            # rather than reporting an empty "word".
            if word and word not in ignored:
                counts[word] += 1
    return counts


def _write_counts(counts, path):
    """Write the words seen more than once to *path*, most frequent first."""
    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and lets non-ASCII words be written instead of being silently skipped.
    with io.open(path, 'w', encoding='utf-8') as output:
        for word, count in counts.most_common():
            # most_common() is sorted by descending count, so the first
            # word seen only once marks the end of the interesting part.
            if count <= 1:
                break
            output.write(u'{0} {1}\n'.format(word, count))


def execute():
    """Count the words used in the Hacker News feed titles and save a report.

    Fetches the ``bigrss`` feed, splits every entry title into words,
    normalises them (lower case, punctuation removed), discards the words
    listed in the module-level ``stop_words`` and writes every word that
    occurs more than once to a new file inside ``OUTPUT_FOLDER``.  The file
    is named after the current UTC time (``YYYYmmdd_HHMMSS``) and holds one
    ``<word> <count>`` line per word, most frequent first, encoded as UTF-8.

    Raises:
        IOError/OSError: if the output folder or file cannot be created.
    """
    feed = feedparser.parse('https://news.ycombinator.com/bigrss')
    # Entries without a title are skipped instead of raising KeyError.
    titles = [entry.get('title') for entry in feed['items']]
    titles = [title for title in titles if title]

    # Stop words go through the same cleaning as the title words so that
    # e.g. "don't" matches the cleaned token "dont"; a set keeps each
    # membership test O(1).
    ignored = {_clean_word(word) for word in stop_words}
    counts = _count_words(titles, ignored)

    # Create the output folder on first use instead of failing on open().
    if not os.path.isdir(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    report_name = strftime("%Y%m%d_%H%M%S", gmtime())
    _write_counts(counts, os.path.join(OUTPUT_FOLDER, report_name))
# Run the word count only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    execute()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment