Skip to content

Instantly share code, notes, and snippets.

@ntrrgc
Created May 17, 2015 19:00
Show Gist options
  • Select an option

  • Save ntrrgc/828a649b15c031276285 to your computer and use it in GitHub Desktop.

Select an option

Save ntrrgc/828a649b15c031276285 to your computer and use it in GitHub Desktop.
Rank keyword density
import collections
import sys

import lxml.html
import nltk
import nltk.corpus
import nltk.stem.snowball
import nltk.tokenize.regexp
import requests
# Language name passed to both the NLTK stop-word corpus and the Snowball
# stemmer further down in this script.
lang = 'spanish'
def get_text(url=None):
    """Download a web page and return the text found inside its articles.

    Args:
        url: Address of the page to fetch.  When omitted, the first
            command-line argument (``sys.argv[1]``) is used.

    Returns:
        A single string made of the text nodes of every element nested
        inside an ``<article>`` element, joined with newlines.  Elements
        that are ``<code>`` or live inside a ``<code>`` are skipped.

    Raises:
        SystemExit: If no URL was passed and none is on the command line.
        requests.HTTPError: If the server answers with an error status.
    """
    if url is None:
        if len(sys.argv) < 2:
            raise SystemExit('usage: %s URL' % sys.argv[0])
        url = sys.argv[1]
    # Without a timeout an unresponsive host would hang the script forever.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than ranking the keywords of an error page.
    response.raise_for_status()
    document = lxml.html.fromstring(response.content.decode('UTF-8'))
    return '\n'.join(
        document.xpath('//article//*[not(ancestor-or-self::code)]/text()'))
# Fetch the article text for the URL given on the command line.
text = get_text()
# Tokenize on runs of word characters.  The previous pattern only split on
# space, newline and ". , ; :", so brackets, quotes, question/exclamation
# marks or tabs stayed glued to words; those tokens then slipped past the
# stop-word filter and were counted as separate keywords.
tokens = nltk.tokenize.regexp.RegexpTokenizer(r'\w+').tokenize(text)
stopwords = set(nltk.corpus.stopwords.words(lang))
# Lower-case each token once, then drop the stop words.
tokens = [word for word in map(str.lower, tokens) if word not in stopwords]
stemmer = nltk.stem.snowball.SnowballStemmer(lang)
class FreqCounter:
    """Tally how many times each hashable item has been added."""

    def __init__(self):
        # Maps every item seen so far to its number of occurrences.
        # collections.Counter is a dict subclass, so existing dict-style
        # access to ``instances`` keeps working.
        self.instances = collections.Counter()

    def add(self, instance):
        """Record one more occurrence of ``instance``."""
        # Counter treats missing keys as zero, so no membership test is needed.
        self.instances[instance] += 1

    def rank(self):
        """Return ``(item, count)`` pairs sorted from most to least frequent.

        Items with equal counts keep the order in which they were first
        added.
        """
        return self.instances.most_common()
# Count how often each stem occurs and, per stem, how often each original
# word form produced it.
stem_freq = FreqCounter()
stem_origins = {}
for token in tokens:
    root = stemmer.stem(token)
    try:
        origins = stem_origins[root]
    except KeyError:
        origins = stem_origins[root] = FreqCounter()
    origins.add(token)
    stem_freq.add(root)

# Print the 30 most frequent stems, each shown as its most common word form
# together with its share of all (non-stop-word) tokens.
total_tokens = len(tokens)
for root, count in stem_freq.rank()[:30]:
    share = count / total_tokens
    best_form, _ = stem_origins[root].rank()[0]
    print('%15s %.2f%%' % (best_form, share * 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment