Created
May 17, 2015 19:00
-
-
Save ntrrgc/828a649b15c031276285 to your computer and use it in GitHub Desktop.
Rank keyword density
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import nltk | |
| import nltk.tokenize.regexp | |
| import nltk.stem.snowball | |
| import nltk.corpus | |
| import requests | |
| import lxml.html | |
| lang = 'spanish' | |
def get_text(url=None):
    """Download a web page and return the text found inside its <article>.

    Args:
        url: Address of the page to fetch. When omitted, the first
            command-line argument (``sys.argv[1]``) is used, which is how
            the script has always been invoked.

    Returns:
        A single string made of every text node that sits under an element
        inside ``<article>``, skipping anything within a ``<code>`` element,
        joined with newlines.

    Raises:
        IndexError: no ``url`` was given and no command-line argument exists.
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not answer within the timeout.
        UnicodeDecodeError: the response body is not valid UTF-8.
    """
    if url is None:
        url = sys.argv[1]
    # Without a timeout requests may block indefinitely on a stalled server.
    r = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of ranking the words of an
    # error page as if it were the article.
    r.raise_for_status()
    h = lxml.html.fromstring(r.content.decode('UTF-8'))
    return '\n'.join(h.xpath('//article//*[not(ancestor-or-self::code)]/text()'))
# Fetch the article text and cut it into tokens: a token is any run of
# characters that contains no space, newline, '.', ',', ';' or ':'.
text = get_text()
_splitter = nltk.tokenize.regexp.RegexpTokenizer(r'[^ .,;\n:]+')
tokens = _splitter.tokenize(text)

# Lowercase every token and drop the ones listed as stopwords for `lang`.
stopwords = set(nltk.corpus.stopwords.words(lang))
tokens = [lowered
          for lowered in (raw.lower() for raw in tokens)
          if lowered not in stopwords]

# Stemmer used afterwards to group inflected forms of the same word.
stemmer = nltk.stem.snowball.SnowballStemmer(lang)
| class FreqCounter(object): | |
| def __init__(self): | |
| self.instances = {} | |
| def add(self, instance): | |
| if instance in self.instances: | |
| self.instances[instance] += 1 | |
| else: | |
| self.instances[instance] = 1 | |
| def rank(self): | |
| return sorted(self.instances.items(), | |
| key=lambda p: -p[1]) | |
# Count how often each stem occurs and, for every stem, how often each
# original (lowercased) token produced it.
stem_freq = FreqCounter()
stem_origins = {}
for t in tokens:
    stem = stemmer.stem(t)
    if stem not in stem_origins:
        stem_origins[stem] = FreqCounter()
    stem_origins[stem].add(t)
    stem_freq.add(stem)

# Print the 30 most frequent stems with their share of all kept tokens.
# The denominator is a float so the ratio is not truncated to 0 if the
# script is run on an interpreter where int / int is integer division.
total_tokens = float(len(tokens))
for (stem, occurrences) in stem_freq.rank()[:30]:
    freq = occurrences / total_tokens
    # Show the most common surface form of the stem rather than the stem
    # itself.
    word = stem_origins[stem].rank()[0][0]
    print('%15s %.2f%%' % (word, freq * 100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment