Created
August 12, 2014 10:30
-
-
Save anjesh/7a39b583e3bf111b6d80 to your computer and use it in GitHub Desktop.
This scores the given news based on the words defined in the feature.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
from nltk.stem import PorterStemmer | |
from nltk.tokenize import WordPunctTokenizer, PunktWordTokenizer | |
import string | |
from os import listdir | |
from os.path import isfile, join | |
import logging | |
# Module-level logging: records at DEBUG level and above are sent to a
# local log file, formatted with timestamp, logger name, level and message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

handler = logging.FileHandler('news-scrapper-debug.log')
handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
def extract_words(text):
    '''
    Tokenize ``text`` and return the list of Porter-stemmed tokens.

    Each token is lower-cased before stemming. Tokens whose lower-cased
    form is an English stopword, and tokens that are a single character
    long, are dropped. Duplicates are kept, in tokenizer order.
    '''
    stemmer = PorterStemmer()
    # NOTE(review): PunktWordTokenizer may not exist in newer NLTK releases;
    # confirm the NLTK version this project pins.
    tokenizer = PunktWordTokenizer()
    # Load the stopword list once and keep it in a set: the lookup below
    # runs for every token, so re-reading the corpus each time is wasteful.
    english_stopwords = set(stopwords.words('english'))

    result = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        if len(token) > 1 and lowered not in english_stopwords:
            result.append(stemmer.stem(lowered))
    return result
class FeatureCollection:
    '''
    Registry of feature objects, keyed by feature id.
    '''

    def __init__(self):
        # Per-instance registry. A class-level dict would be shared by
        # every FeatureCollection, leaking features between collections.
        self.features = {}

    def add(self, id, name, text):
        '''
        Build a FeatureClass from ``text`` and register it under ``id``.

        Returns the newly created feature object.
        Raises Exception when ``id`` is already registered.
        '''
        logger.info("Adding feature id:%s name:%s to collection", id, name)
        if id in self.features:
            logger.critical("Duplication id:%s already exists in FeatureCollection", id)
            # Format instead of concatenating so non-string ids do not
            # turn the duplicate report into a TypeError.
            raise Exception('Duplication: %s already exists in FeatureCollection' % (id,))
        created = FeatureClass(id, name, text)
        self.features[id] = created
        return created

    def getFeatureObj(self, id):
        '''
        Return the feature registered under ``id`` (KeyError if absent).
        '''
        return self.features[id]
class FeatureClass:
    '''
    A named feature: its id, name, source text and the distinct words
    extracted from that text.
    '''
    # Class-level defaults; every instance overrides them in __init__.
    id = 0
    name = ""
    text = ""
    feature = {}

    def __init__(self, id, name, text):
        self.id, self.name, self.text = id, name, text
        # Collapse repeated words so each one appears once in the feature.
        distinct_words = set(extract_words(text))
        self.feature = list(distinct_words)
        self.featureCount = len(self.feature)
def countWord(word, tokens):
    '''
    Return how many times ``word`` occurs in ``tokens`` (0 when absent).

    On Python 2 a ``unicode`` word is encoded to UTF-8 before counting.
    '''
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = None  # no separate unicode type; nothing to encode
    # isinstance also covers unicode subclasses, which comparing the
    # type's repr string did not.
    if text_type is not None and isinstance(word, text_type):
        # presumably tokens are UTF-8 byte strings; verify against caller
        word = word.encode('utf-8')
    # count() already yields 0 for a missing word, so no membership
    # pre-check (and second scan) is needed.
    return tokens.count(word)
class NewsScorer:
    '''
    Scores one piece of news content against features.

    The content is tokenized once with extract_words. When a feature
    collection is supplied, a score per feature id is stored in ``scores``.
    '''

    def __init__(self, content, featureCollection = ""):
        '''
        content           -- text of the news item to score.
        featureCollection -- optional collection; when truthy, all of its
                             features are scored immediately.
        '''
        self.content = content
        self.tokens = extract_words(content)
        self.featureCollection = featureCollection
        self.scores = {}
        if featureCollection:
            self.calculateAll()

    def calculateAll(self):
        '''
        Fill ``self.scores`` with one score per feature in the collection.
        '''
        for featureId, featureObj in self.featureCollection.features.items():
            logger.info("Calculating score for news for feature id:%s name:%s", featureId, featureObj.name)
            self.scores[featureId] = self.calculateFeatureScore(featureObj)['score']

    def calculateFeatureScore(self, featureObj):
        '''
        Score the news tokens against a single feature.

        Returns a dict with:
          'score' -- total occurrences of the feature's words in the news
                     tokens, divided by the number of words in the feature
                     (0.0 when the feature has no words).
          'words' -- mapping of each matched feature word to its count.
        '''
        total = 0
        found = {}
        for word in featureObj.feature:
            count = countWord(word, self.tokens)
            if count >= 1:
                found[word] = count
                total += count
        logger.debug("Found words: %s", found)
        # A feature built from empty or all-stopword text has no words;
        # report a zero score instead of dividing by zero.
        if featureObj.featureCount:
            score = 1.0 * total / featureObj.featureCount
        else:
            score = 0.0
        return {'score': score, 'words': found}
if __name__ == '__main__':
    # Script entry point: register the feature word lists, then score
    # every file found in the news directory and print the result.

    def _read_file(path):
        # Read the whole file and close the handle as soon as we are done.
        with open(path, 'r') as source:
            return source.read()

    featureCollection = FeatureCollection()
    featureCollection.add('agriculture', 'agriculture', _read_file('../data/features/agriculture-1.txt'))
    featureCollection.add('tourism', 'tourism', _read_file('../data/features/tourism-1.txt'))

    mypath = "../data/news"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for myfile in onlyfiles:
        content = _read_file(join(mypath, myfile))
        newsScore = NewsScorer(content, featureCollection)
        # One line per file: the file name followed by its score mapping.
        print("%s %s" % (myfile, newsScore.scores))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This Python file uses the following encoding: utf-8 | |
# place this file in the same folder containing news_score.py | |
from news_score import * | |
# Demo 1: score an English sentence against a small agriculture feature.
featureCollection = FeatureCollection()
feature = featureCollection.add('agriculture-1', 'agriculture', "Vegetable farmers agriculture")
englishScorer = NewsScorer("Vegetable is good. Farmers should produce more vegetables.")
print(englishScorer.calculateFeatureScore(feature))

# Demo 2: the same flow with a feature whose text is a unicode string.
featureunicode = featureCollection.add('agriculture-unicode', 'agriculture', "किसान तरकारी".decode("utf8"))
for featureWord in featureunicode.feature:
    print(featureWord.encode('utf-8'))

newsScore = NewsScorer("किसान मेहेनत गर्छ")
result = newsScore.calculateFeatureScore(featureunicode)
print(result)

# List every matched word together with how often it was found.
for word, count in result['words'].items():
    print("%s : %s" % (word.encode('utf-8'), count))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment