Last active
August 29, 2015 14:25
-
-
Save magnusnissel/1674886e4d0ef9f99d93 to your computer and use it in GitHub Desktop.
A simple class to score text with LabMT and pandas based on the code at http://neuro.imm.dtu.dk/wiki/LabMT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import re | |
class LabMTScorer:
    """Score text against the LabMT happiness word list using pandas.

    Based on the example provided by Finn Årup Nielsen at
    http://neuro.imm.dtu.dk/wiki/LabMT

    Attributes:
        avg_happiness: Mean of the ``happiness_average`` column of the
            loaded table.
        happiness: Dict mapping each index entry of the table (the first
            column) to its ``happiness_average`` minus ``avg_happiness``.
    """

    # Location used when no explicit source is passed to the constructor.
    DEFAULT_SOURCE = ('http://www.plosone.org/article/'
                      'fetchSingleRepresentation.action?'
                      'uri=info:doi/10.1371/journal.pone.0026752.s001')

    # Any run of characters other than ASCII letters, digits, hyphen and
    # apostrophe separates two tokens. Compiled once for all instances.
    _TOKEN_SEPARATOR = re.compile(r"[^0-9A-Za-z\-']+")

    def __init__(self, source=None):
        """Load the word list and centre the happiness values on their mean.

        Args:
            source: Anything ``pandas.read_csv`` accepts (path, URL or
                file-like object) holding a tab-separated table with a
                ``happiness_average`` column. The first two rows are
                skipped and the first column becomes the index. Defaults
                to ``DEFAULT_SOURCE``.
        """
        if source is None:
            source = self.DEFAULT_SOURCE
        labmt_csv = pd.read_csv(source, skiprows=2, sep='\t', index_col=0)
        self.avg_happiness = labmt_csv.happiness_average.mean()
        self.happiness = (
            labmt_csv.happiness_average - self.avg_happiness).to_dict()

    def score_tokens(self, tokens):
        """Return the mean centred happiness of ``tokens``.

        Tokens are lower-cased before lookup; tokens missing from the
        word list contribute 0.0 but still count towards the mean.

        Args:
            tokens: Sized iterable of strings.

        Returns:
            The average score, or 0.0 when ``tokens`` is empty (instead of
            raising ``ZeroDivisionError``).
        """
        count = len(tokens)
        if count == 0:
            return 0.0
        total = sum(self.happiness.get(tok.lower(), 0.0) for tok in tokens)
        return total / count

    def tokenize(self, text):
        """Split ``text`` into a list of non-empty tokens.

        ``re.split`` yields empty strings when the text starts or ends
        with a separator (e.g. a trailing full stop); those are dropped so
        they do not inflate the token count used for averaging.
        """
        return [tok for tok in self._TOKEN_SEPARATOR.split(text) if tok]

    def score_text(self, text):
        """Tokenize ``text`` and return its mean centred happiness score."""
        return self.score_tokens(self.tokenize(text))
def test():
    """Print the LabMT score of two sample sentences.

    Builds a ``LabMTScorer`` with its default source, scores each sample
    with ``score_text`` and prints the sentence followed by its score.
    """
    samples = (
        "He was a bad, bad man. Utterly corrupt.",
        "She's the best. So awesome and great.",
    )
    labmt = LabMTScorer()
    for sample in samples:
        print(sample, labmt.score_text(sample))


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment