Created
May 4, 2013 03:53
-
-
Save shriphani/5516090 to your computer and use it in GitHub Desktop.
Clean text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Cleaner exports a set of functions to do stemming etc. | |
| """ | |
| #!/usr/bin/env python | |
| import string | |
| from nltk.stem import porter | |
| # Shared Porter stemmer instance; the stemming step of LINE_OPS_STEM calls STEMMER.stem() on each word. | |
| STEMMER = porter.PorterStemmer() | |
# Text-cleaning pipelines.
#
# Each pipeline is a list of one-argument callables (string in, string out)
# that callers apply in list order.  LINE_OPS_STEM and LINE_OPS are identical
# except that LINE_OPS_STEM stems every word before short words are dropped,
# so each step is defined once below and listed in both pipelines.

# Python 3 exposes maketrans on str; Python 2 only has string.maketrans.
_maketrans = getattr(str, 'maketrans', None) or string.maketrans

# Built once at import time instead of on every call.  The ascii_* constants
# are used because string.uppercase / string.lowercase are locale-dependent
# on Python 2 and do not exist on Python 3.
_PUNCT_DIGIT_CASE_TABLE = _maketrans(
    string.punctuation + string.ascii_uppercase + string.digits,
    ' ' * len(string.punctuation)
    + string.ascii_lowercase
    + ' ' * len(string.digits),
)

# Set form of string.printable for constant-time membership tests.
_PRINTABLE_CHARS = frozenset(string.printable)


def _collapse_whitespace(s):
    """Replace every run of whitespace (newlines, tabs, ...) with one space."""
    return ' '.join(s.split())


def _drop_links(s):
    """Remove every whitespace-separated word that starts with 'http'."""
    return ' '.join(w for w in s.split() if not w.startswith('http'))


def _strip_punctuation_digits_and_lowercase(s):
    """Turn ASCII punctuation and digits into spaces and lowercase A-Z."""
    return s.translate(_PUNCT_DIGIT_CASE_TABLE)


def _replace_non_printable(s):
    """Replace each character that is not in string.printable with a space."""
    return ''.join(c if c in _PRINTABLE_CHARS else ' ' for c in s)


def _stem_words(s):
    """Stem every word of *s* with the module-level STEMMER."""
    return ' '.join(STEMMER.stem(w) for w in s.split())


def _drop_short_words(s):
    """Keep only the words of *s* that are longer than 3 characters."""
    # NOTE(review): an earlier comment described this step as keeping words
    # of "length 3 and above", but the condition is len(w) > 3 (4+ chars).
    # The coded behaviour is preserved here; confirm which one is intended.
    return ' '.join(w for w in s.split() if len(w) > 3)


# Cleaning pipeline that also stems each word.
LINE_OPS_STEM = [
    _collapse_whitespace,
    _drop_links,
    _strip_punctuation_digits_and_lowercase,
    _replace_non_printable,
    _stem_words,
    _drop_short_words,
]

# Cleaning pipeline without stemming.
LINE_OPS = [
    _collapse_whitespace,
    _drop_links,
    _strip_punctuation_digits_and_lowercase,
    _replace_non_printable,
    _drop_short_words,
]
def clean_text(text):
    """Run *text* through every step of LINE_OPS and return the result.

    The steps are applied in list order, each one receiving the previous
    step's output.  The final string is split on whitespace and re-joined
    with single spaces before it is returned.
    """
    cleaned = text
    for step in LINE_OPS:
        cleaned = step(cleaned)
    words = cleaned.split()
    return ' '.join(words)
def clean_text_and_stem(text):
    """Run *text* through every step of LINE_OPS_STEM and return the result.

    Works like a plain pipeline: each callable in LINE_OPS_STEM is applied
    in list order to the output of the one before it.  The final string is
    split on whitespace and re-joined with single spaces.
    """
    result = text
    for transform in LINE_OPS_STEM:
        result = transform(result)
    # Normalise spacing once more after the last step.
    tokens = result.split()
    return ' '.join(tokens)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment