Created
December 10, 2016 14:34
-
-
Save omundy/df021bf3b817301e485a98d233f6ca68 to your computer and use it in GitHub Desktop.
NLTK POS Smell Tagging Examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## NLTK POS Smell Tagging Examples" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"POS tag reference:\n", | |
"https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# initialize library\n", | |
"import nltk\n", | |
"nltk.data.path.append(\"/Users/owmundy/Documents/_code/Python/nltk_data\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('My', 'PRP$'),\n", | |
" ('wife', 'NN'),\n", | |
" ('bought', 'VBD'),\n", | |
" ('maple', 'JJ'),\n", | |
" ('bacon', 'NN'),\n", | |
" ('hand', 'NN'),\n", | |
" ('sanitizer', 'NN'),\n", | |
" ('.', '.'),\n", | |
" ('I', 'PRP'),\n", | |
" ('am', 'VBP'),\n", | |
" ('in', 'IN'),\n", | |
" ('a', 'DT'),\n", | |
" ('meeting', 'NN'),\n", | |
" ('smelling', 'VBG'),\n", | |
" ('like', 'IN'),\n", | |
" ('a', 'DT'),\n", | |
" ('gay', 'NN'),\n", | |
" ('lumberjack', 'NN')]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = VBG = Verb, gerund or present participle\n", | |
"# smell_object = maple bacon hand sanitizer\n", | |
"# smell_modifier = \n", | |
"# smell_smell = gay lumberjack | maple bacon hand sanitizer\n", | |
"# smell_sentiment = positive\n", | |
"s = \"My wife bought maple bacon hand sanitizer. I am in a meeting smelling like a gay lumberjack\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('The', 'DT'),\n", | |
" ('smell', 'NN'),\n", | |
" ('of', 'IN'),\n", | |
" ('hot', 'JJ'),\n", | |
" ('apple', 'NN'),\n", | |
" ('cider', 'NN'),\n", | |
" ('makes', 'VBZ'),\n", | |
" ('me', 'PRP'),\n", | |
" ('so', 'IN'),\n", | |
" ('happy', 'JJ'),\n", | |
" ('#', '#'),\n", | |
" ('fall', 'NN')]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = NN = Noun, singular or mass\n", | |
"# smell_object = hot apple cider\n", | |
"# smell_modifier = happy\n", | |
"# smell_smell = hot apple cider\n", | |
"# smell_sentiment = positive\n", | |
"s = \"The smell of hot apple cider makes me so happy #fall\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('The', 'DT'),\n", | |
" ('first', 'JJ'),\n", | |
" ('rain', 'NN'),\n", | |
" ('always', 'RB'),\n", | |
" ('smells', 'VBZ'),\n", | |
" ('so', 'RB'),\n", | |
" ('heavenly', 'RB'),\n", | |
" ('?', '.')]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = VBZ = Verb, 3rd person singular present\n", | |
"# smell_object = first rain\n", | |
"# smell_modifier = heavenly\n", | |
"# smell_smell = first rain\n", | |
"# smell_sentiment = positive\n", | |
"s = \"The first rain always smells so heavenly ?\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('Confirmed', 'NNP'),\n", | |
" ('Structure', 'NNP'),\n", | |
" ('Fire', 'NNP'),\n", | |
" ('Ridaucrest', 'NNP'),\n", | |
" ('Tower', 'NNP'),\n", | |
" ('via', 'IN'),\n", | |
" ('Rideau', 'NNP'),\n", | |
" ('Street', 'NNP'),\n", | |
" ('.', '.'),\n", | |
" ('Reportedly', 'RB'),\n", | |
" ('on', 'IN'),\n", | |
" ('1st', 'CD'),\n", | |
" ('floor', 'NN'),\n", | |
" ('.', '.'),\n", | |
" ('Strong', 'NNP'),\n", | |
" ('smell', 'NN'),\n", | |
" ('of', 'IN'),\n", | |
" ('burnt', 'JJ'),\n", | |
" ('plastic/rubber', 'NN'),\n", | |
" ('.', '.'),\n", | |
" ('@', 'NN'),\n", | |
" ('YGKTraffic', 'NN')]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = NN\n", | |
"# smell_object = Structure Fire \n", | |
"# smell_modifier = strong\n", | |
"# smell_smell = burnt plastic\n", | |
"# smell_sentiment = negative\n", | |
"s = \"Confirmed Structure Fire Ridaucrest Tower via Rideau Street. Reportedly on 1st floor. Strong smell of burnt plastic/rubber.@YGKTraffic\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('I', 'PRP'), ('smell', 'VBP'), ('money', 'NN')]" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = VBP = Verb, non-3rd person singular present\n", | |
"# smell_object = money\n", | |
"# smell_modifier = \n", | |
"# smell_smell = \n", | |
"# smell_sentiment = \n", | |
"s = \"I smell money\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('Can', 'MD'),\n", | |
" ('you', 'PRP'),\n", | |
" ('say', 'VB'),\n", | |
" ('insurance', 'NN'),\n", | |
" ('fraud', 'NN'),\n", | |
" ('?', '.'),\n", | |
" ('That', 'IN'),\n", | |
" ('what', 'WP'),\n", | |
" ('I', 'PRP'),\n", | |
" ('smell', 'VBP'),\n", | |
" ('https', 'NNS'),\n", | |
" (':', ':'),\n", | |
" ('//t.co/EuxLLYPqFN', 'NN')]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = VBP\n", | |
"# smell_object = insurance fraud \n", | |
"# smell_modifier = \n", | |
"# smell_smell = \n", | |
"# smell_sentiment = negative\n", | |
"s = \"Can you say insurance fraud? That what I smell https://t.co/EuxLLYPqFN\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('@', 'JJ'),\n", | |
" ('ThePME', 'NNP'),\n", | |
" ('missing', 'VBG'),\n", | |
" ('Browns', 'NNP'),\n", | |
" ('?', '.'),\n", | |
" ('They', 'PRP'),\n", | |
" ('stink', 'VBP'),\n", | |
" ('but', 'CC'),\n", | |
" ('running', 'VBG'),\n", | |
" ('the', 'DT'),\n", | |
" ('ball', 'NN'),\n", | |
" ('well', 'RB'),\n", | |
" ('.', '.')]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# smell_word = VBP\n", | |
"# smell_object = They / Browns\n", | |
"# smell_modifier = \n", | |
"# smell_smell = \n", | |
"# smell_sentiment = negative\n", | |
"s = \"@ThePME missing Browns? They stink but running the ball well.\"\n", | |
"tokens = nltk.word_tokenize(s) # tokenize words\n", | |
"tagged = nltk.pos_tag(tokens) # parts of speech tagging\n", | |
"tagged" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment