Instantly share code, notes, and snippets.
Created
June 8, 2017 07:26
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save Poorvak/85a8494b2997d84179dcef6605bf7a5b to your computer and use it in GitHub Desktop.
NER Term Frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Packages used for model training and machine learning\n", | |
"import sys\n", | |
"import operator\n", | |
"\n", | |
"# import gensim\n", | |
"from nltk.tree import Tree\n", | |
"from sklearn.externals import joblib\n", | |
"from nltk import ne_chunk, pos_tag, word_tokenize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"try:\n", | |
" filename = sys.argv[2]\n", | |
"except:\n", | |
" filename = \".sample_test\"\n", | |
"sample = joblib.load(filename=filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[('madhya pradesh', 5), ('rahul gandhi', 3), ('mandsaur', 3), ('congress', 3), ('delhi', 2), ('indian', 2), ('gujarat', 2), ('raf pipliamandi', 2), ('rapid action force', 2), ('raf', 2), ('congress madhya', 2), ('op tripathi', 1), ('neemuch congress', 1), ('mandsaur raf', 1), ('bahujan samaj party', 1), ('mandsaur congress', 1), ('dig ratlam', 1), ('indian express', 1), ('ajay singh', 1), ('neemuch manoj kumar singh', 1), ('rastriya kisan mazdoor sangh', 1), ('rahul gandhi mandsaur', 1), ('mandsaur rajasthan', 1), ('raf garoth', 1), ('rajasthan', 1), ('neemuch sp manoj kumar singh gandhi', 1), ('bollywood', 1), ('india rbi', 1), ('shivraj singh chouhan', 1), ('madhya', 1), ('gandhi', 1), ('ncrb', 1), ('superintendent', 1), ('sunil goud', 1), ('collector', 1), ('madhya congress', 1), ('raf chouhan', 1), ('entertainment', 1), ('state', 1), ('cag congress', 1), ('new', 1), ('sonkach', 1), ('bhopal', 1), ('india indian', 1), ('rashtriya swayamsevak', 1), ('pipliamandi', 1), ('gandhi congress', 1), ('sharad yadav', 1), ('national crime records bureau', 1), ('madhya pradesh shivraj singh chouhan', 1), ('dewas', 1), ('avinash sharma', 1), ('nda', 1), ('sharad yadav district magistrate', 1), ('sangh', 1), ('maharashtra', 1), ('congress congress', 1), ('rashtriya kisan mazdoor sangh', 1), ('vidarbha', 1), ('uttar', 1), ('mantralaya', 1), ('gandhi mandsaur', 1), ('mandsaur sp', 1), ('madhya pradesh madhya pradesh', 1), ('neemuch sp manoj kumar singh', 1), ('janata dal united', 1), ('bjp', 1), ('garoth', 1), ('congress ajay singh', 1), ('madhya nayagaon', 1), ('congress sp', 1), ('bjp indian', 1), ('ani', 1), ('nayagaon', 1), ('janata dal', 1), ('rahul', 1), ('arun yadav gandhi', 1), ('bhupendra singh', 1), ('superintendent mandsaur', 1), ('mandsaur mandsaur collector swatantra kumar singh', 1), ('reserve bank', 1), ('arun yadav', 1), ('gujarat madhya pradesh', 1), ('mandsaur farmers', 1), ('indore', 1), ('cag', 1), ('india india', 1), ('madhya pradesh tamil nadu', 1), 
('mandsaur gandhi', 1), ('india', 1), ('mandsaur shivraj singh chouhan', 1), ('manoj kumar singh', 1), ('op shrivastava', 1), ('united', 1), ('singh', 1), ('centre', 1), ('aicc', 1), ('madhya pradesh rajasthan', 1), ('neemuch', 1), ('congress bjp', 1), ('sp', 1), ('abhishek singhvi', 1), ('district magistrate', 1), ('uttar pradesh', 1)]\n" | |
] | |
} | |
], | |
"source": [ | |
"def get_continuous_chunks(text):\n", | |
" chunked = ne_chunk(pos_tag(word_tokenize(text)))\n", | |
" prev = None\n", | |
" continuous_chunk = []\n", | |
" current_chunk = []\n", | |
" for i in chunked:\n", | |
" if type(i) == Tree:\n", | |
" current_chunk.append(\" \".join([token for token, pos in i.leaves()]))\n", | |
" elif current_chunk:\n", | |
" named_entity = \" \".join(current_chunk)\n", | |
" if named_entity not in continuous_chunk:\n", | |
" continuous_chunk.append(named_entity)\n", | |
" current_chunk = []\n", | |
" else:\n", | |
" continue\n", | |
" return continuous_chunk\n", | |
"\n", | |
" \n", | |
"def create_ner_tags(samples, *args, **kwargs):\n", | |
" ner_tf = dict()\n", | |
" for sample in samples:\n", | |
" ner = get_continuous_chunks(text=sample)\n", | |
" for word in ner:\n", | |
" if word.lower() in ner_tf:\n", | |
" ner_tf[word.lower()] += 1\n", | |
" else:\n", | |
" ner_tf[word.lower()] = 1\n", | |
" return ner_tf\n", | |
"\n", | |
"ner_tf = create_ner_tags(samples=sample)\n", | |
"print sorted(ner_tf.items(), key=operator.itemgetter(1), reverse=True)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment