Created
June 29, 2018 19:14
-
-
Save himlohiya/52334de78346d36d2675035fb75c9ad7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect every named entity found in the corpus as an
# (entity name, entity type) pair, then rank the pairs by frequency.
#
# Expects `corpus` (an iterable of sentence strings), `nlp` (a loaded spaCy
# pipeline) and `pd` (pandas) to be defined earlier in the session.
named_entities = []
for sentence in corpus:
    doc = nlp(sentence)
    # Iterate over the entity spans spaCy already delimited instead of
    # stitching tokens together by hand: the token-by-token approach dropped
    # an entity that ended the sentence (nothing followed it to trigger the
    # flush) and merged adjacent entities into a single mislabelled name.
    for ent in doc.ents:
        # Join token texts with a single space so entity names keep the same
        # shape the token-level accumulation produced.
        entity_name = ' '.join(token.text for token in ent).strip()
        named_entities.append((entity_name, ent.label_))

entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])

# get the top named entities
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index(name='Frequency'))

# Transposed view of the 15 most frequent entities (notebook display).
top_entities.T.iloc[:, :15]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment