Created
December 3, 2018 16:24
-
-
Save bowbowbow/8ae3c4fa278ac3868d4e6cbd6dc5cfa3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import utils | |
| import nltk | |
| from pprint import pprint | |
def find_document_by_keyword(keyword):
    """Print how many documents mention ``keyword`` in their title or body.

    The data set is obtained from ``utils.data_load()`` and walked row by
    row. A row is counted once if ``keyword`` occurs (case-sensitive
    substring test) in its title or in its body. The result is printed as
    ``"<keyword>: <count>"``; nothing is returned.

    Args:
        keyword: Substring to search for.
    """
    matched = 0
    for _, record in utils.data_load().iterrows():
        headline = record['title']
        # NOTE(review): the column key is ' body' with a leading space --
        # presumably it mirrors the header of the loaded data; confirm.
        text = record[' body']
        if keyword not in headline and keyword not in text:
            continue
        matched += 1
    print('{}: {}'.format(keyword, matched))
def find_sentence_by_keyword(keyword):
    """Print how many titles and body sentences contain ``keyword``.

    The data set is obtained from ``utils.data_load()``. For each row the
    title is collected when it contains ``keyword``; the body is split into
    sentences with ``nltk.sent_tokenize`` and every sentence containing
    ``keyword`` is collected. The total is printed as
    ``"<keyword> sents size: <count>"``; nothing is returned.

    Args:
        keyword: Substring to search for (case-sensitive ``in`` test).
    """
    df = utils.data_load()
    sents = []
    for _, row in df.iterrows():
        title = row['title']
        if keyword in title:
            sents.append(title)
        # NOTE(review): the column key is ' body' with a leading space --
        # presumably it mirrors the header of the loaded data; confirm.
        body = row[' body']
        # Each item yielded by sent_tokenize is already a single sentence, so
        # it is collected as-is. Tokenizing it a second time could split it
        # into fragments, some of which would not contain the keyword, and
        # inflate the reported count.
        sents.extend(
            sent for sent in nltk.sent_tokenize(body) if keyword in sent
        )
    print('{} sents size: {}'.format(keyword, len(sents)))
if __name__ == "__main__":
    # Script entry point: run both keyword reports for the same sample term.
    sample_keyword = 'bribery'
    find_document_by_keyword(sample_keyword)
    find_sentence_by_keyword(sample_keyword)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment