Created
December 3, 2018 16:51
-
-
Save bowbowbow/45124dc14e886f89fa566c61a74de4cf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
from pprint import pprint

import nltk
import spacy

import utils

# Load the spaCy 'en_core_web_lg' pipeline once at import time; the resulting
# `nlp` callable is reused by find_candidates_by_keyword for entity extraction.
nlp = spacy.load('en_core_web_lg')
def count_by_keyword(df, keyword):
    """Count the rows of *df* whose title or body contains *keyword*.

    Args:
        df: Object exposing a pandas-style ``iterrows()``. Each row is read
            through the keys ``'title'`` and ``' body'`` (the leading space
            in ``' body'`` is part of the column name).
        keyword: Substring to look for; the match is a case-sensitive
            ``in`` test.

    Returns:
        The number of rows where the keyword occurs in the title or the
        body. The same number is also printed as a summary line.
    """
    count = 0
    for index, row in df.iterrows():
        # Progress trace: one line per visited row.
        print('index :', index)
        title = row['title']
        body = row[' body']
        if keyword in title or keyword in body:
            count += 1
    print('[count_by_keyword] keyword:{}, count:{}'.format(keyword, count))
    return count
def find_candidates_by_keyword(df, keyword):
    """Collect, sort and print every title/sentence that mentions *keyword*.

    For each row the title and every sentence of the body (split with
    ``nltk.sent_tokenize``) is tested with a case-sensitive ``in`` check.
    Each hit becomes a ``(text, timestamp, position)`` tuple, where
    ``position`` is ``-1`` for a title and the sentence index for a body
    sentence, so a title sorts ahead of the sentences sharing its timestamp.

    After sorting by ``(timestamp, position)`` each candidate is printed,
    followed by the ``(text, label)`` pairs of the entities that the
    module-level ``nlp`` pipeline finds in its text.

    Args:
        df: Object exposing a pandas-style ``iterrows()``. Each row is read
            through the keys ``'title'``, ``' time'`` and ``' body'`` (the
            leading spaces are part of the column names). ``' time'`` must
            be a string in ``'%Y-%m-%d %H:%M:%S'`` format.
        keyword: Substring to look for.

    Returns:
        The sorted list of candidate tuples.

    Raises:
        ValueError: If a ``' time'`` value does not match the expected
            format (raised by ``datetime.datetime.strptime``).
    """
    candidates = []
    for _, row in df.iterrows():
        title = row['title']
        published = datetime.datetime.strptime(row[' time'], '%Y-%m-%d %H:%M:%S')
        if keyword in title:
            # -1 marks "title" and orders it before the body sentences.
            candidates.append((title, published, -1))

        for position, sent in enumerate(nltk.sent_tokenize(row[' body'])):
            if keyword in sent:
                candidates.append((sent, published, position))

    print('candidates len: {} by keyword({})'.format(len(candidates), keyword))

    # Chronological order; within one timestamp, title first then sentences.
    candidates.sort(key=lambda item: (item[1], item[2]))

    for candidate in candidates:
        print(candidate)
        doc = nlp(candidate[0])
        print([(ent.text, ent.label_) for ent in doc.ents])

    return candidates
if __name__ == "__main__":
    # Script entry point: load the data through utils.data_load() and print
    # every title/sentence candidate that mentions the keyword.
    df = utils.data_load()
    find_candidates_by_keyword(df, 'Warmbier')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment