Created
March 15, 2018 01:55
-
-
Save cuuupid/84da5801e632b41301ec5a932c1f0054 to your computer and use it in GitHub Desktop.
Identify keywords in text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sumy.parsers.plaintext import PlaintextParser | |
from sumy.nlp.tokenizers import Tokenizer | |
from sumy.summarizers.lsa import LsaSummarizer as Summarizer | |
from sumy.nlp.stemmers import Stemmer | |
from sumy.utils import get_stop_words | |
# Language name passed to every sumy component created below.
LANG = "english"

# Build the summarization pipeline once at import time; LSA() reuses these
# module-level objects on every call.
stemmer = Stemmer(LANG)
tokenizer = Tokenizer(LANG)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANG)
def LSA(text, sentence_count=10):
    """Summarize *text* with the module-level LSA summarizer.

    Args:
        text: Plain-text document to summarize.
        sentence_count: Number of sentences requested from the summarizer.
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        The sentences chosen by the summarizer, each converted with ``str``
        and joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, tokenizer)
    # Feed the sentences straight into join; no intermediate list is needed.
    return ' '.join(
        str(sentence)
        for sentence in summarizer(parser.document, sentence_count)
    )
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
# Stop words for LANG, held in a set for constant-time membership tests.
# NLTK's English stop-word list is lowercase.
sw = set(stopwords.words(LANG))


def process(text):
    """Return the keyword tokens of the LSA summary of *text*.

    The text is summarized with ``LSA``, the summary is split with
    ``word_tokenize``, and every token that is a stop word is dropped.

    Args:
        text: Plain-text document to extract keywords from.

    Returns:
        A list of tokens in summary order. Tokens keep their original case;
        nothing other than stop words is removed.
    """
    summarized = LSA(text)
    tokenized = word_tokenize(summarized)
    # Compare in lowercase: ``sw`` holds lowercase entries, so an exact match
    # would let capitalized stop words ("The", "And", "It") through.
    return [word for word in tokenized if word.lower() not in sw]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage in plagiarism checker: