Skip to content

Instantly share code, notes, and snippets.

@olivx
Created January 15, 2019 18:52
Show Gist options
  • Save olivx/5c05e50928a0f58cd40dd46f00ee20c9 to your computer and use it in GitHub Desktop.
Save olivx/5c05e50928a0f58cd40dd46f00ee20c9 to your computer and use it in GitHub Desktop.
Simple keyword extraction with NLTK
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources needed by stopwords.words() and word_tokenize().
# NOTE(review): recent NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Punctuation tokens excluded from the keywords.
# NOTE(review): sentence-ending marks such as '.', '!' and '?' are not listed,
# so they pass through if the tokenizer emits them as tokens -- confirm intent.
punctuations = ['(', ')', ';', ':', '[', ']', ',']
stop_words = stopwords.words('portuguese')

# Unique lower-cased tokens of the input text. `text_from_pdf` must already be
# bound to a string before this point.
tokens = set(word_tokenize(text_from_pdf.lower(), language='portuguese'))

# One combined set gives O(1) membership tests instead of scanning two lists
# for every token; `stop_words` and `punctuations` keep their original types.
_excluded = set(stop_words).union(punctuations)


def _to_ascii(word):
    """Return *word* with diacritics stripped and other non-ASCII dropped."""
    # NFKD splits an accented letter into base letter + combining mark, so the
    # ASCII encode drops only the mark ('ação' -> 'acao') instead of the whole
    # letter ('ação' -> 'ao').
    decomposed = unicodedata.normalize('NFKD', word)
    return decomposed.encode('ascii', 'ignore').decode('ascii')


# Filter on the original token (before folding), then fold to ASCII.
# Iterating in sorted order makes the result reproducible: the iteration order
# of a set of strings can differ between interpreter runs.
_folded = (
    _to_ascii(word) for word in sorted(tokens) if word not in _excluded
)
# Tokens made only of non-ASCII symbols fold to '' and are skipped so that
# `keywords` contains no empty entries or doubled spaces.
token_keywords = [word for word in _folded if word]
keywords = ' '.join(token_keywords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment