Created
September 11, 2022 20:58
-
-
Save Abhayparashar31/b279833988ae56fba04ea7eeefe8827d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
import gensim | |
from gensim.models.ldamulticore import LdaMulticore | |
from gensim import corpora, models | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
# Module-level WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()
def clean_text(lst):
    """Normalize a collection of raw texts into cleaned, lemmatized strings.

    Each item is converted with ``str()`` and lower-cased, every run of
    characters that are neither word characters nor the space character is
    removed, and the result is tokenized with ``word_tokenize``. Tokens that
    are English stopwords or have three or fewer characters are dropped; the
    remaining tokens are lemmatized as verbs (``pos='v'``) and joined back
    together with single spaces.

    Args:
        lst: Iterable of items to clean; each one is passed through ``str()``.

    Returns:
        A list of cleaned strings, one per input item, in input order.
    """
    # Build the stopword set once; it was previously rebuilt for every token,
    # although it never changes during the call.
    stopword = set(stopwords.words("english"))
    # Strips punctuation and any other character that is not \w or a space.
    non_word = re.compile(r'[^\w ]+')

    cleaned_text = []
    for text in lst:
        text = non_word.sub("", str(text).lower())
        kept = [
            lemmatizer.lemmatize(word, pos='v')
            for word in word_tokenize(text)
            if word not in stopword and len(word) > 3
        ]
        cleaned_text.append(" ".join(kept))
    return cleaned_text
def make_biagram(data, tokens, *, min_count=5, threshold=100):
    """Fit a bigram phrase model on ``data`` and apply it to ``tokens``.

    A ``gensim.models.Phrases`` model is trained on ``data``, wrapped in a
    ``gensim.models.phrases.Phraser``, and then applied to each document in
    ``tokens``.

    Args:
        data: Corpus the phrase model is trained on (presumably an iterable
            of token lists -- confirm against the caller).
        tokens: Iterable of documents to transform with the fitted model.
        min_count: Forwarded to ``Phrases`` as ``min_count``. Defaults to 5,
            the value that was previously hard-coded.
        threshold: Forwarded to ``Phrases`` as ``threshold``; a higher
            threshold yields fewer phrases. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        A list holding the transformed version of each document in ``tokens``,
        in order.
    """
    bigram = gensim.models.Phrases(data, min_count=min_count, threshold=threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in tokens]
# Clean the raw headlines. NOTE(review): `headlines` is not defined in this
# snippet and must be provided by the surrounding code -- confirm its source.
cleaned_reviews = clean_text(headlines)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment