Skip to content

Instantly share code, notes, and snippets.

@Abhayparashar31
Created September 11, 2022 20:58
Show Gist options
  • Save Abhayparashar31/b279833988ae56fba04ea7eeefe8827d to your computer and use it in GitHub Desktop.
Save Abhayparashar31/b279833988ae56fba04ea7eeefe8827d to your computer and use it in GitHub Desktop.
import re
import nltk
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer instance; clean_text() uses it to lemmatize tokens.
lemmatizer = WordNetLemmatizer()
def clean_text(lst):
    """Normalize every item of *lst* into a cleaned, space-joined string.

    Each item is converted with ``str()``, lowercased, stripped of every
    character that is neither a word character nor a literal space, then
    tokenized. Tokens that are English stopwords or have three characters
    or fewer are dropped; the remaining tokens are lemmatized as verbs
    (``pos='v'``) and joined back together with single spaces.

    Args:
        lst: Iterable of texts (any object; ``str()`` is applied to each).

    Returns:
        A list of cleaned strings, one per input item, in input order.

    NOTE(review): presumably requires the NLTK stopwords, tokenizer and
    WordNet resources to be downloaded beforehand -- confirm.
    """
    # Build the stopword set once: the original rebuilt it for every token,
    # which made each membership test cost a full pass over the stopword list.
    stop_words = set(stopwords.words("english"))

    # Anything that is not a word character or a literal space is removed.
    # NOTE: this also deletes tabs/newlines, so words separated only by those
    # characters are merged into one token.
    non_word = re.compile(r'[^\w ]+')

    cleaned_text = []
    for text in lst:
        text = non_word.sub("", str(text).lower())
        kept = [
            lemmatizer.lemmatize(word, pos='v')
            for word in word_tokenize(text)
            if len(word) > 3 and word not in stop_words
        ]
        cleaned_text.append(" ".join(kept))
    return cleaned_text
def make_biagram(data, tokens):
    """Train a bigram phrase model on *data* and apply it to each doc in *tokens*.

    Args:
        data: Sentences passed to ``gensim.models.Phrases`` for training.
        tokens: Iterable of documents to transform with the trained model.

    Returns:
        A list holding the phraser's output for every document in *tokens*,
        in the same order.
    """
    # Raising the threshold yields fewer detected phrases.
    phrase_model = gensim.models.Phrases(data, min_count=5, threshold=100)
    phraser = gensim.models.phrases.Phraser(phrase_model)

    merged_docs = []
    for document in tokens:
        merged_docs.append(phraser[document])
    return merged_docs
# Run the cleaning pipeline and keep the resulting list of cleaned strings.
# NOTE(review): `headlines` is not defined in the visible part of this file;
# it must be an iterable of texts defined before this line -- confirm.
cleaned_reviews = clean_text(headlines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment