Skip to content

Instantly share code, notes, and snippets.

@rohithteja
Last active August 23, 2021 13:02
Show Gist options
  • Select an option

  • Save rohithteja/8c13680a539d9e07a63828779980a85a to your computer and use it in GitHub Desktop.

Select an option

Save rohithteja/8c13680a539d9e07a63828779980a85a to your computer and use it in GitHub Desktop.
Sentiment Analysis Preprocessing
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Load the training set from data/train.csv (path is relative to the working directory).
df = pd.read_csv('data/train.csv')
# Download the NLTK 'stopwords' corpus that clean_data queries via stopwords.words('english').
nltk.download('stopwords')
# Pattern used to strip URLs from text: matches an optional http(s):// scheme,
# a www. prefix, or a bare "domain.tld/" start, followed by a run of
# non-space characters with balanced parentheses, and ending on a character
# that is not trailing punctuation.
#
# The pattern is assembled from adjacent raw-string literals so that it can be
# wrapped across source lines WITHOUT embedding newline characters. Written as
# a single multi-line triple-quoted string (and compiled without re.VERBOSE),
# the line breaks become literal newlines inside the regex: one would be
# required right after the scheme, and another would land inside a
# "[^\s()<>]" class and flip it from negated to positive, so ordinary URLs
# would never match.
url = (
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'''
    r'''(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'''
    r'''(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
)
# Tokenizer built from the pattern \w+ (runs of word characters); clean_data uses it to split text into tokens.
tokenizer = RegexpTokenizer(r'\w+')
def clean_data(temp):
    """Clean a pandas Series of raw text for sentiment analysis.

    Each entry is converted with ``str()`` and then, in order: lower-cased,
    stripped of e-mail addresses and URLs, folded to ASCII, stripped of
    digits and other non-letter symbols, whitespace-collapsed, stripped of
    punctuation, tokenized, and filtered against the NLTK English stopword
    list. The surviving tokens are re-joined with single spaces.

    Relies on the module-level ``url`` pattern and ``tokenizer``, and on the
    NLTK ``stopwords`` corpus being available.

    Args:
        temp: pandas Series holding the text to clean.

    Returns:
        A new Series (same index) of cleaned, space-separated token strings.
    """
    # Fetch the stopword list once and keep it as a set: looking the list up
    # again for every token is loop-invariant work, and a set gives O(1)
    # membership tests instead of a linear scan.
    stop_words = set(stopwords.words('english'))
    # Translation table that deletes every ASCII punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(text):
        text = str(text).lower()
        # E-mail addresses and URLs are removed first, while their
        # punctuation is still intact and they are recognizable.
        text = re.sub(r"\b[^\s]+@[^\s]+[.][^\s]+\b", "", text)
        text = re.sub(url, "", text)
        # Fold accented/compatibility characters to ASCII BEFORE the
        # non-letter filter below; done afterwards it has nothing left to
        # fold, and e.g. "café" would lose its last letter instead of
        # becoming "cafe". Lower-case again because a compatibility
        # decomposition can surface upper-case ASCII letters.
        text = unicodedata.normalize('NFKD', text)
        text = text.encode('ascii', 'ignore').decode('utf-8', 'ignore').lower()
        # Drop digits and every symbol that is not a letter, basic
        # punctuation or whitespace (range is A-Z; "A-z" would also admit
        # the characters [ \ ] ^ _ ` that sit between the two cases).
        text = re.sub(r'[^a-zA-Z.,!?/:;\"\'\s]', "", text)
        # Collapse whitespace runs to one space and trim both ends.
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove the remaining punctuation; only letters and spaces are left.
        text = text.translate(drop_punctuation)
        tokens = tokenizer.tokenize(text)
        return ' '.join(tok for tok in tokens if tok not in stop_words)

    return temp.map(_clean)
# Replace the raw `text` column with its cleaned form.
df.text = clean_data(df.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment