rohithteja · August 23, 2021 13:02
diff --git a/sentiment-preprocessing.py b/sentiment-preprocessing.py
 import pandas as pd
 import numpy as np
 import re
 import string
 import unicodedata
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import RegexpTokenizer


 # Read the training data; its `text` column is the one cleaned further down.
 df = pd.read_csv('data/train.csv')

 # Make sure the NLTK 'stopwords' corpus is installed: clean_data reads the
 # English stop-word list from it.
 nltk.download('stopwords')

 # Liberal URL matcher: the match starts at "http://" / "https://", a
 # "www." style prefix, or a bare "domain.tld/", continues over the rest of
 # the URL (balanced parentheses allowed), and must not end on trailing
 # punctuation such as "." or ",".
 # The pattern is assembled from adjacent raw strings so that the line breaks
 # used for source layout never become part of the regular expression.
 url = (
     r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'''
     r'''(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'''
     r'''(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
 )

 # Word tokenizer: every maximal run of \w characters is one token, so
 # whitespace and punctuation only act as separators.
 tokenizer = RegexpTokenizer(r'\w+')

 def clean_data(temp):
     """Clean raw text values and return them as space-joined tokens.

     Each value is converted with ``str()`` and lower-cased, then, in order:

     1. e-mail addresses and URLs are deleted;
     2. every run of whitespace becomes a single space;
     3. accented and compatibility characters are folded to ASCII (NFKD);
     4. every character that is not a letter or whitespace is deleted
        without inserting a space, so "don't" becomes "dont" and
        "abc123" becomes "abc";
     5. the text is tokenised and English stop words are dropped.

     Relies on the module-level ``url`` pattern, ``tokenizer`` and the NLTK
     ``stopwords`` corpus.

     Args:
         temp: object exposing ``.map(func)``, e.g. a pandas Series of texts.

     Returns:
         The result of ``temp.map``: one cleaned string per input value.

     Note:
         Values are stringified as they are, so a missing value (NaN) is
         cleaned as the literal text "nan".
     """
     # stopwords.words() returns a list; build a set once so each token
     # lookup is O(1) instead of rebuilding and scanning the list per token.
     stop_words = frozenset(stopwords.words('english'))

     def _clean_text(value):
         # Clean a single value; see the steps listed in clean_data.
         text = str(value).lower()
         text = re.sub(r"\b[^\s]+@[^\s]+[.][^\s]+\b", "", text)  # e-mail
         text = re.sub(url, "", text)  # URL
         # Collapse whitespace before the ASCII fold so that Unicode
         # separators survive as ordinary spaces instead of being dropped
         # (which would glue neighbouring words together).
         text = re.sub(r'\s+', ' ', text).strip()
         # Fold accents before filtering, otherwise "café" would lose its
         # last letter instead of becoming "cafe". NFKD can emit capitals
         # (e.g. the trademark sign becomes "TM"), hence the second lower().
         text = unicodedata.normalize('NFKD', text)
         text = text.encode('ascii', 'ignore').decode('ascii').lower()
         # Digits, punctuation and symbols are removed, not replaced.
         text = re.sub(r'[^a-z\s]', '', text)
         tokens = tokenizer.tokenize(text)
         return ' '.join(tok for tok in tokens if tok not in stop_words)

     # One pass over the data instead of one pass per cleaning step.
     return temp.map(_clean_text)

 # Replace the raw text column with its cleaned form.
 df.text = clean_data(df.text)
	import pandas as pd
	import numpy as np
	import re
	import string
	import unicodedata
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import RegexpTokenizer


	# Read the training data; its `text` column is the one cleaned further down.
	df = pd.read_csv('data/train.csv')

	# Make sure the NLTK 'stopwords' corpus is installed: clean_data reads the
	# English stop-word list from it.
	nltk.download('stopwords')

	# Liberal URL matcher: the match starts at "http://" / "https://", a
	# "www." style prefix, or a bare "domain.tld/", continues over the rest of
	# the URL (balanced parentheses allowed), and must not end on trailing
	# punctuation such as "." or ",".
	# The pattern is assembled from adjacent raw strings so that the line breaks
	# used for source layout never become part of the regular expression.
	url = (
	    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'''
	    r'''(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'''
	    r'''(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
	)

	# Word tokenizer: every maximal run of \w characters is one token, so
	# whitespace and punctuation only act as separators.
	tokenizer = RegexpTokenizer(r'\w+')

	def clean_data(temp):
	    """Clean raw text values and return them as space-joined tokens.

	    Each value is converted with ``str()`` and lower-cased, then, in order:

	    1. e-mail addresses and URLs are deleted;
	    2. every run of whitespace becomes a single space;
	    3. accented and compatibility characters are folded to ASCII (NFKD);
	    4. every character that is not a letter or whitespace is deleted
	       without inserting a space, so "don't" becomes "dont" and
	       "abc123" becomes "abc";
	    5. the text is tokenised and English stop words are dropped.

	    Relies on the module-level ``url`` pattern, ``tokenizer`` and the NLTK
	    ``stopwords`` corpus.

	    Args:
	        temp: object exposing ``.map(func)``, e.g. a pandas Series of texts.

	    Returns:
	        The result of ``temp.map``: one cleaned string per input value.

	    Note:
	        Values are stringified as they are, so a missing value (NaN) is
	        cleaned as the literal text "nan".
	    """
	    # stopwords.words() returns a list; build a set once so each token
	    # lookup is O(1) instead of rebuilding and scanning the list per token.
	    stop_words = frozenset(stopwords.words('english'))

	    def _clean_text(value):
	        # Clean a single value; see the steps listed in clean_data.
	        text = str(value).lower()
	        text = re.sub(r"\b[^\s]+@[^\s]+[.][^\s]+\b", "", text)  # e-mail
	        text = re.sub(url, "", text)  # URL
	        # Collapse whitespace before the ASCII fold so that Unicode
	        # separators survive as ordinary spaces instead of being dropped
	        # (which would glue neighbouring words together).
	        text = re.sub(r'\s+', ' ', text).strip()
	        # Fold accents before filtering, otherwise "café" would lose its
	        # last letter instead of becoming "cafe". NFKD can emit capitals
	        # (e.g. the trademark sign becomes "TM"), hence the second lower().
	        text = unicodedata.normalize('NFKD', text)
	        text = text.encode('ascii', 'ignore').decode('ascii').lower()
	        # Digits, punctuation and symbols are removed, not replaced.
	        text = re.sub(r'[^a-z\s]', '', text)
	        tokens = tokenizer.tokenize(text)
	        return ' '.join(tok for tok in tokens if tok not in stop_words)

	    # One pass over the data instead of one pass per cleaning step.
	    return temp.map(_clean_text)

	# Replace the raw text column with its cleaned form.
	df.text = clean_data(df.text)
No results found