Steboss89 · September 18, 2022 12:12
diff --git a/clean_tweet2.py b/clean_tweet2.py
 # Load the two CSV files from the split-data folder.
 tweets_df = pd.read_csv("split-data/X_train.csv")
 target_df = pd.read_csv("split-data/y_train.csv")

 # PREPROCESS
 # Columns that are not used by the cleaning steps below.
 unused_columns = ['ids', 'date', 'flag']
 tweets_df.drop(columns=unused_columns, inplace=True)

 # Cleaning starts from a lower-cased copy of the raw 'text' column.
 lowered_text = tweets_df['text'].str.lower()
 tweets_df.loc[:, 'lower_text'] = lowered_text

 # Each step reads one column, runs a helper on every row and stores the
 # result in a new column, so every intermediate stage stays available.
 # Order matters: each step consumes the output of the previous one.
 cleaning_steps = (
     ('lower_text', 'clean_text1', remove_stopwords),
     ('clean_text1', 'clean_text2', remove_specific_chars),
     ('clean_text2', 'clean_text3', remove_punctuation),
 )
 for source_col, result_col, cleaner in cleaning_steps:
     tweets_df.loc[:, result_col] = tweets_df[source_col].apply(cleaner)
	# Load the two CSV files from the split-data folder.
	tweets_df = pd.read_csv("split-data/X_train.csv")
	target_df = pd.read_csv("split-data/y_train.csv")

	# PREPROCESS
	# Columns that are not used by the cleaning steps below.
	unused_columns = ['ids', 'date', 'flag']
	tweets_df.drop(columns=unused_columns, inplace=True)

	# Cleaning starts from a lower-cased copy of the raw 'text' column.
	lowered_text = tweets_df['text'].str.lower()
	tweets_df.loc[:, 'lower_text'] = lowered_text

	# Each step reads one column, runs a helper on every row and stores the
	# result in a new column, so every intermediate stage stays available.
	# Order matters: each step consumes the output of the previous one.
	cleaning_steps = (
		('lower_text', 'clean_text1', remove_stopwords),
		('clean_text1', 'clean_text2', remove_specific_chars),
		('clean_text2', 'clean_text3', remove_punctuation),
	)
	for source_col, result_col, cleaner in cleaning_steps:
		tweets_df.loc[:, result_col] = tweets_df[source_col].apply(cleaner)