Last active
October 11, 2016 08:26
-
-
Save manashmandal/5de04693599fe0d267a196c6788de5e4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import numpy as np | |
import random | |
from sklearn.utils import shuffle | |
import matplotlib.pyplot as plt | |
# Directories of the polarity corpus: each file inside holds one review, and
# the directory it lives in determines the sentiment label.
positive_text_path = './txt_sentoken/pos/'
negative_text_path = './txt_sentoken/neg/'

# Tab-separated labelled files, resolved relative to the working directory.
yelp_labelled = 'yelp_labelled.txt'
imdb_labelled = 'imdb_labelled.txt'
amazon_cells_labelled = 'amazon_cells_labelled.txt'
# Unlike the three files above, this one is loaded with its two columns
# swapped (see get_data_frame).
review_training_dataset = 'training.txt'

# Column labels shared by every DataFrame built in this module.
text_sentiment_columns = ['Text', 'Sentiment']
def _read_review_directory(directory, sentiment):
    """Return a Text/Sentiment frame with one row per file in *directory*."""
    text_column, sentiment_column = text_sentiment_columns
    texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and platforms.
    for file_name in sorted(os.listdir(directory)):
        # os.path.join also works when *directory* has no trailing slash.
        with open(os.path.join(directory, file_name), 'r') as review_file:
            texts.append(review_file.read())
    # Building both columns explicitly keeps an empty directory from failing
    # on a column-count mismatch; it simply yields an empty frame.
    return pd.DataFrame(
        {text_column: texts, sentiment_column: [sentiment] * len(texts)},
        columns=text_sentiment_columns,
    )


def _read_labelled_file(path, columns):
    """Read a two-column tab-separated file and label its columns."""
    # header=None: the file is assumed to have no header row (TODO confirm
    # against the data files). Without it pandas would consume the first
    # record as column names, and relabelling the columns would then
    # silently discard that record.
    frame = pd.read_csv(path, delimiter='\t', header=None)
    frame.columns = columns
    return frame


def get_data_frame(positive_file_path=positive_text_path, negative_file_path=negative_text_path):
    """Load every review source and combine them into one DataFrame.

    Sources, in the order they appear in the result: the positive and
    negative review directories, then the IMDB, Amazon, training and Yelp
    tab-separated files named by the module-level constants.

    Args:
        positive_file_path: Directory whose files are each one positive
            review (labelled 1).
        negative_file_path: Directory whose files are each one negative
            review (labelled 0).

    Returns:
        A DataFrame with the columns 'Text' and 'Sentiment'. The frames are
        concatenated without resetting the index, so index labels repeat
        across sources.

    Raises:
        OSError: If a directory or data file cannot be read.
        ValueError: If a labelled file does not have exactly two columns.
    """
    # The directory arguments are honoured here; previously the module-level
    # defaults were always read regardless of what the caller passed.
    pos_dataframe = _read_review_directory(positive_file_path, 1)
    neg_dataframe = _read_review_directory(negative_file_path, 0)

    yelp_df = _read_labelled_file(yelp_labelled, text_sentiment_columns)
    imdb_df = _read_labelled_file(imdb_labelled, text_sentiment_columns)
    amazon_df = _read_labelled_file(amazon_cells_labelled, text_sentiment_columns)

    # The training file is loaded with its columns in the opposite order
    # (first column -> 'Sentiment', second -> 'Text'), so label them
    # reversed and then reorder to the shared Text/Sentiment layout.
    imdb_training_df = _read_labelled_file(
        review_training_dataset, list(reversed(text_sentiment_columns))
    )
    imdb_training_df = imdb_training_df[text_sentiment_columns]

    # Merging the dataframes into one
    total_dataframe = [pos_dataframe, neg_dataframe, imdb_df, amazon_df, imdb_training_df, yelp_df]
    return pd.concat(total_dataframe)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment