Skip to content

Instantly share code, notes, and snippets.

@manashmandal
Last active October 11, 2016 08:26
Show Gist options
  • Save manashmandal/5de04693599fe0d267a196c6788de5e4 to your computer and use it in GitHub Desktop.
Save manashmandal/5de04693599fe0d267a196c6788de5e4 to your computer and use it in GitHub Desktop.
import pandas as pd
import os
import numpy as np
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
# Directories of raw review files; every file in them is loaded as one review.
positive_text_path = "./txt_sentoken/pos/"
negative_text_path = "./txt_sentoken/neg/"

# Tab-delimited files that already carry a sentiment label per row.
yelp_labelled = "yelp_labelled.txt"
imdb_labelled = "imdb_labelled.txt"
amazon_cells_labelled = "amazon_cells_labelled.txt"
review_training_dataset = "training.txt"

# Column labels shared by every frame that get_data_frame() builds.
text_sentiment_columns = ["Text", "Sentiment"]
def _load_review_dir(directory, label):
    """Read every file in *directory* as one review tagged with *label*.

    Returns a DataFrame whose columns are named by ``text_sentiment_columns``:
    the full file contents and the constant sentiment *label*.
    """
    text_column, sentiment_column = text_sentiment_columns
    reviews = []
    # Sorted so the row order does not depend on the order os.listdir returns.
    for file_name in sorted(os.listdir(directory)):
        # os.path.join copes with directories given with or without a
        # trailing separator.
        with open(os.path.join(directory, file_name), "r") as review_file:
            reviews.append(review_file.read())
    # Naming the text column up front keeps an empty directory from failing
    # with a column-count mismatch.
    frame = pd.DataFrame(reviews, columns=[text_column])
    frame[sentiment_column] = label
    return frame


def _load_labelled_tsv(path, columns):
    """Load a tab-delimited file and label its columns with *columns*."""
    # header=None: the column labels are assigned here, so the first line is
    # kept as a data row instead of being consumed as a header.
    # NOTE(review): assumes these files have no header row - confirm against
    # the data files.
    frame = pd.read_csv(path, delimiter="\t", header=None)
    # Assigning (rather than passing names=) still raises if the file does not
    # have exactly len(columns) fields.
    frame.columns = columns
    return frame


def get_data_frame(
    positive_file_path=positive_text_path,
    negative_file_path=negative_text_path,
):
    """Build one DataFrame of labelled review texts from all data sources.

    Args:
        positive_file_path: Directory whose files are loaded as reviews with
            sentiment 1.
        negative_file_path: Directory whose files are loaded as reviews with
            sentiment 0.

    Returns:
        The concatenation of the two directory frames and the four
        tab-delimited files (imdb, amazon, training, yelp), with the columns
        named by ``text_sentiment_columns``.
    """
    # Positive sentiment is encoded as 1 and negative as 0. The directories
    # come from the arguments, not the module-level defaults.
    pos_dataframe = _load_review_dir(positive_file_path, 1)
    neg_dataframe = _load_review_dir(negative_file_path, 0)

    # Files laid out as text, then sentiment.
    yelp_df = _load_labelled_tsv(yelp_labelled, text_sentiment_columns)
    imdb_df = _load_labelled_tsv(imdb_labelled, text_sentiment_columns)
    amazon_df = _load_labelled_tsv(amazon_cells_labelled, text_sentiment_columns)

    # The training file is laid out the other way round (sentiment, then
    # text), so name its columns in file order and then reorder them to match
    # the other frames.
    imdb_training_df = _load_labelled_tsv(
        review_training_dataset, text_sentiment_columns[::-1]
    )
    imdb_training_df = imdb_training_df[text_sentiment_columns]

    # NOTE: each source keeps its own 0..n-1 index, so index labels repeat in
    # the result; callers that need unique labels should reset the index.
    frames = [pos_dataframe, neg_dataframe, imdb_df, amazon_df, imdb_training_df, yelp_df]
    return pd.concat(frames)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment