Skip to content

Instantly share code, notes, and snippets.

@jaidevd
Created July 24, 2014 16:47
Show Gist options
  • Save jaidevd/164a5d410da51afcefba to your computer and use it in GitHub Desktop.
Save jaidevd/164a5d410da51afcefba to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
# Disabled: the attendance records are not loaded at the moment.
#attendance_data = pd.read_csv('AttendanceData.csv',index_col=2,
# parse_dates=True)
#attendance_data.columns = ['event_id', 'user_id']


def _read_coordinates(csv_path):
    """Load a CSV of coordinates, indexed by its first column.

    The columns are renamed to ``latitude`` and ``longitude`` and every row
    that contains a missing value is discarded.
    """
    # skipfooter=1 used to be passed here; it is currently disabled.
    table = pd.read_csv(csv_path, index_col=0)
    table.columns = ['latitude', 'longitude']
    return table.dropna()


# Coordinates of events and of users, each keyed by the first CSV column.
event_data = _read_coordinates('EventData.csv')
user_data = _read_coordinates('UserData.csv')

# Ratings used for training; incomplete rows are discarded.
# (skipfooter=1 is disabled for this read as well.)
training_data = pd.read_csv('ratings_for_train.csv')
training_data.columns = ['user_id', 'event_id', 'rating']
training_data = training_data.dropna()

# Disabled: the evaluation ratings are not loaded at the moment.
#testing_data = pd.read_csv('ratings_for_eval.csv',usecols=['User ID', 'Event ID'])
#testing_data.columns = ['user_id','event_id']
def clean_missing(training_data, user_data, event_data):
    """
    Drop training examples for users and events whose data isn't available.

    A training row is removed when its ``user_id`` is not a label of
    ``user_data.index`` or its ``event_id`` is not a label of
    ``event_data.index``.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must have ``user_id`` and ``event_id`` columns. Modified in place.
    user_data : pandas.DataFrame
        Table whose index holds the known user IDs. Not modified.
    event_data : pandas.DataFrame
        Table whose index holds the known event IDs. Not modified.

    Returns
    -------
    tuple
        ``(training_data, user_data, event_data)`` -- the same three objects
        that were passed in, with ``training_data`` already filtered.

    Notes
    -----
    Membership is tested with ``Series.isin``, so a row whose ID is NaN is
    dropped unless the corresponding index itself contains NaN.
    """
    # Vectorized membership tests replace one full scan (and one drop) of the
    # training table per missing ID.
    unknown_user = ~training_data['user_id'].isin(user_data.index)
    unknown_event = ~training_data['event_id'].isin(event_data.index)

    # Drop by label and in place, so callers holding a reference to the
    # original frame see the filtered result.
    stale_labels = training_data.index[unknown_user | unknown_event]
    training_data.drop(stale_labels, inplace=True)

    return training_data, user_data, event_data
#def fill_coordinates(training_data, user_data, event_data):
# """
# Add latitude and longitude columns to the training data containing the
# coordinates for each row.
# """
# training_data['ulat'] = np.zeros((training_data.shape[0],))
# training_data['ulong'] = np.zeros((training_data.shape[0],))
# training_data['elat'] = np.zeros((training_data.shape[0],))
# training_data['elong'] = np.zeros((training_data.shape[0],))
# event_ids = event_data.index.unique()
# user_ids = user_data.index.unique()
# for uid in user_ids:
# ulat, ulong = user_data.ix[uid]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment