Last active
August 29, 2015 14:22
-
-
Save jamesthomson/8def5d5458c5d158afd0 to your computer and use it in GitHub Desktop.
Use pandas to manipulate Last.fm listening data into the format I need, ready for modelling with sklearn.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prepare Last.fm listening data for modelling with sklearn.
#
# Reads a plays table and a user-profile table, one-hot encodes gender,
# keeps only the 20 artists that appear in the most rows of the plays table,
# pivots those plays into one column per artist, and produces:
#   Y -- the 'm' gender indicator per user (0/1)
#   X -- the per-artist play counts per user (0 where the user has no plays)
import pandas as pd

# Neither file is read with a header row (column names are supplied here).
# Column 1 of the plays file is skipped -- presumably the artist MBID, going
# by the file name.
plays = pd.read_table(
    "usersha1-artmbid-artname-plays-sample.tsv",
    usecols=[0, 2, 3],
    names=['user', 'artist', 'plays'],
)
users = pd.read_table(
    "usersha1-profile-sample.tsv",
    usecols=[0, 1],
    names=['user', 'gender'],
)

# Drop users with a missing value (e.g. no gender recorded).
users = users.dropna()

# Dummy-code gender: one indicator column per distinct gender value.
# astype(int) keeps the indicators numeric 0/1; recent pandas versions would
# otherwise return boolean columns.
# (An equivalent alternative is users['gender'].map({'m': 1, 'f': 0}).)
genders = pd.get_dummies(users['gender']).astype(int)
users = users.join(genders)

# Find the top 20 artists, ranked by number of rows in the plays table
# (not by total play count).
# Series.order() no longer exists in pandas; sort_values() replaces it.
top_artists = (
    plays.groupby('artist').size().sort_values(ascending=False).iloc[:20]
)

# Reduce the plays data down to the top artists.
top_plays = plays[plays['artist'].isin(top_artists.index)]

# Pivot to wide format (one row per user, one column per artist) and fill
# the user/artist combinations that have no row with 0.
# Keyword arguments are required by pandas >= 2.0 and accepted by older ones.
top_plays_t = top_plays.pivot(
    index='user', columns='artist', values='plays'
).fillna(0)

# Join to the user information. The left join keeps every user; users with
# no plays for any top artist get 0 in all artist columns.
to_model = pd.merge(
    users, top_plays_t, left_on='user', right_index=True, how='left'
).fillna(0)

# Ready for sklearn.
Y = to_model['m'].values
# The artist columns follow the columns contributed by `users`. Deriving the
# offset from `users` (instead of hard-coding 4) keeps gender dummies out of
# X even if the gender column holds more than two distinct values.
X = to_model[to_model.columns[users.shape[1]:]].values
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment