Created
May 5, 2011 05:49
-
-
Save Ricket/956598 to your computer and use it in GitHub Desktop.
MA 493 Assignment 7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Start from an empty workspace. clearvars removes variables only; unlike
% "clear all" it does not also flush compiled functions and breakpoints.
clearvars;
% Load the contents of Examtraining.mat into the workspace.
% NOTE(review): this data was previously described as the "MLtraining"
% matrix; confirm the variable name actually stored in the MAT-file.
load('Examtraining.mat');
% Load movies.dat.
% Preprocessing required: replace every "::" in movies.dat with "@".
tic;
[movies_dat, open_msg] = fopen('movies.dat');
if movies_dat == -1
    % fopen signals failure by returning -1 rather than raising an error;
    % stop here instead of letting textscan fail on an invalid identifier.
    error('Could not open movies.dat: %s', open_msg);
end
% movies: {id name genres}
movies = textscan(movies_dat, '%d %s %s', 'delimiter', '@');
[m_ids, m_names, m_genres] = movies{:};
m_num = length(m_ids);
disp(['Loaded movies.dat in ',num2str(toc),' sec']);
% The file handle is left open here; it is closed once all data files
% have been loaded.
% Load ratings.dat.
% Preprocessing required: replace every "::" in ratings.dat with "@"
% (it will take a while!).
tic;
[ratings_dat, open_msg] = fopen('ratings.dat');
if ratings_dat == -1
    % fopen signals failure by returning -1 rather than raising an error.
    % Release the movies.dat handle opened earlier in the script so an
    % aborted run does not leave it open.
    fclose(movies_dat);
    error('Could not open ratings.dat: %s', open_msg);
end
% ratings: {userid movieid rating timestamp}
ratings = textscan(ratings_dat, '%d %d %d %d', 'delimiter', '@');
[r_userids, r_movieids, r_ratings, r_timestamps] = ratings{:};
r_num = length(r_userids);
disp(['Loaded ratings.dat in ',num2str(toc),' sec']);
% The file handle is left open here; it is closed once all data files
% have been loaded.
% Load users.dat.
% Preprocessing required: replace every "::" in users.dat with "@", then
% replace every "-" with nothing.
tic;
[users_dat, open_msg] = fopen('users.dat');
if users_dat == -1
    % fopen signals failure by returning -1 rather than raising an error.
    % Release the two handles opened earlier in the script before aborting.
    fclose(movies_dat);
    fclose(ratings_dat);
    error('Could not open users.dat: %s', open_msg);
end
% users: {id gender age occupation zipcode}
users = textscan(users_dat, '%d %c %d %d %s', 'delimiter', '@');
[u_ids, u_genders, u_ages, u_occupations, u_zipcodes] = users{:};
u_num = length(u_ids);
disp(['Loaded users.dat in ',num2str(toc),' sec']);
% Close exactly the three data files this script opened. fclose('all')
% would also close any unrelated file the session happens to have open.
fclose(movies_dat);
fclose(ratings_dat);
fclose(users_dat);
% Now begin the training process
% Build a cell array of all genres
% Also simultaneously generate the movies-genres matrix:
% mat_movies_genres(m, g) is 1 when movie row m lists genre genres{g}.
tic;
genres = {};
% Preallocate one row per movie so the matrix is not reallocated on every
% iteration; columns are still appended as new genres are discovered.
% max(..., 1) keeps the 1x1 starting size when there are no movies.
mat_movies_genres = zeros(max(m_num, 1), 1);
for movidx = 1:m_num
    % Split the genres of this movie into a cell array of individual genres
    this_genres = regexp(m_genres{movidx}, '[|]', 'split');
    this_genres_num = length(this_genres);
    for m_genre_idx = 1:this_genres_num
        genre_name = this_genres{m_genre_idx};
        % Single lookup: index of this genre in the list, or empty if the
        % genre has not been seen yet.
        genre_idx = find(strcmp(genres, genre_name), 1);
        if isempty(genre_idx)
            % Genre hadn't yet been found; add it to genres
            genres{end+1} = genre_name;
            genre_idx = length(genres);
        end
        mat_movies_genres(movidx, genre_idx) = 1;
    end
end
genres_num = length(genres);
disp(['Generated movies-genres matrix in ',num2str(toc),' sec']);
% Build the genres-[user attributes] matrices
tic;
% Each matrix has one row per genre. The third dimension is used to store
% sum, num respectively; then sum/num = avg which is what we want.
% (ignore zipcodes, it's mostly irrelevant)
mat_genres_genders = zeros(genres_num, 2, 2);
% NOTE(review): the second dimension is 1 for ages and occupations,
% presumably a placeholder until the number of distinct values is known;
% confirm when the loop below is written.
mat_genres_ages = zeros(genres_num, 1, 2);
mat_genres_occupations = zeros(genres_num, 1, 2);
% TODO: the loop
disp(['Generated genres-userattribs matrices in ',num2str(toc),' sec']);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment