Ricket · May 5, 2011 05:49
diff --git a/MA493_A7.m b/MA493_A7.m
 % Data loading: read the training matrix plus the movies, ratings and users
 % tables into workspace variables (one column variable per field).
 % Each .dat file is parsed with '@' as the field delimiter, so the '::'
 % separators in the files must be rewritten to '@' before running this.
 clear all;

 % load the "MLtraining" matrix
 % NOTE(review): the variables stored in Examtraining.mat are not visible
 % from this script -- confirm that it actually provides MLtraining.
 load('Examtraining.mat');

 % movies.dat: replace :: with @
 tic;
 movies_dat = fopen('movies.dat');
 if movies_dat < 0
     % fopen signals failure with -1; stop here with a clear message
     % instead of letting textscan fail on an invalid file identifier.
     error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'movies.dat');
 end
 movies = textscan(movies_dat, '%d %s %s', 'delimiter', '@');
 fclose(movies_dat);  % release the handle as soon as the data is in memory
 [m_ids m_names m_genres] = movies{:};
 m_num = length(movies{1});
 disp(['Loaded movies.dat in ',num2str(toc),' sec']);
 % movies: {id name genres}

 % ratings.dat: replace :: with @ (it will take a while!)
 tic;
 ratings_dat = fopen('ratings.dat');
 if ratings_dat < 0
     error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'ratings.dat');
 end
 ratings = textscan(ratings_dat, '%d %d %d %d', 'delimiter', '@');
 fclose(ratings_dat);
 [r_userids r_movieids r_ratings r_timestamps] = ratings{:};
 r_num = length(ratings{1});
 disp(['Loaded ratings.dat in ',num2str(toc),' sec']);
 % ratings: {userid movieid rating timestamp}


 % users.dat: replace :: with @, then replace - with nothing
 tic;
 users_dat = fopen('users.dat');
 if users_dat < 0
     error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'users.dat');
 end
 users = textscan(users_dat, '%d %c %d %d %s', 'delimiter', '@');
 fclose(users_dat);
 [u_ids u_genders u_ages u_occupations u_zipcodes] = users{:};
 u_num = length(users{1});
 disp(['Loaded users.dat in ',num2str(toc),' sec']);
 % users: {id gender age occupation zipcode}

 % Kept from the original script: closes every file that is still open in
 % this MATLAB session, not only the three opened above.
 fclose('all');

 % Now begin the training process


 % Build a cell array of all genres
 % Also simultaneously generate the movies-genres matrix
 %
 % Reads:  m_genres (cell array with one '|'-separated genre string per
 %         movie) and m_num (number of movies).
 % Writes: genres            - cell row of distinct genre names, in the
 %                             order they are first encountered
 %         mat_movies_genres - m_num-by-genres_num matrix; entry (i,j) is 1
 %                             when movie i lists genres{j}, 0 otherwise
 %         genres_num        - number of distinct genres
 tic;
 genres = {};
 % One row per movie is allocated up front; a column is appended only when a
 % new genre shows up. (Starting from zeros(1,1) regrew the matrix for every
 % movie and left a spurious 1-by-1 result when there were no movies.)
 mat_movies_genres = zeros(m_num, 0);
 for movidx=1:m_num
    % Split the genres of this movie into a cell array of individual genres
    this_genres = regexp(m_genres{movidx},'[|]','split');
    this_genres_num = length(this_genres);

    for m_genre_idx=1:this_genres_num
        % Single lookup: column of this genre, or empty if not seen before
        genre_idx = find(strcmp(genres, this_genres{m_genre_idx}), 1);
        if isempty(genre_idx)
            % Genre hadn't yet been found; add it to genres
            genres = [genres this_genres(m_genre_idx)];
            genre_idx = length(genres);
        end
        mat_movies_genres(movidx,genre_idx) = 1;
    end
 end
 genres_num = length(genres);
 disp(['Generated movies-genres matrix in ',num2str(toc),' sec']);



 % Allocate the genres-[user attributes] accumulators (all zeros for now).
 % Rows are genres. Along the third dimension, slot 1 is meant to hold a
 % running sum and slot 2 the number of contributions, so that sum/num
 % yields the average we want.
 % NOTE(review): the second dimension is presumably one column per attribute
 % value (2 for gender; ages and occupations start with 1) -- confirm once
 % the fill loop is written.
 % Zipcodes are deliberately left out; they are mostly irrelevant.
 tic;
 mat_genres_occupations = zeros([genres_num, 1, 2]);
 mat_genres_ages = zeros([genres_num, 1, 2]);
 mat_genres_genders = zeros([genres_num, 2, 2]);

 % TODO: the loop

 disp(horzcat('Generated genres-userattribs matrices in ', num2str(toc), ' sec'));
	% Data loading: read the training matrix plus the movies, ratings and users
	% tables into workspace variables (one column variable per field).
	% Each .dat file is parsed with '@' as the field delimiter, so the '::'
	% separators in the files must be rewritten to '@' before running this.
	clear all;

	% load the "MLtraining" matrix
	% NOTE(review): the variables stored in Examtraining.mat are not visible
	% from this script -- confirm that it actually provides MLtraining.
	load('Examtraining.mat');

	% movies.dat: replace :: with @
	tic;
	movies_dat = fopen('movies.dat');
	if movies_dat < 0
		% fopen signals failure with -1; stop here with a clear message
		% instead of letting textscan fail on an invalid file identifier.
		error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'movies.dat');
	end
	movies = textscan(movies_dat, '%d %s %s', 'delimiter', '@');
	fclose(movies_dat);  % release the handle as soon as the data is in memory
	[m_ids m_names m_genres] = movies{:};
	m_num = length(movies{1});
	disp(['Loaded movies.dat in ',num2str(toc),' sec']);
	% movies: {id name genres}

	% ratings.dat: replace :: with @ (it will take a while!)
	tic;
	ratings_dat = fopen('ratings.dat');
	if ratings_dat < 0
		error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'ratings.dat');
	end
	ratings = textscan(ratings_dat, '%d %d %d %d', 'delimiter', '@');
	fclose(ratings_dat);
	[r_userids r_movieids r_ratings r_timestamps] = ratings{:};
	r_num = length(ratings{1});
	disp(['Loaded ratings.dat in ',num2str(toc),' sec']);
	% ratings: {userid movieid rating timestamp}


	% users.dat: replace :: with @, then replace - with nothing
	tic;
	users_dat = fopen('users.dat');
	if users_dat < 0
		error('MA493_A7:fileOpen', 'Could not open %s for reading.', 'users.dat');
	end
	users = textscan(users_dat, '%d %c %d %d %s', 'delimiter', '@');
	fclose(users_dat);
	[u_ids u_genders u_ages u_occupations u_zipcodes] = users{:};
	u_num = length(users{1});
	disp(['Loaded users.dat in ',num2str(toc),' sec']);
	% users: {id gender age occupation zipcode}

	% Kept from the original script: closes every file that is still open in
	% this MATLAB session, not only the three opened above.
	fclose('all');

	% Now begin the training process


	% Build a cell array of all genres
	% Also simultaneously generate the movies-genres matrix
	%
	% Reads:  m_genres (cell array with one '|'-separated genre string per
	%         movie) and m_num (number of movies).
	% Writes: genres            - cell row of distinct genre names, in the
	%                             order they are first encountered
	%         mat_movies_genres - m_num-by-genres_num matrix; entry (i,j) is 1
	%                             when movie i lists genres{j}, 0 otherwise
	%         genres_num        - number of distinct genres
	tic;
	genres = {};
	% One row per movie is allocated up front; a column is appended only when a
	% new genre shows up. (Starting from zeros(1,1) regrew the matrix for every
	% movie and left a spurious 1-by-1 result when there were no movies.)
	mat_movies_genres = zeros(m_num, 0);
	for movidx=1:m_num
		% Split the genres of this movie into a cell array of individual genres
		this_genres = regexp(m_genres{movidx},'[\|]','split');
		this_genres_num = length(this_genres);

		for m_genre_idx=1:this_genres_num
			% Single lookup: column of this genre, or empty if not seen before
			genre_idx = find(strcmp(genres, this_genres{m_genre_idx}), 1);
			if isempty(genre_idx)
				% Genre hadn't yet been found; add it to genres
				genres = [genres this_genres(m_genre_idx)];
				genre_idx = length(genres);
			end
			mat_movies_genres(movidx,genre_idx) = 1;
		end
	end
	genres_num = length(genres);
	disp(['Generated movies-genres matrix in ',num2str(toc),' sec']);



	% Allocate the genres-[user attributes] accumulators (all zeros for now).
	% Rows are genres. Along the third dimension, slot 1 is meant to hold a
	% running sum and slot 2 the number of contributions, so that sum/num
	% yields the average we want.
	% NOTE(review): the second dimension is presumably one column per attribute
	% value (2 for gender; ages and occupations start with 1) -- confirm once
	% the fill loop is written.
	% Zipcodes are deliberately left out; they are mostly irrelevant.
	tic;
	mat_genres_occupations = zeros([genres_num, 1, 2]);
	mat_genres_ages = zeros([genres_num, 1, 2]);
	mat_genres_genders = zeros([genres_num, 2, 2]);

	% TODO: the loop

	disp(horzcat('Generated genres-userattribs matrices in ', num2str(toc), ' sec'));