jameslyons · October 23, 2015 05:00
diff --git a/create_currennt_HDF5.m b/create_currennt_HDF5.m
 % Build a netCDF training file (name suggests it is meant for CURRENNT)
 % from the protein dataset.
 %
 % Reads the struct array 'combined' from protein_dataset.mat, concatenates
 % the features of all proteins along the time axis, converts the label
 % characters to integer class ids, and writes everything to fname.
 close all;
 clear all;

 % load the protein dataset, this is an array of struct that has fields .pssm and .ss
 % which are our features and labels respectively. pssm is a L by 20 array of features.
 % ss is a L by 1 array of characters C, E, H or X which are our 4 classes.
 % the actual variable that appears is called 'combined'
 load protein_dataset;
 % this is the name of the netCDF file we will be writing to
 fname = 'proteins_train_currennt.nc';

 % class id k-1 is written for the label character classLabels(k)
 classLabels = 'CEHX';
 D = 20; % the number of feature dimensions

 numSeqs = numel(combined);
 if numSeqs == 0
     error('create_currennt_HDF5:emptyDataset', ...
         'protein_dataset contains no sequences in ''combined''.');
 end

 % sequence lengths, and the total length of all the proteins combined
 seqlens = zeros(1,numSeqs);
 for i = 1:numSeqs
     seqlens(i) = size(combined(i).pssm,1);
 end
 N = sum(seqlens);

 input = zeros(D,N);  % our array of features (all proteins combined)
 target = zeros(N,1); % our array of labels
 upto = 1;            % this keeps track of where we are in the input array
 for i = 1:numSeqs
     L = seqlens(i);
     input(:,upto:upto+L-1) = combined(i).pssm';
     % our labels are characters, they have to be converted to integers 0-3.
     % ismember gives the position of each character in classLabels (1-4),
     % or 0 for a character that is not one of the known classes.
     [known, classIdx] = ismember(combined(i).ss(:), classLabels);
     if ~all(known)
         error('create_currennt_HDF5:badLabel', ...
             'Protein %d has a label outside the set %s.', i, classLabels);
     end
     target(upto:upto+L-1) = classIdx - 1;
     upto = upto + L;
 end

 % now we actually start writing the netCDF file using MATLAB's built in functionality
 ncid = netcdf.create(fname,'64BIT_OFFSET');
 try
     dimid1 = netcdf.defDim(ncid,'numTimesteps',N);
     dimid2 = netcdf.defDim(ncid,'inputPattSize',size(input,1));
     dimid3 = netcdf.defDim(ncid,'numSeqs',numSeqs);
     dimid4 = netcdf.defDim(ncid,'numLabels',numel(classLabels));
     dimid5 = netcdf.defDim(ncid,'maxLabelLength',1);
     dimid6 = netcdf.defDim(ncid,'maxTargStringLength',5000); % defined but not used by any variable
     dimid7 = netcdf.defDim(ncid,'maxSeqTagLength',800);
     dimid8 = netcdf.defDim(ncid,'one',1);

     % seqTags is only declared; no tag data is written to it below
     varidTags = netcdf.defVar(ncid,'seqTags','char',[dimid7 dimid3]);
     varid  = netcdf.defVar(ncid,'numTargetClasses','int',dimid8);
     varid1 = netcdf.defVar(ncid,'inputs','double',[dimid2 dimid1]);
     varid2 = netcdf.defVar(ncid,'seqLengths','int',dimid3);
     % stored as int to match the int32 class ids written below
     varid3 = netcdf.defVar(ncid,'targetClasses','int',dimid1);
     varid4 = netcdf.defVar(ncid,'labels','char',[dimid5 dimid4]);

     netcdf.endDef(ncid);
     netcdf.putVar(ncid,varid1,double(input));
     netcdf.putVar(ncid,varid2,int32(seqlens));
     netcdf.putVar(ncid,varid,int32(numel(classLabels)));
     netcdf.putVar(ncid,varid3,int32(target));
     netcdf.putVar(ncid,varid4,classLabels(:));
 catch err
     % do not leave the file handle open when defining or writing fails
     netcdf.close(ncid);
     rethrow(err);
 end
 netcdf.close(ncid)
	% Build a netCDF training file (name suggests it is meant for CURRENNT)
	% from the protein dataset.
	%
	% Reads the struct array 'combined' from protein_dataset.mat, concatenates
	% the features of all proteins along the time axis, converts the label
	% characters to integer class ids, and writes everything to fname.
	close all;
	clear all;

	% load the protein dataset, this is an array of struct that has fields .pssm and .ss
	% which are our features and labels respectively. pssm is a L by 20 array of features.
	% ss is a L by 1 array of characters C, E, H or X which are our 4 classes.
	% the actual variable that appears is called 'combined'
	load protein_dataset;
	% this is the name of the netCDF file we will be writing to
	fname = 'proteins_train_currennt.nc';

	% class id k-1 is written for the label character classLabels(k)
	classLabels = 'CEHX';
	D = 20; % the number of feature dimensions

	numSeqs = numel(combined);
	if numSeqs == 0
	    error('create_currennt_HDF5:emptyDataset', ...
	        'protein_dataset contains no sequences in ''combined''.');
	end

	% sequence lengths, and the total length of all the proteins combined
	seqlens = zeros(1,numSeqs);
	for i = 1:numSeqs
	    seqlens(i) = size(combined(i).pssm,1);
	end
	N = sum(seqlens);

	input = zeros(D,N);  % our array of features (all proteins combined)
	target = zeros(N,1); % our array of labels
	upto = 1;            % this keeps track of where we are in the input array
	for i = 1:numSeqs
	    L = seqlens(i);
	    input(:,upto:upto+L-1) = combined(i).pssm';
	    % our labels are characters, they have to be converted to integers 0-3.
	    % ismember gives the position of each character in classLabels (1-4),
	    % or 0 for a character that is not one of the known classes.
	    [known, classIdx] = ismember(combined(i).ss(:), classLabels);
	    if ~all(known)
	        error('create_currennt_HDF5:badLabel', ...
	            'Protein %d has a label outside the set %s.', i, classLabels);
	    end
	    target(upto:upto+L-1) = classIdx - 1;
	    upto = upto + L;
	end

	% now we actually start writing the netCDF file using MATLAB's built in functionality
	ncid = netcdf.create(fname,'64BIT_OFFSET');
	try
	    dimid1 = netcdf.defDim(ncid,'numTimesteps',N);
	    dimid2 = netcdf.defDim(ncid,'inputPattSize',size(input,1));
	    dimid3 = netcdf.defDim(ncid,'numSeqs',numSeqs);
	    dimid4 = netcdf.defDim(ncid,'numLabels',numel(classLabels));
	    dimid5 = netcdf.defDim(ncid,'maxLabelLength',1);
	    dimid6 = netcdf.defDim(ncid,'maxTargStringLength',5000); % defined but not used by any variable
	    dimid7 = netcdf.defDim(ncid,'maxSeqTagLength',800);
	    dimid8 = netcdf.defDim(ncid,'one',1);

	    % seqTags is only declared; no tag data is written to it below
	    varidTags = netcdf.defVar(ncid,'seqTags','char',[dimid7 dimid3]);
	    varid  = netcdf.defVar(ncid,'numTargetClasses','int',dimid8);
	    varid1 = netcdf.defVar(ncid,'inputs','double',[dimid2 dimid1]);
	    varid2 = netcdf.defVar(ncid,'seqLengths','int',dimid3);
	    % stored as int to match the int32 class ids written below
	    varid3 = netcdf.defVar(ncid,'targetClasses','int',dimid1);
	    varid4 = netcdf.defVar(ncid,'labels','char',[dimid5 dimid4]);

	    netcdf.endDef(ncid);
	    netcdf.putVar(ncid,varid1,double(input));
	    netcdf.putVar(ncid,varid2,int32(seqlens));
	    netcdf.putVar(ncid,varid,int32(numel(classLabels)));
	    netcdf.putVar(ncid,varid3,int32(target));
	    netcdf.putVar(ncid,varid4,classLabels(:));
	catch err
	    % do not leave the file handle open when defining or writing fails
	    netcdf.close(ncid);
	    rethrow(err);
	end
	netcdf.close(ncid)