Created
October 23, 2015 05:00
-
-
Save jameslyons/c203e0eba13c1ef7062c to your computer and use it in GitHub Desktop.
Create a NetCDF training file for CURRENNT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
close all;
clear all;

% Load the protein dataset. The .mat file provides a struct array called
% 'combined' with two fields per protein:
%   .pssm - L-by-20 array of features (L = protein length)
%   .ss   - L-by-1 array of characters C, E, H or X (our 4 classes)
load protein_dataset;

% Name of the NetCDF file we will be writing to.
fname = 'proteins_train_currennt.nc';

D = 20;               % the number of feature dimensions
classChars = 'CEHX';  % label alphabet; (position - 1) is the integer class 0-3

% Length of every protein (preallocated), and the total number of timesteps.
seqlens = zeros(1, length(combined));
for i = 1:length(combined)
    seqlens(i) = size(combined(i).pssm, 1);
end
N = sum(seqlens);

input  = zeros(D, N);  % features of all proteins, one column per residue
target = zeros(N, 1);  % integer class label of every residue
upto   = 1;            % next free column of 'input' / row of 'target'
for i = 1:length(combined)
    L = seqlens(i);

    % Fail early with a clear message instead of a generic size mismatch.
    if size(combined(i).pssm, 2) ~= D
        error('currennt:badFeatureSize', ...
              'Protein %d: pssm has %d columns, expected %d.', ...
              i, size(combined(i).pssm, 2), D);
    end
    ssChars = combined(i).ss(:);
    if numel(ssChars) ~= L
        error('currennt:lengthMismatch', ...
              'Protein %d: %d labels for %d feature rows.', ...
              i, numel(ssChars), L);
    end

    % Labels are characters; convert them to integers 0-3. Characters that
    % are not in the alphabet are rejected rather than silently stored as
    % their character codes.
    [known, classIdx] = ismember(ssChars, classChars);
    if ~all(known)
        error('currennt:badLabel', ...
              'Protein %d: label(s) outside ''%s'' found.', i, classChars);
    end

    input(:, upto:upto+L-1) = combined(i).pssm';
    target(upto:upto+L-1)   = classIdx - 1;
    upto = upto + L;
end
% Write the NetCDF file (64-bit offset format) using MATLAB's built-in
% netcdf package. Uses fname, N, input, target and seqlens computed above.
numSeqs      = numel(seqlens);
labelChars   = 'CEHX';   % class 0..3, in this order
maxSeqTagLen = 800;

ncid = netcdf.create(fname,'64BIT_OFFSET');
try
    % --- dimensions ---
    dimTimesteps  = netcdf.defDim(ncid,'numTimesteps',N);
    dimInputSize  = netcdf.defDim(ncid,'inputPattSize',size(input,1));
    dimSeqs       = netcdf.defDim(ncid,'numSeqs',numSeqs);
    dimLabels     = netcdf.defDim(ncid,'numLabels',numel(labelChars));
    dimLabelLen   = netcdf.defDim(ncid,'maxLabelLength',1);
    netcdf.defDim(ncid,'maxTargStringLength',5000);
    dimSeqTagLen  = netcdf.defDim(ncid,'maxSeqTagLength',maxSeqTagLen);
    dimOne        = netcdf.defDim(ncid,'one',1);

    % --- variables (MATLAB lists the fastest-varying dimension first) ---
    varSeqTags    = netcdf.defVar(ncid,'seqTags','char',[dimSeqTagLen dimSeqs]);
    varNumClasses = netcdf.defVar(ncid,'numTargetClasses','int',dimOne);
    varInputs     = netcdf.defVar(ncid,'inputs','double',[dimInputSize dimTimesteps]);
    varSeqLengths = netcdf.defVar(ncid,'seqLengths','int',dimSeqs);
    % Class indices are integers, so the variable is declared 'int' to
    % match the int32 data written below.
    varTargets    = netcdf.defVar(ncid,'targetClasses','int',dimTimesteps);
    varLabels     = netcdf.defVar(ncid,'labels','char',[dimLabelLen dimLabels]);
    netcdf.endDef(ncid);

    % --- data ---
    % One NUL-padded tag per sequence, stored column-wise. Tags are
    % generated from the sequence index.
    seqTags = char(zeros(maxSeqTagLen, numSeqs));
    for s = 1:numSeqs
        tag = sprintf('seq_%d', s);
        seqTags(1:numel(tag), s) = tag;
    end
    netcdf.putVar(ncid,varSeqTags,seqTags);
    netcdf.putVar(ncid,varNumClasses,int32(numel(labelChars)));
    netcdf.putVar(ncid,varInputs,double(input));
    netcdf.putVar(ncid,varSeqLengths,int32(seqlens));
    netcdf.putVar(ncid,varTargets,int32(target));
    netcdf.putVar(ncid,varLabels,labelChars);
catch err
    % Do not leak the open file handle if any define/put call fails.
    netcdf.close(ncid);
    rethrow(err);
end
netcdf.close(ncid)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment