Last active
October 7, 2020 13:18
-
-
Save decabyte/5101689 to your computer and use it in GitHub Desktop.
ARFF reader/writer for MATLAB -- http://decabyte.it/projects/arff-reader-writer-for-matlab/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% ARFF_READ - Read the content of an ARFF file into a MATLAB struct array.
%
%   [DATA, relname, nomspec] = ARFF_READ(arff_file)
%       arff_file => input file (.arff / .gz extension, e.g. name.arff.gz)
%       DATA      => struct array, one element per data row and one field
%                    per @ATTRIBUTE (field name = attribute name)
%       relname   => relation name (string)
%       nomspec   => struct with one field per nominal attribute holding a
%                    cell array of the declared nominal names; an empty
%                    struct when the file declares no nominal attribute
%
%   Value conversion:
%       NUMERIC / REAL / INTEGER => double via str2double ('?' becomes NaN)
%       DATE                     => datenum, values are expected in
%                                   'yyyy-mm-dd HH:MM:SS' form, optionally
%                                   quoted; '?' or empty becomes NaN
%       STRING / nominal         => the raw text between the commas
%
%   Errors:
%       MATLAB:input  missing or empty file name
%       MATLAB:file   file cannot be opened, header cannot be parsed, or a
%                     data row holds more values than declared attributes
%
%   NOTES:
%       Rows are split on every comma, so quoted values containing commas
%       and sparse rows are not supported.
%       See ARFF_WRITE to read notes about relname and nomspec.
%       See ARFF format specification on WEKA site.
%
%   References:
%       [1]: http://www.cs.waikato.ac.nz/ml/weka/arff.html
function [data, relname, nomspec] = arff_read(arff_file)
    if nargin < 1
        error('MATLAB:input','Not enough inputs!');
    end
    if isempty(arff_file)
        error('MATLAB:input','Bad file name!');
    end

    % always defined, even when the file has no nominal attribute
    nomspec = struct();

    % check file extension and locate the plain-text file to parse
    tmp_file = '';
    [~, ~, ext] = fileparts(arff_file);
    if strcmpi(ext, '.arff')
        src_file = arff_file;
    elseif strcmpi(ext, '.gz')
        % decompress into the temporary working dir
        dec_files = gunzip(arff_file, tempdir);
        if isempty(dec_files)
            error('%s is not a valid arff_file', arff_file);
        end
        tmp_file = dec_files{1};
        src_file = tmp_file;
    else
        error('%s is not a valid arff_file', arff_file);
    end

    % read-only access is enough, so write-protected files can be read too
    fid = fopen(src_file, 'rt');
    if fid == -1
        if ~isempty(tmp_file) && exist(tmp_file, 'file')
            delete(tmp_file);
        end
        error('MATLAB:file','File not found!');
    end

    % one guard closes the handle and then removes the temporary file on
    % every exit path, including errors raised below
    cleaner = onCleanup(@() arff_read_cleanup(fid, tmp_file)); %#ok<NASGU>

    % read relname: first line starting with @RELATION
    relname = [];
    while true
        tline = fgetl(fid);
        if ~ischar(tline)
            error('MATLAB:file','ARFF file not recognized!');
        end
        % strtrim also drops the '\r' left by CRLF files
        tline = strtrim(tline);
        if length(tline) > 9 && strcmpi(tline(1:9), '@RELATION')
            relname = strtrim(tline(11:end));
            break;
        end
    end

    % read attributes until the @DATA marker (or end of file)
    fields = {};
    ftypes = [];   % 0 = numeric, 1 = nominal, 2 = string, 3 = date
    has_data = false;
    while true
        tline = fgetl(fid);
        if ~ischar(tline)
            break;
        end
        tline = strtrim(tline);

        if strcmpi(tline, '@DATA')
            has_data = true;
            break;
        end
        if ~strncmpi(tline, '@ATTRIBUTE', 10)
            % blank line, comment or unknown declaration
            continue;
        end

        % name = first token after the keyword, typedef = rest of the line
        % (a nominal set may follow the name without whitespace)
        tok = regexp(tline, '^@attribute\s+([^\s{]+)\s*(.*)$', ...
            'tokens', 'once', 'ignorecase');
        if isempty(tok) || isempty(tok{2})
            error('MATLAB:file','ARFF file not recognized!');
        end
        name = tok{1};
        typedef = strtrim(tok{2});

        if typedef(1) == '{' && typedef(end) == '}'
            % nominal-specification, assuming { x, x, x } format
            out = textscan(typedef, '%s', 'Delimiter', ' ,{}', 'MultipleDelimsAsOne', 1);
            % expand cell (avoid cell of cell)
            nomspec.(name) = out{1};
            ftype = 1;
        elseif any(strcmpi(typedef, {'NUMERIC', 'REAL', 'INTEGER'}))
            ftype = 0;
        elseif strcmpi(typedef, 'STRING')
            ftype = 2;
        elseif ~isempty(regexpi(typedef, '^date(\s|$)', 'once'))
            % the declared date-format is not interpreted (see header)
            ftype = 3;
        else
            error('MATLAB:file','ARFF file not recognized!');
        end

        fields{end+1} = name;   %#ok<AGROW>
        ftypes(end+1) = ftype;  %#ok<AGROW>
    end

    % create data struct: template with every attribute set to []
    data = struct();
    for fn = 1 : length(fields)
        data.(fields{fn}) = [];
    end
    data_tmpl = data;

    % read data rows
    if has_data
        n_fields = length(fields);
        dcnt = 0;
        while true
            tline = fgetl(fid);
            if ~ischar(tline)
                break;
            end
            tline = strtrim(tline);
            % skip blank lines and ARFF comments
            if isempty(tline) || tline(1) == '%'
                continue;
            end

            values = regexp(tline, ',', 'split');
            if length(values) > n_fields
                error('MATLAB:file', ...
                    'Data row %d has %d values but only %d attributes are declared!', ...
                    dcnt + 1, length(values), n_fields);
            end

            % init with empty struct; attributes missing at the end of a
            % short row keep the [] from the template
            dcnt = dcnt + 1;
            data(dcnt) = data_tmpl;
            for k = 1 : length(values)
                content = values{k};
                switch ftypes(k)
                    case 0
                        data(dcnt).(fields{k}) = str2double(content);
                    case 3
                        data(dcnt).(fields{k}) = arff_read_date(content);
                    otherwise
                        data(dcnt).(fields{k}) = content;
                end
            end
        end
    end
end

% ARFF_READ_DATE - Convert one ARFF date value to a datenum (NaN if missing).
function dn = arff_read_date(content)
    content = strtrim(content);
    if isempty(content) || strcmp(content, '?')
        dn = NaN;
        return;
    end
    % strip one pair of matching single or double quotes, if present
    if length(content) >= 2 && content(1) == content(end) && any(content(1) == '"''')
        content = content(2:end-1);
    end
    dn = datenum(content, 'yyyy-mm-dd HH:MM:SS');
end

% ARFF_READ_CLEANUP - Close the file, then delete the decompressed copy.
function arff_read_cleanup(fid, tmp_file)
    % close first so the file is no longer open when it is deleted
    fclose(fid);
    if ~isempty(tmp_file) && exist(tmp_file, 'file')
        delete(tmp_file);
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% ARFF_WRITE - Save a MATLAB struct array to file using the ARFF file format.
%
%   ARFF_WRITE(arff_file, DATA, relname)
%   ARFF_WRITE(arff_file, DATA, relname, nomspec)
%       arff_file => output file (.arff / .gz extension, e.g. name.arff.gz);
%                    when empty a name 'output-<n>.arff' is generated
%       DATA      => non-empty struct array representing data and
%                    attributes (n x attrs)
%       relname   => relation name (string); when empty a name
%                    'relname-<n>' is generated
%       nomspec   => struct defining nominal-specification attributes
%                    (optional, required only when DATA has "_class" fields)
%
%   NOTES:
%       Attribute name is taken from the DATA struct fieldname and the
%       attribute type is taken from the data-type of the field in DATA(1).
%
%       Put "_class" in a char fieldname to save the attribute as a
%       nominal-specification attribute and list the nominal names in
%       NOMSPEC, using the same fieldname and a cell array of names
%       (strings) as content.
%
%       Put "_date" in a numeric fieldname and use the numerical date
%       representation (datenum) to save the attribute as date type.
%       Values are written as "yyyy-mm-dd HH:MM:SS" (MATLAB datestr
%       notation); the header declares the equivalent Java
%       SimpleDateFormat pattern "yyyy-MM-dd HH:mm:ss".
%
%       Empty field values are written as '?' (missing value).
%       Numeric values are written with up to 15 significant digits.
%
%       TODO -- According to SPEC any attribute that contains a space must
%       be quoted using the single quote char; values are written as-is.
%
%   Errors:
%       MATLAB:input  not enough inputs, DATA is not a non-empty struct,
%                     a field type cannot be converted, or the nominal
%                     specification of a "_class" field is missing/empty
%       MATLAB:file   the output file cannot be opened
%
%   See ARFF format specification on WEKA site.
%
%   References:
%       [1]: http://www.cs.waikato.ac.nz/ml/weka/arff.html
function [] = arff_write(arff_file, data, relname, nomspec)
    if nargin < 3
        error('MATLAB:input','Not enough inputs!');
    end
    if nargin < 4
        % nomspec is optional; only "_class" fields need it
        nomspec = [];
    end
    if isempty(data) || ~isstruct(data)
        error('MATLAB:input','Please use struct data input!');
    end
    if isempty(arff_file)
        arff_file = sprintf('output-%d.arff', randi(1000,1));
    end
    if isempty(relname)
        relname = sprintf('relname-%d', randi(1000,1));
    end

    % check file extension
    [arff_path, arff_name, ext] = fileparts(arff_file);
    outfile = '';   % temporary plain file, used only for compressed output
    if strcmpi(ext, '.arff')
        fid = fopen(arff_file, 'wt');
    elseif strcmpi(ext, '.gz')
        outfile = fullfile(tempdir, arff_name);
        fid = fopen(outfile, 'wt');
    else
        error('%s is not a valid arff_file', arff_file);
    end
    if fid == -1
        error('MATLAB:file', 'Cannot open %s for writing!', arff_file);
    end

    try
        % write relname
        fprintf(fid, '@RELATION %s\n\n', relname);

        % write attributes
        fields = fieldnames(data);
        ftypes = zeros(size(fields));   % 0 numeric, 1 nominal, 2 string, 3 date
        for fn = 1 : length(fields)
            name = fields{fn};
            sample = data(1).(name);

            if isnumeric(sample)
                if isempty(strfind(name, '_date'))
                    type = 'NUMERIC';
                    ftypes(fn) = 0;
                else
                    % Java SimpleDateFormat pattern matching the datestr
                    % format used for the values below
                    type = 'DATE "yyyy-MM-dd HH:mm:ss"';
                    ftypes(fn) = 3;
                end
            elseif ischar(sample)
                if isempty(strfind(name, '_class'))
                    type = 'STRING';
                    ftypes(fn) = 2;
                else
                    if ~(isstruct(nomspec) && isfield(nomspec, name) && ...
                            iscell(nomspec.(name)))
                        % TODO inference
                        error('MATLAB:input','Inferring class specification from data!');
                    end
                    names = nomspec.(name);
                    if isempty(names)
                        error('MATLAB:input', ...
                            'Nominal specification for %s is empty!', name);
                    end
                    % "{ a, b, c }" -- works for any number of names >= 1
                    joined = sprintf('%s, ', names{:});
                    type = ['{ ' joined(1:end-2) ' }'];
                    ftypes(fn) = 1;
                end
            else
                error('MATLAB:input','Cannot convert %s field to ARFF format!', name);
            end

            fprintf(fid, '@ATTRIBUTE %s %s\n', name, type);
        end

        % write data, one comma-separated line per struct element
        fprintf(fid, '\n@DATA\n');
        for n = 1 : length(data)
            for fn = 1 : length(fields)
                value = data(n).(fields{fn});
                if isempty(value)
                    content = '?';
                else
                    switch ftypes(fn)
                        case 0
                            % explicit precision: the num2str default keeps
                            % only about four decimals
                            content = num2str(value, 15);
                        case 3
                            content = ['"' datestr(value, 'yyyy-mm-dd HH:MM:SS') '"'];
                        otherwise
                            content = value;
                    end
                end

                if fn < length(fields)
                    fprintf(fid, '%s,', content);
                else
                    fprintf(fid, '%s', content);
                end
            end
            fprintf(fid, '\n');
        end
    catch err
        % do not leak the handle or the temporary file on failure
        fclose(fid);
        if ~isempty(outfile) && exist(outfile, 'file')
            delete(outfile);
        end
        rethrow(err);
    end

    % close file
    fclose(fid);

    % compress the temporary .arff next to the requested name and remove it
    if ~isempty(outfile)
        if isempty(arff_path)
            % bare file name: compress into the current folder
            arff_path = pwd;
        end
        gzip(outfile, arff_path);
        delete(outfile);
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% example_read.m
%
% Loads 'example_dataset.arff' with ARFF_READ, plots the idx/high/med/low
% attributes with their mean values and draws a histogram of the
% 'type_class' nominal attribute.
% NOTE(review): the dataset is presumably one produced by example_write.m
% and renamed -- confirm.
clear all; close all; clc;
path(path, '..');

%% import dataset
infile = 'example_dataset.arff';

% load arff
[data, relname, nomspec] = arff_read(infile);

% extract nominal specification attribute (declared class names)
type_class = nomspec.type_class;

%% plot dataset
plot([data.idx], [data.high], 'r.-'); grid on; hold on;
plot([data.idx], [data.med],  'g.-');
plot([data.idx], [data.low],  'b.-');

% one dashed reference line per series, drawn at the series mean
% ([data.high; data.med; data.low] is 3 x n, so the transpose is averaged
% column-wise)
m_values = mean([ data.high; data.med; data.low ]');
for k = 1 : length(m_values)
    hr = refline(0, m_values(k));
    set(hr, 'Color', 'k', 'LineStyle', '--');
end

legend('high', 'med', 'low');
tl = title(relname);
xl = xlabel('idx');
yl = ylabel('value');
set(tl, 'Interpreter', 'none');
set(tl, 'FontSize', 14);
set(xl, 'FontSize', 12);
set(yl, 'FontSize', 12);

%% type histogram
T = {data.type_class};

% B = sorted class names found in the data, J = class index of every row
[B, ~, J] = unique(T);
centers = 1 : numel(B);
counts = accumarray(J(:), 1, [numel(B) 1])';

f = figure();
% bars are styled directly and centred on the integer class indices
bar(centers, counts, 1, 'FaceColor', 'r', 'EdgeColor', 'w'); grid on;
xlim([0.5 numel(B) + 0.5]);

% labels: use B (same order as J) so every bar gets its own class name,
% whatever the declaration order or the number of classes in the data
text(centers, counts, B, 'horizontalalignment', ...
    'center', 'verticalalignment', 'bottom');

tl = title(relname);
xl = xlabel('class');
yl = ylabel('count');
set(tl, 'Interpreter', 'none');
set(gca, 'XTick', []);
set(tl, 'FontSize', 14);
set(xl, 'FontSize', 12);
set(yl, 'FontSize', 12);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% example_write.m
%
% Builds a random 100-row dataset, saves it with ARFF_WRITE as
% 'dataset_<yyyymmdd>.arff' in the current folder, then plots the series
% with their mean values and a histogram of the 'type_class' attribute.
clear all; close all; clc;
path(path, '..');

%% create data structure
data = struct();
relname = sprintf('dataset_%s', datestr(now, 'yyyymmdd'));
outfile = sprintf('%s.arff', relname);

% nominal classes
type_class = { 'front', 'middle', 'rear' };

%% populate dataset
for i = 1 : 100
    data(i).idx = i;
    data(i).low = randi([0 33], 1);
    data(i).med = randi([34 66], 1);
    data(i).high = randi([67 100], 1);
    % the "_class" field name marks the attribute as nominal for arff_write
    data(i).type_class = type_class{ randi([1 3]) };
end

%% declare nominal specification attributes
nomspec.type_class = type_class;

% save arff
arff_write(outfile, data, relname, nomspec);

%% plot dataset
plot([data.idx], [data.high], 'r.-'); grid on; hold on;
plot([data.idx], [data.med],  'g.-');
plot([data.idx], [data.low],  'b.-');

% one dashed reference line per series, drawn at the series mean
% ([data.high; data.med; data.low] is 3 x n, so the transpose is averaged
% column-wise)
m_values = mean([ data.high; data.med; data.low ]');
for k = 1 : length(m_values)
    hr = refline(0, m_values(k));
    set(hr, 'Color', 'k', 'LineStyle', '--');
end

legend('high', 'med', 'low');
tl = title(relname);
xl = xlabel('idx');
yl = ylabel('value');
set(tl, 'Interpreter', 'none');
set(tl, 'FontSize', 14);
set(xl, 'FontSize', 12);
set(yl, 'FontSize', 12);

%% type histogram
T = {data.type_class};

% B = sorted class names found in the data, J = class index of every row
[B, ~, J] = unique(T);
centers = 1 : numel(B);
counts = accumarray(J(:), 1, [numel(B) 1])';

f = figure();
% bars are styled directly and centred on the integer class indices
bar(centers, counts, 1, 'FaceColor', 'r', 'EdgeColor', 'w'); grid on;
xlim([0.5 numel(B) + 0.5]);

% labels: use B (same order as J) so every bar gets its own class name,
% even if the random draw left one class without samples
text(centers, counts, B, 'horizontalalignment', ...
    'center', 'verticalalignment', 'bottom');

tl = title(relname);
xl = xlabel('class');
yl = ylabel('count');
set(tl, 'Interpreter', 'none');
set(gca, 'XTick', []);
set(tl, 'FontSize', 14);
set(xl, 'FontSize', 12);
set(yl, 'FontSize', 12);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment