Skip to content

Instantly share code, notes, and snippets.

@treydock
Created April 1, 2021 15:56
Show Gist options
  • Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.
Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.
var app_ident = require('../app_ident.js');
var map_helpers = require('../map_helpers.js');
module.exports = function(config) {
var appident = app_ident(config.applicationDefn);
/**
 * Look up a dot-separated path (e.g. 'mounts.projects') under config.hardware.
 *
 * @param {string} setting - Dot-separated list of property names to descend through.
 * @param {*} default_val - Value returned as soon as any segment of the path is missing.
 * @returns {*} The value stored at the path, or default_val when the path cannot be followed.
 */
var getHardwareConfig = function (setting, default_val) {
    var val = config.hardware;
    var props = setting.split('.');
    for (let i = 0; i < props.length; i++) {
        // null and undefined have no properties; checking them first avoids a
        // TypeError when config.hardware (or an intermediate value) is null.
        // hasOwnProperty is borrowed from Object.prototype so the check does
        // not depend on what the configuration object itself defines.
        if (val !== undefined && val !== null && Object.prototype.hasOwnProperty.call(val, props[i])) {
            val = val[props[i]];
        } else {
            return default_val;
        }
    }
    return val;
};
/**
 * Work out which application a job ran from its process dump.
 *
 * appident() is tried on the constrained process list first and then on the
 * unconstrained list. When neither matches, the first listed process is
 * reported with the name 'uncategorized'.
 *
 * @param {Object} job - Job summary document; only job.procDump is read.
 * @returns {?Object} The appident() match, an {executable, name} fallback, or
 *     a falsy value when nothing could be determined. Callers in this file
 *     read the `executable` and `name` properties of a truthy result.
 */
var getProcInfo = function (job) {
    var procDump = job.procDump;
    if (!procDump || !procDump.constrained) {
        return null;
    }

    var app = appident(procDump.constrained);
    // The unconstrained list may be absent; only consult it when it exists so
    // neither appident() nor the length check below sees undefined.
    if (!app && procDump.unconstrained) {
        app = appident(procDump.unconstrained);
    }
    if (app) {
        return app;
    }

    if (procDump.constrained.length > 0) {
        return {
            executable: procDump.constrained[0],
            name: 'uncategorized'
        };
    }
    if (procDump.unconstrained && procDump.unconstrained.length > 0) {
        return {
            executable: procDump.unconstrained[0],
            name: 'uncategorized'
        };
    }
    return app;
};
return {
id: config.resource_id,
name: config.resource,
long_name: config.resource,
gpfs: config.hardware.gpfs,
nodes: 0,
ppn: 0,
start_date: "1970-01-01",
// The summary documents use compression where the various statistics
// for a metric are omitted if they have default values. The avg
// metric is always provided so that it is possible to determine
// whether the statistic is missing due to it being a default value.
"getcov": function(job, metricname) {
if (Array.isArray(metricname)) {
for (var i = 0; i < metricname.length; i++) {
var res = this.getcov.call(this, job, metricname[i]);
if (res.error === 0) {
return res;
}
}
return {
value: null,
error: 2
};
}
var cov = this.ref(job, metricname + ".cov");
if (cov.error === 0) {
return cov;
}
var avg = this.ref(job, metricname + ".avg");
if (avg.error === 0) {
// Avg is present but cov absent, therefore cov is default value of 0.0
return {
value: 0.0,
error: 0
};
}
return {
value: null,
error: cov.error
};
},
"getmax": function(job, metricname) {
if (Array.isArray(metricname)) {
for (var i = 0; i < metricname.length; i++) {
var res = this.getmax(job, metricname[i]);
if (res.error === 0) {
return res;
}
}
return {
value: null,
error: 2
};
}
var maxval = this.ref(job, metricname + ".max");
if (maxval.error === 0) {
return maxval;
}
var avg = this.ref(job, metricname + ".avg");
if (avg.error === 0) {
// Avg is present but max absent, therefore max is same as avg
return avg;
}
return {
value: null,
error: maxval.error
};
},
// Static table keyed by logical device id. Every entry carries a `name`
// string; block_sda additionally records `bytes_per_sector`.
// NOTE(review): nothing else in this module reads `devices` - presumably it
// is consumed by whatever loads this mapping; confirm before changing keys.
"devices": {
"block_sda": {
"name": "/sda",
"bytes_per_sector": 512
},
"netdrv_isilon": {
"name": "ifs"
},
"netdrv_panasas": {
"name": "panfs"
},
"net_eth0": {
"name": "em1"
},
"net_ib0": {
"name": "ib0"
}
},
"attributes": {
// Job identity attributes. Entries take one of four shapes in this map:
//   ref     - a dotted path string (or list of paths) into the job document
//   formula - a function producing a {value, error} pair
//   value   - a constant
//   error   - a fixed error code with no value
// NOTE(review): how the framework interprets each shape is not visible in
// this file; confirm against the code that consumes the mapping.
"local_job_id": {
ref: "acct.id"
},
"name": {
ref: "acct.jobname"
},
// Reads `name` from `this`; presumably `this` is the mapping object, whose
// `name` is set from config.resource - confirm how formulas are invoked.
"resource_name": {
formula: function() {
return {value: this.name, error: 0};
}
},
// Reads `id` from `this`; presumably the mapping object's `id`
// (config.resource_id) - confirm how formulas are invoked.
"resource_id": {
formula: function() {
return {value: this.id, error: 0};
}
},
"organization_id": {
value: 1
},
"account": {
ref: "acct.account"
},
"username": {
ref: "acct.user"
},
// No source for the working directory: always reports error code 2.
"cwd": {
error: 2
},
executable: {
formula: function (job) {
var app = getProcInfo(job);
if (app) {
return {
value: app.executable,
error: 0
};
}
return {
value: null,
error: this.metricErrors.codes.metricMissingUnknownReason.value
};
}
},
application: {
formula: function (job) {
var app = getProcInfo(job);
if (app) {
return {
value: app.name,
error: 0
};
}
return {
value: null,
error: this.metricErrors.codes.metricMissingUnknownReason.value
};
}
},
"exit_status": {
formula: function(job) {
var exit = this.ref(job, "acct.exit_status");
if (exit.error === 0 && exit.value) {
exit.value = exit.value.split(" ")[0];
}
return exit;
}
},
// Scheduler/accounting attributes taken directly from the job's `acct`
// section, plus the constant datasource label.
"datasource": {
value: "prometheus"
},
"granted_pe": {
ref: "acct.ncpus"
},
"queue": {
ref: "acct.partition"
},
// Reads the same path as "nodes" below (acct.nodes).
"requested_nodes": {
ref: "acct.nodes"
},
// `required: true` entries - presumably the framework rejects jobs missing
// these; confirm against the consumer of this mapping.
"hosts": {
ref: "acct.host_list",
required: true
},
"nodes": {
ref: "acct.nodes",
required: true
},
"shared": {
formula: function(job) {
if (job.hasOwnProperty("shared")) {
return {
value: job.shared ? 1 : 0,
error: 0
};
} else {
return {
value: 0,
error: 0
};
}
}
},
"cores": {
ref: "acct.ncpus",
required: true
},
"cores_avail": {
formula: function(job) {
if (job.summarization.complete && job.hasOwnProperty("cpu") && job.cpu.hasOwnProperty("nodecpus") && ! job.cpu.nodecpus.hasOwnProperty("error")) {
return this.ref(job, "cpu.nodecpus.all.cnt");
} else {
return {
value: 0,
error: this.metricErrors.codes.missingCollectionFailed.value
};
}
}
},
// Job lifecycle timestamps from the accounting record. The `ref` strings of
// submit_time_ts, start_time_ts and end_time_ts are also read by the
// wall_time and wait_time formulas in this map, so renaming a path here
// changes those derived values too.
"submit_time_ts": {
ref: "acct.submit",
required: true
},
"eligible_time_ts": {
ref: "acct.eligible"
},
"start_time_ts": {
ref: "acct.start_time",
required: true
},
"end_time_ts": {
ref: "acct.end_time",
required: true
},
"wall_time": {
formula: function(job) {
var end_time = this.ref(job, this.attributes.end_time_ts.ref);
var start_time = this.ref(job, this.attributes.start_time_ts.ref);
var combined_error = end_time.error | start_time.error;
if (end_time.value === undefined || start_time.value === undefined) {
return {
value: null,
error: combined_error
};
}
return {
value: end_time.value - start_time.value,
error: combined_error
};
},
required: true
},
"requested_wall_time": {
formula: function(job) {
var timelimit = this.ref(job, "acct.timelimit");
if (timelimit.error !== 0 || timelimit.value === null) {
return {
value: null,
error: 2
};
}
if (typeof timelimit.value === "number") {
return {
value: timelimit.value,
error: 0
};
}
var result = timelimit.value.match(/^(?:([0-9]+)-)?([0-9]{2}):([0-9]{2}):([0-9]{2})$/);
if (result) {
if (result[1]) {
return {
value: (24 * 3600 * result[1]) + (3600 * result[2]) + (60 * result[3]) + (1 * result[4]),
error: 0
};
} else {
return {
value: (3600 * result[2]) + (60 * result[3]) + (1 * result[4]),
error: 0
};
}
} else {
return {
value: null,
error: 2
};
}
}
},
"wait_time": {
formula: function(job) {
var start_time = this.ref(job, this.attributes.start_time_ts.ref);
var submit_time = this.ref(job, this.attributes.submit_time_ts.ref);
var combined_error = start_time.error | submit_time.error;
if (start_time.value === undefined || submit_time.value === undefined) {
return {
value: null,
error: combined_error
};
}
return {
value: start_time.value - submit_time.value,
error: combined_error
};
},
required: true
},
"cpu_time": {
formula: function(job) {
var wall_time = this.attributes.wall_time.formula.call(this, job);
var num_cores = this.ref(job, this.attributes.cores.ref);
var combined_error = wall_time.error | num_cores.error;
if (wall_time.value === undefined || num_cores.value === undefined) {
return {
value: null,
error: combined_error
};
}
return {
value: wall_time.value * num_cores.value,
error: combined_error
};
},
required: true
},
"node_time": {
formula: function(job) {
var wall_time = this.attributes.wall_time.formula.call(this, job);
var num_nodes = this.ref(job, this.attributes.nodes.ref);
var combined_error = wall_time.error | num_nodes.error;
if (wall_time.value === undefined || num_nodes.value === undefined) {
return {
value: null,
error: combined_error
};
}
return {
value: wall_time.value * num_nodes.value,
error: combined_error
};
},
required: true
},
// CPU usage averages. Each `ref` lists three candidate paths (cgroup,
// jobcpus, nodecpus) - presumably the first one that resolves is used;
// confirm against the implementation of `ref`.
"cpu_idle": {
ref: ["cpu.cgroup.idle.avg", "cpu.jobcpus.idle.avg", "cpu.nodecpus.idle.avg"]
},
"cpu_system": {
ref: ["cpu.cgroup.system.avg", "cpu.jobcpus.system.avg", "cpu.nodecpus.system.avg"]
},
"cpu_user": {
ref: ["cpu.cgroup.user.avg", "cpu.jobcpus.user.avg", "cpu.nodecpus.user.avg"]
},
// Always reports error code 2 with no value.
"error": {
error: 2
},
// Hardware-counter metrics: the average comes straight from the document,
// the matching *_cov entry goes through getcov() so an omitted cov statistic
// falls back to 0.0 when the avg is present.
"flops": {
ref: "cpuperf.flops.avg"
},
"flops_cov": {
formula: function(job) {
return this.getcov.call(this, job, "cpuperf.flops");
}
},
"cpiref": {
ref: "cpuperf.cpiref.avg"
},
"cpiref_cov": {
formula: function(job) {
return this.getcov.call(this, job, "cpuperf.cpiref");
}
},
catastrophe: {
formula: function (job) {
var result = {
value: null,
error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value
};
if (job.catastrophe) {
if (job.catastrophe.error) {
switch (job.catastrophe.error) {
case 1:
result.error = this.metricErrors.codes.metricDisabledByUser.value;
break;
case 2:
result.error = this.metricErrors.codes.metricInsufficientData.value;
break;
case 6:
result.error = this.metricErrors.codes.metricCounterRollover.value;
break;
default:
result.error = this.metricErrors.codes.metricMissingUnknownReason.value;
break;
}
} else if (Number.isNaN(job.catastrophe.value)) {
result.error = this.metricErrors.codes.metricSummarizationError.value;
} else {
result.value = job.catastrophe.value;
result.error = 0;
}
}
return result;
}
},
// Averages read directly from the document; each *_cov companion goes through
// getcov() so an omitted cov statistic falls back to 0.0 when avg is present.
"cpldref": {
ref: "cpuperf.cpldref.avg"
},
"cpldref_cov": {
formula: function(job) {
return this.getcov.call(this, job, "cpuperf.cpldref");
}
},
"mem_transferred": {
ref: "uncperf.membw.avg"
},
"mem_transferred_cov": {
formula: function(job) {
return this.getcov.call(this, job, "uncperf.membw");
}
},
"cpu_user_imbalance": {
formula: function(job) {
var cpu_count = this.ref(job, ["cpu.jobcpus.user.cnt", "cpu.nodecpus.user.cnt"]);
var cpu_user_min = this.ref(job, ["cpu.cgroup.user.min", "cpu.jobcpus.user.min", "cpu.nodecpus.user.min"]);
var cpu_user_max = this.ref(job, ["cpu.cgroup.user.max", "cpu.jobcpus.user.max", "cpu.nodecpus.user.max"]);
var error = cpu_user_min.error | cpu_user_max.error | cpu_count.error;
if (error === 0) {
if (cpu_count.value <= 1) {
return {
value: 0.0,
error: error
};
} else {
return {
value: 100.0 * (cpu_user_max.value - cpu_user_min.value) / cpu_user_max.value,
error: error
};
}
} else {
return {
value: null,
error: error
};
}
}
},
// cov of user-mode CPU usage; getcov() tries the three candidate metrics in
// order and returns the first error-free result.
"cpu_user_cv": {
formula: function(job) {
return this.getcov.call(this, job, ["cpu.cgroup.user", "cpu.jobcpus.user", "cpu.nodecpus.user"]);
}
},
node_cpu_idle: {
ref: 'cpu.nodecpus.idle.avg'
},
energy: {
ref: 'ipmi.energy.avg'
},
// getmax() appends the statistic name itself, so this looks up
// 'ipmi.power.max.max' and falls back to 'ipmi.power.max.avg'.
max_power: {
formula: function (job) {
return this.getmax(job, 'ipmi.power.max');
}
},
"memory_used": {
formula: function(job) {
var mem = this.ref(job, "memory.used_minus_cache.avg");
if (mem.error === 0) {
return {
value: mem.value * 1024.0,
error: 0
};
}
return {
value: null,
error: mem.error
};
}
},
// cov companion of memory_used, resolved through getcov().
"memory_used_cov": {
formula: function(job) {
return this.getcov.call(this, job, "memory.used_minus_cache");
}
},
// getmax() appends the statistic name itself, so this looks up
// 'process_memory.usageratio.max.max' and falls back to
// 'process_memory.usageratio.max.avg'.
"max_memory": {
formula: function(job) {
return this.getmax(job, 'process_memory.usageratio.max');
}
},
"mem_used_including_os_caches": {
formula: function(job) {
var mem = this.ref(job, "memory.used.avg");
if (mem.error === 0) {
return {
value: mem.value * 1024.0,
error: 0
};
}
return {
value: null,
error: mem.error
};
}
},
// cov companion of mem_used_including_os_caches, resolved through getcov().
"mem_used_including_os_caches_cov": {
formula: function(job) {
return this.getcov.call(this, job, "memory.used");
}
},
// Entries below are built by map_helpers.device()/device_cov() when this
// object literal is constructed, so getHardwareConfig() is evaluated once per
// entry at that point rather than per job.
// NOTE(review): the helpers live in ../map_helpers.js; the meaning of their
// arguments is not visible here - presumably (section, device, statistic).
"ib_rx_bytes": map_helpers.device('infiniband', 'all', 'switch-out-bytes'),
// Block device statistics; the device comes from config.hardware.block and
// defaults to 'sda'.
block_sda_wr_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write'),
block_sda_wr_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write_bytes'),
block_sda_wr_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'write_bytes'),
block_sda_rd_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read'),
block_sda_rd_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read_bytes'),
block_sda_rd_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'read_bytes'),
// GPFS statistics over the 'all' device.
netdrv_gpfs_rx: map_helpers.device('gpfs', 'all', 'read_bytes'),
netdrv_gpfs_rx_cov: map_helpers.device_cov('gpfs', 'all', 'read_bytes'),
netdrv_gpfs_rx_msgs: map_helpers.device('gpfs', 'all', 'reads'),
netdrv_gpfs_tx: map_helpers.device('gpfs', 'all', 'write_bytes'),
netdrv_gpfs_tx_cov: map_helpers.device_cov('gpfs', 'all', 'write_bytes'),
netdrv_gpfs_tx_msgs: map_helpers.device('gpfs', 'all', 'writes'),
// Isilon and Panasas metrics have no data source in this mapping: every
// entry reports error code 2 with no value.
"netdrv_isilon_rx": {
error: 2
},
"netdrv_isilon_rx_cov": {
error: 2
},
"netdrv_isilon_rx_msgs": {
error: 2
},
"netdrv_isilon_tx": {
error: 2
},
"netdrv_isilon_tx_cov": {
error: 2
},
"netdrv_isilon_tx_msgs": {
error: 2
},
"netdrv_panasas_rx": {
error: 2
},
"netdrv_panasas_rx_cov": {
error: 2
},
"netdrv_panasas_rx_msgs": {
error: 2
},
"netdrv_panasas_tx": {
error: 2
},
"netdrv_panasas_tx_cov": {
error: 2
},
"netdrv_panasas_tx_msgs": {
error: 2
},
netdir_home_read: {
formula: function (job) {
if (!job.nfs) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var read = 0.0;
for (let mount in job.nfs) {
if (job.nfs.hasOwnProperty(mount)) {
if (job.nfs[mount].read && job.nfs[mount].read.avg) {
read += job.nfs[mount].read.avg
}
}
}
return { value: read, error: 0 };
}
},
netdir_home_write: {
formula: function (job) {
if (!job.nfs) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var write = 0.0;
for (let mount in job.nfs) {
if (job.nfs.hasOwnProperty(mount)) {
if (job.nfs[mount].write && job.nfs[mount].write.avg) {
write += job.nfs[mount].write.avg
}
}
}
return { value: write, error: 0 };
}
},
// NFS traffic for the projects and util mounts. The mount path comes from
// config.hardware.mounts.* with '/projects' and '/util' as defaults, resolved
// once when this object literal is built.
// NOTE(review): map_helpers.sum() is defined in ../map_helpers.js; presumably
// the first array is a path into the job document and the second the
// statistics to add up - confirm there.
netdir_projects_read: map_helpers.sum(
['nfs', getHardwareConfig('mounts.projects', '/projects')],
['read']
),
netdir_projects_write: map_helpers.sum(
['nfs', getHardwareConfig('mounts.projects', '/projects')],
['write']
),
netdir_util_read: map_helpers.sum(
['nfs', getHardwareConfig('mounts.util', '/util')],
['read']
),
netdir_util_write: map_helpers.sum(
['nfs', getHardwareConfig('mounts.util', '/util')],
['write']
),
// Ethernet statistics; the interface comes from config.hardware.network and
// defaults to 'em1'.
// NOTE(review): the trailing ['lo', 'ib0', 'ib1'] list is presumably a set of
// interfaces to exclude - confirm in map_helpers.
net_eth0_rx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']),
net_eth0_rx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']),
net_eth0_rx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-packets', ['lo', 'ib0', 'ib1']),
net_eth0_tx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']),
net_eth0_tx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']),
net_eth0_tx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-packets', ['lo', 'ib0', 'ib1']),
// ib0 interface averages read directly from the job document.
"net_ib0_rx": {
ref: "network.ib0.in-bytes.avg"
},
"net_ib0_rx_packets": {
ref: "network.ib0.in-packets.avg"
},
"net_ib0_tx": {
ref: "network.ib0.out-bytes.avg"
},
"net_ib0_tx_packets": {
ref: "network.ib0.out-packets.avg"
},
gpu_energy: {
formula: function (job) {
if (!job.gpupower) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var energy = 0.0;
var device_count = 0;
for (let gpu in job.gpupower) {
if (job.gpupower.hasOwnProperty(gpu)) {
if (job.gpupower[gpu].energy && job.gpupower[gpu].energy.avg) {
energy += job.gpupower[gpu].energy.avg;
device_count += 1;
}
}
}
if (device_count === 0) {
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
}
return { value: energy, error: 0 };
}
},
gpu_max_power: {
formula: function (job) {
if (!job.gpupower) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var max_power = 0.0;
var device_count = 0;
for (let gpu in job.gpupower) {
if (job.gpupower.hasOwnProperty(gpu)) {
if (job.gpupower[gpu].power && job.gpupower[gpu].power.max) {
if (job.gpupower[gpu].power.max.max) {
max_power = Math.max(max_power, job.gpupower[gpu].power.max.max);
device_count += 1;
} else if (job.gpupower[gpu].power.max.avg) {
max_power = Math.max(max_power, job.gpupower[gpu].power.max.avg);
device_count += 1;
}
}
}
}
if (device_count === 0) {
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
}
return { value: max_power, error: 0 };
}
},
"gpu0_nv_mem_used": {
formula: function(job) {
if (!job.gpu) {
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
}
var job_gpus = this.ref(job, "acct.gpus");
if (job_gpus.value === undefined || job_gpus.value === 0) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var util = 0.0;
for (let gpu in job.gpu) {
if (job.gpu.hasOwnProperty(gpu)) {
if (job.gpu[gpu].memused && job.gpu[gpu].memused.avg) {
util += job.gpu[gpu].memused.avg;
}
}
}
return { value: util / job_gpus.value, error: 0 };
}
},
"gpu0_nv_utilization": {
formula: function(job) {
if (!job.gpu) {
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value };
}
var job_gpus = this.ref(job, "acct.gpus");
if (job_gpus.value === undefined || job_gpus.value === 0) {
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value };
}
var util = 0.0;
for (let gpu in job.gpu) {
if (job.gpu.hasOwnProperty(gpu)) {
if (job.gpu[gpu].util && job.gpu[gpu].util.avg) {
util += job.gpu[gpu].util.avg;
}
}
}
return { value: util / job_gpus.value / 100.0, error: 0 };
}
}
}
};
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment