Created
April 1, 2021 15:56
-
-
Save treydock/a3de2fdd2d7df6da5e332fd9a2d48dc5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var app_ident = require('../app_ident.js'); | |
var map_helpers = require('../map_helpers.js'); | |
module.exports = function(config) { | |
var appident = app_ident(config.applicationDefn); | |
/**
 * Resolve a dot-separated setting path (for example 'mounts.projects')
 * starting from config.hardware.
 *
 * @param {string} setting - Dot-separated property path relative to config.hardware.
 * @param {*} default_val - Value returned when any segment of the path is absent.
 * @returns {*} The value found at the path, or default_val if the path cannot be fully resolved.
 */
var getHardwareConfig = function (setting, default_val) {
    var val = config.hardware;
    var props = setting.split('.');
    for (let i = 0; i < props.length; i++) {
        // null, like undefined, has no properties; treat it as "not configured"
        // instead of throwing while probing it.
        var resolvable = val !== null && typeof val !== 'undefined';
        // Borrow hasOwnProperty from Object.prototype so objects that shadow it
        // or have no prototype can still be probed.
        if (resolvable && Object.prototype.hasOwnProperty.call(val, props[i])) {
            val = val[props[i]];
        } else {
            val = default_val;
            break;
        }
    }
    return val;
};
/**
 * Identify the application a job ran from its process dump.
 *
 * The constrained process list is tried first, then the unconstrained list.
 * If neither list is recognised by appident, the first available process is
 * reported under the name 'uncategorized'.
 *
 * @param {Object} job - Job document; job.procDump.constrained and
 *     job.procDump.unconstrained are read when present.
 * @returns {?Object} Whatever appident returned for a recognised list, an
 *     {executable, name: 'uncategorized'} object for an unrecognised one, or
 *     a falsy value when nothing could be determined.
 */
var getProcInfo = function (job) {
    var app = null;
    if (job.procDump && job.procDump.constrained) {
        var constrained = job.procDump.constrained;
        // The unconstrained list is optional: only the constrained list is
        // checked by the guard above, so it must be tested before each use.
        var unconstrained = job.procDump.unconstrained;

        app = appident(constrained);
        if (!app && unconstrained) {
            app = appident(unconstrained);
        }
        if (!app) {
            if (constrained.length > 0) {
                return {
                    executable: constrained[0],
                    name: 'uncategorized'
                };
            }
            if (unconstrained && unconstrained.length > 0) {
                return {
                    executable: unconstrained[0],
                    name: 'uncategorized'
                };
            }
        }
    }
    return app;
};
return { | |
id: config.resource_id, | |
name: config.resource, | |
long_name: config.resource, | |
gpfs: config.hardware.gpfs, | |
nodes: 0, | |
ppn: 0, | |
start_date: "1970-01-01", | |
// The summary documents use compression where the various statistics
// for a metric are omitted if they have default values. The avg
// metric is always provided so that it is possible to determine
// whether the statistic is missing due to it being a default value.
"getcov": function(job, metricname) { | |
if (Array.isArray(metricname)) { | |
for (var i = 0; i < metricname.length; i++) { | |
var res = this.getcov.call(this, job, metricname[i]); | |
if (res.error === 0) { | |
return res; | |
} | |
} | |
return { | |
value: null, | |
error: 2 | |
}; | |
} | |
var cov = this.ref(job, metricname + ".cov"); | |
if (cov.error === 0) { | |
return cov; | |
} | |
var avg = this.ref(job, metricname + ".avg"); | |
if (avg.error === 0) { | |
// Avg is present but cov absent, therefore cov is default value of 0.0 | |
return { | |
value: 0.0, | |
error: 0 | |
}; | |
} | |
return { | |
value: null, | |
error: cov.error | |
}; | |
}, | |
"getmax": function(job, metricname) { | |
if (Array.isArray(metricname)) { | |
for (var i = 0; i < metricname.length; i++) { | |
var res = this.getmax(job, metricname[i]); | |
if (res.error === 0) { | |
return res; | |
} | |
} | |
return { | |
value: null, | |
error: 2 | |
}; | |
} | |
var maxval = this.ref(job, metricname + ".max"); | |
if (maxval.error === 0) { | |
return maxval; | |
} | |
var avg = this.ref(job, metricname + ".avg"); | |
if (avg.error === 0) { | |
// Avg is present but max absent, therefore max is same as avg | |
return avg; | |
} | |
return { | |
value: null, | |
error: maxval.error | |
}; | |
}, | |
"devices": { | |
"block_sda": { | |
"name": "/sda", | |
"bytes_per_sector": 512 | |
}, | |
"netdrv_isilon": { | |
"name": "ifs" | |
}, | |
"netdrv_panasas": { | |
"name": "panfs" | |
}, | |
"net_eth0": { | |
"name": "em1" | |
}, | |
"net_ib0": { | |
"name": "ib0" | |
} | |
}, | |
"attributes": { | |
"local_job_id": { | |
ref: "acct.id" | |
}, | |
"name": { | |
ref: "acct.jobname" | |
}, | |
"resource_name": { | |
formula: function() { | |
return {value: this.name, error: 0}; | |
} | |
}, | |
"resource_id": { | |
formula: function() { | |
return {value: this.id, error: 0}; | |
} | |
}, | |
"organization_id": { | |
value: 1 | |
}, | |
"account": { | |
ref: "acct.account" | |
}, | |
"username": { | |
ref: "acct.user" | |
}, | |
"cwd": { | |
error: 2 | |
}, | |
executable: { | |
formula: function (job) { | |
var app = getProcInfo(job); | |
if (app) { | |
return { | |
value: app.executable, | |
error: 0 | |
}; | |
} | |
return { | |
value: null, | |
error: this.metricErrors.codes.metricMissingUnknownReason.value | |
}; | |
} | |
}, | |
application: { | |
formula: function (job) { | |
var app = getProcInfo(job); | |
if (app) { | |
return { | |
value: app.name, | |
error: 0 | |
}; | |
} | |
return { | |
value: null, | |
error: this.metricErrors.codes.metricMissingUnknownReason.value | |
}; | |
} | |
}, | |
"exit_status": { | |
formula: function(job) { | |
var exit = this.ref(job, "acct.exit_status"); | |
if (exit.error === 0 && exit.value) { | |
exit.value = exit.value.split(" ")[0]; | |
} | |
return exit; | |
} | |
}, | |
"datasource": { | |
value: "prometheus" | |
}, | |
"granted_pe": { | |
ref: "acct.ncpus" | |
}, | |
"queue": { | |
ref: "acct.partition" | |
}, | |
"requested_nodes": { | |
ref: "acct.nodes" | |
}, | |
"hosts": { | |
ref: "acct.host_list", | |
required: true | |
}, | |
"nodes": { | |
ref: "acct.nodes", | |
required: true | |
}, | |
"shared": { | |
formula: function(job) { | |
if (job.hasOwnProperty("shared")) { | |
return { | |
value: job.shared ? 1 : 0, | |
error: 0 | |
}; | |
} else { | |
return { | |
value: 0, | |
error: 0 | |
}; | |
} | |
} | |
}, | |
"cores": { | |
ref: "acct.ncpus", | |
required: true | |
}, | |
"cores_avail": { | |
formula: function(job) { | |
if (job.summarization.complete && job.hasOwnProperty("cpu") && job.cpu.hasOwnProperty("nodecpus") && ! job.cpu.nodecpus.hasOwnProperty("error")) { | |
return this.ref(job, "cpu.nodecpus.all.cnt"); | |
} else { | |
return { | |
value: 0, | |
error: this.metricErrors.codes.missingCollectionFailed.value | |
}; | |
} | |
} | |
}, | |
"submit_time_ts": { | |
ref: "acct.submit", | |
required: true | |
}, | |
"eligible_time_ts": { | |
ref: "acct.eligible" | |
}, | |
"start_time_ts": { | |
ref: "acct.start_time", | |
required: true | |
}, | |
"end_time_ts": { | |
ref: "acct.end_time", | |
required: true | |
}, | |
"wall_time": { | |
formula: function(job) { | |
var end_time = this.ref(job, this.attributes.end_time_ts.ref); | |
var start_time = this.ref(job, this.attributes.start_time_ts.ref); | |
var combined_error = end_time.error | start_time.error; | |
if (end_time.value === undefined || start_time.value === undefined) { | |
return { | |
value: null, | |
error: combined_error | |
}; | |
} | |
return { | |
value: end_time.value - start_time.value, | |
error: combined_error | |
}; | |
}, | |
required: true | |
}, | |
"requested_wall_time": { | |
formula: function(job) { | |
var timelimit = this.ref(job, "acct.timelimit"); | |
if (timelimit.error !== 0 || timelimit.value === null) { | |
return { | |
value: null, | |
error: 2 | |
}; | |
} | |
if (typeof timelimit.value === "number") { | |
return { | |
value: timelimit.value, | |
error: 0 | |
}; | |
} | |
var result = timelimit.value.match(/^(?:([0-9]+)-)?([0-9]{2}):([0-9]{2}):([0-9]{2})$/); | |
if (result) { | |
if (result[1]) { | |
return { | |
value: (24 * 3600 * result[1]) + (3600 * result[2]) + (60 * result[3]) + (1 * result[4]), | |
error: 0 | |
}; | |
} else { | |
return { | |
value: (3600 * result[2]) + (60 * result[3]) + (1 * result[4]), | |
error: 0 | |
}; | |
} | |
} else { | |
return { | |
value: null, | |
error: 2 | |
}; | |
} | |
} | |
}, | |
"wait_time": { | |
formula: function(job) { | |
var start_time = this.ref(job, this.attributes.start_time_ts.ref); | |
var submit_time = this.ref(job, this.attributes.submit_time_ts.ref); | |
var combined_error = start_time.error | submit_time.error; | |
if (start_time.value === undefined || submit_time.value === undefined) { | |
return { | |
value: null, | |
error: combined_error | |
}; | |
} | |
return { | |
value: start_time.value - submit_time.value, | |
error: combined_error | |
}; | |
}, | |
required: true | |
}, | |
"cpu_time": { | |
formula: function(job) { | |
var wall_time = this.attributes.wall_time.formula.call(this, job); | |
var num_cores = this.ref(job, this.attributes.cores.ref); | |
var combined_error = wall_time.error | num_cores.error; | |
if (wall_time.value === undefined || num_cores.value === undefined) { | |
return { | |
value: null, | |
error: combined_error | |
}; | |
} | |
return { | |
value: wall_time.value * num_cores.value, | |
error: combined_error | |
}; | |
}, | |
required: true | |
}, | |
"node_time": { | |
formula: function(job) { | |
var wall_time = this.attributes.wall_time.formula.call(this, job); | |
var num_nodes = this.ref(job, this.attributes.nodes.ref); | |
var combined_error = wall_time.error | num_nodes.error; | |
if (wall_time.value === undefined || num_nodes.value === undefined) { | |
return { | |
value: null, | |
error: combined_error | |
}; | |
} | |
return { | |
value: wall_time.value * num_nodes.value, | |
error: combined_error | |
}; | |
}, | |
required: true | |
}, | |
"cpu_idle": { | |
ref: ["cpu.cgroup.idle.avg", "cpu.jobcpus.idle.avg", "cpu.nodecpus.idle.avg"] | |
}, | |
"cpu_system": { | |
ref: ["cpu.cgroup.system.avg", "cpu.jobcpus.system.avg", "cpu.nodecpus.system.avg"] | |
}, | |
"cpu_user": { | |
ref: ["cpu.cgroup.user.avg", "cpu.jobcpus.user.avg", "cpu.nodecpus.user.avg"] | |
}, | |
"error": { | |
error: 2 | |
}, | |
"flops": { | |
ref: "cpuperf.flops.avg" | |
}, | |
"flops_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "cpuperf.flops"); | |
} | |
}, | |
"cpiref": { | |
ref: "cpuperf.cpiref.avg" | |
}, | |
"cpiref_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "cpuperf.cpiref"); | |
} | |
}, | |
catastrophe: { | |
formula: function (job) { | |
var result = { | |
value: null, | |
error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value | |
}; | |
if (job.catastrophe) { | |
if (job.catastrophe.error) { | |
switch (job.catastrophe.error) { | |
case 1: | |
result.error = this.metricErrors.codes.metricDisabledByUser.value; | |
break; | |
case 2: | |
result.error = this.metricErrors.codes.metricInsufficientData.value; | |
break; | |
case 6: | |
result.error = this.metricErrors.codes.metricCounterRollover.value; | |
break; | |
default: | |
result.error = this.metricErrors.codes.metricMissingUnknownReason.value; | |
break; | |
} | |
} else if (Number.isNaN(job.catastrophe.value)) { | |
result.error = this.metricErrors.codes.metricSummarizationError.value; | |
} else { | |
result.value = job.catastrophe.value; | |
result.error = 0; | |
} | |
} | |
return result; | |
} | |
}, | |
"cpldref": { | |
ref: "cpuperf.cpldref.avg" | |
}, | |
"cpldref_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "cpuperf.cpldref"); | |
} | |
}, | |
"mem_transferred": { | |
ref: "uncperf.membw.avg" | |
}, | |
"mem_transferred_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "uncperf.membw"); | |
} | |
}, | |
"cpu_user_imbalance": { | |
formula: function(job) { | |
var cpu_count = this.ref(job, ["cpu.jobcpus.user.cnt", "cpu.nodecpus.user.cnt"]); | |
var cpu_user_min = this.ref(job, ["cpu.cgroup.user.min", "cpu.jobcpus.user.min", "cpu.nodecpus.user.min"]); | |
var cpu_user_max = this.ref(job, ["cpu.cgroup.user.max", "cpu.jobcpus.user.max", "cpu.nodecpus.user.max"]); | |
var error = cpu_user_min.error | cpu_user_max.error | cpu_count.error; | |
if (error === 0) { | |
if (cpu_count.value <= 1) { | |
return { | |
value: 0.0, | |
error: error | |
}; | |
} else { | |
return { | |
value: 100.0 * (cpu_user_max.value - cpu_user_min.value) / cpu_user_max.value, | |
error: error | |
}; | |
} | |
} else { | |
return { | |
value: null, | |
error: error | |
}; | |
} | |
} | |
}, | |
"cpu_user_cv": { | |
formula: function(job) { | |
return this.getcov.call(this, job, ["cpu.cgroup.user", "cpu.jobcpus.user", "cpu.nodecpus.user"]); | |
} | |
}, | |
node_cpu_idle: { | |
ref: 'cpu.nodecpus.idle.avg' | |
}, | |
energy: { | |
ref: 'ipmi.energy.avg' | |
}, | |
max_power: { | |
formula: function (job) { | |
return this.getmax(job, 'ipmi.power.max'); | |
} | |
}, | |
"memory_used": { | |
formula: function(job) { | |
var mem = this.ref(job, "memory.used_minus_cache.avg"); | |
if (mem.error === 0) { | |
return { | |
value: mem.value * 1024.0, | |
error: 0 | |
}; | |
} | |
return { | |
value: null, | |
error: mem.error | |
}; | |
} | |
}, | |
"memory_used_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "memory.used_minus_cache"); | |
} | |
}, | |
"max_memory": { | |
formula: function(job) { | |
return this.getmax(job, 'process_memory.usageratio.max'); | |
} | |
}, | |
"mem_used_including_os_caches": { | |
formula: function(job) { | |
var mem = this.ref(job, "memory.used.avg"); | |
if (mem.error === 0) { | |
return { | |
value: mem.value * 1024.0, | |
error: 0 | |
}; | |
} | |
return { | |
value: null, | |
error: mem.error | |
}; | |
} | |
}, | |
"mem_used_including_os_caches_cov": { | |
formula: function(job) { | |
return this.getcov.call(this, job, "memory.used"); | |
} | |
}, | |
"ib_rx_bytes": map_helpers.device('infiniband', 'all', 'switch-out-bytes'), | |
block_sda_wr_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write'), | |
block_sda_wr_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'write_bytes'), | |
block_sda_wr_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'write_bytes'), | |
block_sda_rd_ios: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read'), | |
block_sda_rd_bytes: map_helpers.device('block', getHardwareConfig('block', 'sda'), 'read_bytes'), | |
block_sda_rd_bytes_cov: map_helpers.device_cov('block', getHardwareConfig('block', 'sda'), 'read_bytes'), | |
netdrv_gpfs_rx: map_helpers.device('gpfs', 'all', 'read_bytes'), | |
netdrv_gpfs_rx_cov: map_helpers.device_cov('gpfs', 'all', 'read_bytes'), | |
netdrv_gpfs_rx_msgs: map_helpers.device('gpfs', 'all', 'reads'), | |
netdrv_gpfs_tx: map_helpers.device('gpfs', 'all', 'write_bytes'), | |
netdrv_gpfs_tx_cov: map_helpers.device_cov('gpfs', 'all', 'write_bytes'), | |
netdrv_gpfs_tx_msgs: map_helpers.device('gpfs', 'all', 'writes'), | |
"netdrv_isilon_rx": { | |
error: 2 | |
}, | |
"netdrv_isilon_rx_cov": { | |
error: 2 | |
}, | |
"netdrv_isilon_rx_msgs": { | |
error: 2 | |
}, | |
"netdrv_isilon_tx": { | |
error: 2 | |
}, | |
"netdrv_isilon_tx_cov": { | |
error: 2 | |
}, | |
"netdrv_isilon_tx_msgs": { | |
error: 2 | |
}, | |
"netdrv_panasas_rx": { | |
error: 2 | |
}, | |
"netdrv_panasas_rx_cov": { | |
error: 2 | |
}, | |
"netdrv_panasas_rx_msgs": { | |
error: 2 | |
}, | |
"netdrv_panasas_tx": { | |
error: 2 | |
}, | |
"netdrv_panasas_tx_cov": { | |
error: 2 | |
}, | |
"netdrv_panasas_tx_msgs": { | |
error: 2 | |
}, | |
netdir_home_read: { | |
formula: function (job) { | |
if (!job.nfs) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var read = 0.0; | |
for (let mount in job.nfs) { | |
if (job.nfs.hasOwnProperty(mount)) { | |
if (job.nfs[mount].read && job.nfs[mount].read.avg) { | |
read += job.nfs[mount].read.avg | |
} | |
} | |
} | |
return { value: read, error: 0 }; | |
} | |
}, | |
netdir_home_write: { | |
formula: function (job) { | |
if (!job.nfs) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var write = 0.0; | |
for (let mount in job.nfs) { | |
if (job.nfs.hasOwnProperty(mount)) { | |
if (job.nfs[mount].write && job.nfs[mount].write.avg) { | |
write += job.nfs[mount].write.avg | |
} | |
} | |
} | |
return { value: write, error: 0 }; | |
} | |
}, | |
netdir_projects_read: map_helpers.sum( | |
['nfs', getHardwareConfig('mounts.projects', '/projects')], | |
['read'] | |
), | |
netdir_projects_write: map_helpers.sum( | |
['nfs', getHardwareConfig('mounts.projects', '/projects')], | |
['write'] | |
), | |
netdir_util_read: map_helpers.sum( | |
['nfs', getHardwareConfig('mounts.util', '/util')], | |
['read'] | |
), | |
netdir_util_write: map_helpers.sum( | |
['nfs', getHardwareConfig('mounts.util', '/util')], | |
['write'] | |
), | |
net_eth0_rx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']), | |
net_eth0_rx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'in-bytes', ['lo', 'ib0', 'ib1']), | |
net_eth0_rx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'in-packets', ['lo', 'ib0', 'ib1']), | |
net_eth0_tx: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']), | |
net_eth0_tx_cov: map_helpers.device_cov('network', getHardwareConfig('network', 'em1'), 'out-bytes', ['lo', 'ib0', 'ib1']), | |
net_eth0_tx_packets: map_helpers.device('network', getHardwareConfig('network', 'em1'), 'out-packets', ['lo', 'ib0', 'ib1']), | |
"net_ib0_rx": { | |
ref: "network.ib0.in-bytes.avg" | |
}, | |
"net_ib0_rx_packets": { | |
ref: "network.ib0.in-packets.avg" | |
}, | |
"net_ib0_tx": { | |
ref: "network.ib0.out-bytes.avg" | |
}, | |
"net_ib0_tx_packets": { | |
ref: "network.ib0.out-packets.avg" | |
}, | |
gpu_energy: { | |
formula: function (job) { | |
if (!job.gpupower) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var energy = 0.0; | |
var device_count = 0; | |
for (let gpu in job.gpupower) { | |
if (job.gpupower.hasOwnProperty(gpu)) { | |
if (job.gpupower[gpu].energy && job.gpupower[gpu].energy.avg) { | |
energy += job.gpupower[gpu].energy.avg; | |
device_count += 1; | |
} | |
} | |
} | |
if (device_count === 0) { | |
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
} | |
return { value: energy, error: 0 }; | |
} | |
}, | |
gpu_max_power: { | |
formula: function (job) { | |
if (!job.gpupower) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var max_power = 0.0; | |
var device_count = 0; | |
for (let gpu in job.gpupower) { | |
if (job.gpupower.hasOwnProperty(gpu)) { | |
if (job.gpupower[gpu].power && job.gpupower[gpu].power.max) { | |
if (job.gpupower[gpu].power.max.max) { | |
max_power = Math.max(max_power, job.gpupower[gpu].power.max.max); | |
device_count += 1; | |
} else if (job.gpupower[gpu].power.max.avg) { | |
max_power = Math.max(max_power, job.gpupower[gpu].power.max.avg); | |
device_count += 1; | |
} | |
} | |
} | |
} | |
if (device_count === 0) { | |
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
} | |
return { value: max_power, error: 0 }; | |
} | |
}, | |
"gpu0_nv_mem_used": { | |
formula: function(job) { | |
if (!job.gpu) { | |
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
} | |
var job_gpus = this.ref(job, "acct.gpus"); | |
if (job_gpus.value === undefined || job_gpus.value === 0) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var util = 0.0; | |
for (let gpu in job.gpu) { | |
if (job.gpu.hasOwnProperty(gpu)) { | |
if (job.gpu[gpu].memused && job.gpu[gpu].memused.avg) { | |
util += job.gpu[gpu].memused.avg; | |
} | |
} | |
} | |
return { value: util / job_gpus.value, error: 0 }; | |
} | |
}, | |
"gpu0_nv_utilization": { | |
formula: function(job) { | |
if (!job.gpu) { | |
return { value: null, error: this.metricErrors.codes.missingCollectionFailed.value }; | |
} | |
var job_gpus = this.ref(job, "acct.gpus"); | |
if (job_gpus.value === undefined || job_gpus.value === 0) { | |
return { value: null, error: this.metricErrors.codes.metricMissingNotAvailOnThisHost.value }; | |
} | |
var util = 0.0; | |
for (let gpu in job.gpu) { | |
if (job.gpu.hasOwnProperty(gpu)) { | |
if (job.gpu[gpu].util && job.gpu[gpu].util.avg) { | |
util += job.gpu[gpu].util.avg; | |
} | |
} | |
} | |
return { value: util / job_gpus.value / 100.0, error: 0 }; | |
} | |
} | |
} | |
}; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment