-
-
Save michaeldexter/6d6d8fe7010bc8099625b1b5ec17b780 to your computer and use it in GitHub Desktop.
[Dtrace snippets using io::: probes] One-liners and Scripts for observation at lower levels of the IO stack #tags: sd, io, io probes, stable probes, block device, dtrace, bandwidth, throughput, io errors, io retries
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
BEGIN { | |
printf("timestamp(ns),latency(us),iotype,device,filename,blk.number,blk.count"); | |
} | |
io:::start { | |
ts[args[0]->b_edev, args[0]->b_blkno] = timestamp; | |
} | |
io:::done /ts[args[0]->b_edev, args[0]->b_blkno] != 0/ { | |
this->lat_us = (timestamp - ts[args[0]->b_edev, args[0]->b_blkno]) | |
/ 1000; | |
this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" : | |
args[0]->b_flags & B_READ ? "READ" : "NULL"; | |
this->device = args[1]->dev_statname; | |
this->fp = args[2]->fi_pathname == "<unknown>" ? "UNKNOWN" : | |
args[2]->fi_pathname == "<none>" ? "UNKNOWN" : | |
args[2]->fi_pathname; | |
this->blocks = args[0]->b_bcount / 512; /* Assuming a 512b formatted dev */ | |
printf("%d,%d,%s,%s,%s,0x%x,%d\n", walltimestamp, | |
this->lat_us, this->iotype, this->device, | |
this->fp, args[0]->b_blkno, this->blocks); | |
}' 2> /dev/null |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -Cs | |
#pragma D option quiet | |
#pragma D option dynvarsize=16M | |
/* | |
* CDDL HEADER START | |
* | |
* The contents of this file are subject to the terms of the | |
* Common Development and Distribution License (the "License"). | |
* You may not use this file except in compliance with the License. | |
* | |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
* or http://www.opensolaris.org/os/licensing. | |
* See the License for the specific language governing permissions | |
* and limitations under the License. | |
* | |
* When distributing Covered Code, include this CDDL HEADER in each | |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
* If applicable, add the following below this CDDL HEADER, with the | |
* fields enclosed by brackets "[]" replaced with your own identifying | |
* information: Portions Copyright [yyyy] [name of copyright owner] | |
* | |
* CDDL HEADER END | |
* | |
* Copyright (c) 2017 Sam Zaydel / RackTop Systems. | |
* | |
* bw-tput-iops-actv-time-hist-csv.d | |
* This script is meant to be a more informative replacement for the | |
* iostat utility. It measures some features that iostat in my opinion | |
* is not useful for. For example, iostat does not offer distributions | |
* at all, whereas here we plot distributions of IO rates as observed | |
* for each disk. Also, iostat does not offer operating ranges for | |
* measurements, which here we define as max - min of some measurement. | |
* | |
* Currently, script is limited to scsi_vhci devices, because they are | |
* ones we normally use for actual data storage. Anything else is either | |
* a locally attached device, or something not used by system, i.e. | |
* USB storage media, etc. | |
*/ | |
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */ | |
unsigned long minratei[dev_t], maxratei[dev_t]; /* IO Rate min and max */ | |
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */ | |
unsigned long iocnt[dev_t, int]; | |
int pend[dev_t]; | |
hrtime_t start[dev_t, uint64_t]; | |
hrtime_t ival_timer[dev_t]; | |
hrtime_t ticks; | |
inline const int NSEC_PER_SEC = 1000000000; | |
inline const int NSEC_PER_MSEC = 1000000; | |
inline const int NSEC_PER_USEC = 1000; | |
inline const int SPA_MAXBLOCKSIZE = 128 << 10; | |
inline const int R = 0; | |
inline const int W = 1; | |
/* | |
* These are the Output Parameter definitions for collected metrics | |
* | |
* sdname == Name of device, i.e. sd0 | |
* mpxiowwn == mpxio device, like: 5000cca24500f698 | |
* actvtm == amount of real, busy time spent processing IO | |
* rangerlat, rangewlat == latency range: max - min for Reads and Writes | |
* totbytes == total number of bytes Read and Written during sample interval | |
* tput,maxtput == mean and maximum or burst throughput | |
* ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause | |
* aviosz,rangeiosz == IO size mean and range: max - min | |
* iops,maxiops == normalized IOPS mean and maximum | |
* avKBps,rangeKBps == normalized IO rate and IO rate range: max - min | |
* // Histogram of IO Rate distribution with 4 buckets // | |
* [ ratelt1MBps ] ==> 1 second interval with < 1,000KB/s bw | |
* [ rate10MBps ] ==> 1 second interval with < 10,000KB/s bw | |
* [ rate100MBps ] ==> 1 second interval with < 100,000KB/s bw | |
* [ rate1GBps ] ==> 1 second interval with < 1,000,000KB/s bw | |
* avtime,maxtime == average and maximum IO completion latency | |
* // Histogram of latency distribution with 6 buckets // | |
* [ timegt1000ms ] ==> >1 second (SLOW!) | |
* [ time100ms ] ==> >50ms && 100ms<= (not acceptable) | |
* [ time50ms ] ==> >25ms && 50ms<= (acceptable) | |
* [ time25ms ] ==> >10ms && 25ms<= (ok) | |
* [ time10ms ] ==> >1ms && 10ms<= (good) | |
* [ timelt1ms ] ==> 1ms< (probably cache) | |
* // Histogram of IO size distribution with 7 buckets // | |
* [ iosztiny ] ==> 4K< | |
* [ iosz4k ] ==> between 4K and 8K | |
* [ iosz8k ] ==> between 8K and 16K | |
* [ iosz16k ] ==> between 16K and 32K | |
* [ iosz32k ] ==> between 32K and 64K | |
* [ iosz64k ] ==> between 64K and 128K | |
* [ ioszbig ] ==> anything 128K or above | |
* avpending == Average measured IOs in processing per sample interval | |
* maxpending == Maximum measured IOs in processing per sample interval | |
* cterr == Counter tracking number of errors per sample interval | |
* ctretry == Counter tracking number of retries per sample interval | |
*/ | |
BEGIN { | |
printf("sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n"); | |
} | |
/* | |
* Set interval timer initial value once for a given device. | |
* Each subsequent update will happen roughly once per second, | |
* or if no IO, whenever there is some IO that triggers io:::done | |
* probe, at which point we determine that timer expired. | |
*/ | |
io:::start | |
/ival_timer[args[0]->b_edev] == 0/ { | |
ival_timer[args[0]->b_edev] = timestamp; | |
} | |
io:::start { | |
start[args[0]->b_edev, args[0]->b_blkno] = timestamp; | |
/* Increment pending IOs by 1 */ | |
pend[args[0]->b_edev]++; | |
} | |
io:::done | |
/start[args[0]->b_edev, args[0]->b_blkno]/ { | |
this->sn = args[1]->dev_statname; | |
this->p = substr(args[1]->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
this->x = args[0]->b_bcount * 976562; | |
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno]; | |
start[args[0]->b_edev, args[0]->b_blkno] = 0; | |
/* Decrement pending IOs by 1, set to 0 if value < 0 */ | |
pend[args[0]->b_edev]--; | |
pend[args[0]->b_edev] = | |
pend[args[0]->b_edev] > 0 ? | |
pend[args[0]->b_edev] : 0; /* avoid underflow */ | |
/* Total Number of bytes per device */ | |
@totbytes[this->sn, this->p] = sum(args[0]->b_bcount); | |
/* Total nanoseconds of active time per device */ | |
@actv_tm[this->sn, this->p] = sum(this->elapsed); | |
/* | |
* Instead of converting nanoseconds to seconds we multiply | |
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds. | |
* In extreme observations, i.e. stalled IO, we may have huge | |
* this->elapsed values, in which case result will be 0 KB/s, even | |
* if there in fact was a non-zero value of bytes transferred. | |
*/ | |
this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC; | |
this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10; | |
/* Measure IO rate range in KB/s */ | |
@rangeKBps[this->sn, this->p] = | |
max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]); | |
/* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */ | |
/* 1000KB/s <= */ | |
@ratelt1MBps[this->sn, this->p] = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0); | |
/* > 1000KB/s && 10,000KB/s <= */ | |
@rate10MBps[this->sn, this->p] = sum(this->kb_per_sec > 0x3e8 && | |
this->kb_per_sec <= 0x2710 ? 1 : 0); | |
/* > 10,000KB/s && 100,000KB/s <= */ | |
@rate100MBps[this->sn, this->p] = sum(this->kb_per_sec > 0x2710 && | |
this->kb_per_sec <= 0x186a0 ? 1 : 0); | |
/* > 100,000KB/s && 1,000,000 KB/s <= */ | |
@rate1GBps[this->sn, this->p] = sum(this->kb_per_sec > 0x186a0 && | |
this->kb_per_sec <= 0xf4240 ? 1 : 0); | |
/* | |
* Collect minimum and maximum observed rate for later measurement | |
* of range for this metric. | |
*/ | |
minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ? | |
this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ? | |
minratei[args[0]->b_edev] : this->kb_per_sec; | |
maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ? | |
this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ? | |
maxratei[args[0]->b_edev] : this->kb_per_sec; | |
/* Actual Kbytes/sec per device */ | |
@avKBps[this->sn, this->p] = avg(this->kb_per_sec); | |
/* Average and Maximum Latency per device */ | |
@avtime[this->sn, this->p] = avg(this->elapsed); | |
@maxtime[this->sn, this->p] = max(this->elapsed); | |
/* | |
* Latency histogram with buckets: | |
* >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms< | |
*/ | |
@timegt1000ms[this->sn, this->p] = sum( | |
this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0); | |
@time100ms[this->sn, this->p] = sum( | |
this->elapsed > 50 * NSEC_PER_MSEC && | |
this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0); | |
@time50ms[this->sn, this->p] = sum( | |
this->elapsed > 25 * NSEC_PER_MSEC && | |
this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0); | |
@time25ms[this->sn, this->p] = sum( | |
this->elapsed > 10 * NSEC_PER_MSEC && | |
this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0); | |
@time10ms[this->sn, this->p] = sum( | |
this->elapsed > 1 * NSEC_PER_MSEC && | |
this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0); | |
@timelt1ms[this->sn, this->p] = sum( | |
this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0); | |
/* | |
* Collect minimum and maximum io size for later measurement | |
* of range for this metric. | |
*/ | |
miniosz[args[0]->b_edev] = | |
args[0]->b_bcount < miniosz[args[0]->b_edev] ? | |
args[0]->b_bcount : miniosz[args[0]->b_edev]; | |
maxiosz[args[0]->b_edev] = | |
args[0]->b_bcount > maxiosz[args[0]->b_edev] ? | |
args[0]->b_bcount : maxiosz[args[0]->b_edev]; | |
/* Measure IO size range in Kilobytes */ | |
@rangeiosz[this->sn, this->p] = | |
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10); | |
/* | |
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain | |
* avg. effective throughput, and maximum effective throughput. | |
* Maximum throughput is likely measuring cache effects. | |
*/ | |
@tput[this->sn, this->p] = avg(this->x / this->elapsed); | |
@maxtput[this->sn, this->p] = max(this->x / this->elapsed); | |
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0; | |
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0; | |
/* Count number of IOs by IO-type */ | |
@ctrd[this->sn, this->p] = sum(args[0]->b_flags & B_READ ? 1 : 0); | |
@ctwr[this->sn, this->p] = sum(args[0]->b_flags & B_WRITE ? 1 : 0); | |
@iops[this->sn, this->p] = | |
count(); /* Normalized to per second in tick-X probe */ | |
/* Maximum sampled IOPS per device */ | |
@maxiops[this->sn, this->p] = | |
max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]); | |
/* | |
* Collect minimum and maximum latency for later measurement | |
* of range for this metric. | |
*/ | |
minlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed : | |
minlati[args[0]->b_edev, R]; | |
maxlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed : | |
maxlati[args[0]->b_edev, R]; | |
minlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed : | |
minlati[args[0]->b_edev, W]; | |
maxlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed : | |
maxlati[args[0]->b_edev, W]; | |
/* | |
* IOsize distribution not grouped by direction, i.e. no distinction | |
* is made between reads and writes. IO buckets double in size from | |
* previous bucket. i.e. 4, 8, 16, 32... | |
*/ | |
this->bs = args[0]->b_bcount ; | |
/* 4K< */ | |
@iosztiny[this->sn, this->p] = | |
sum(this->bs < 0x1000 ? 1 : 0); | |
/* 4K to 8K< */ | |
@iosz4k[this->sn, this->p] = | |
sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0); | |
/* 8K to 16K< */ | |
@iosz8k[this->sn, this->p] = | |
sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0); | |
/* 16K to 32K< */ | |
@iosz16k[this->sn, this->p] = | |
sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0); | |
/* 32K to 64K< */ | |
@iosz32k[this->sn, this->p] = | |
sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0); | |
/* 64K to 128K< */ | |
@iosz64k[this->sn, this->p] = | |
sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0); | |
/* >128K */ | |
@ioszbig[this->sn, this->p] = | |
sum(this->bs >= 0x20000 ? 1 : 0); | |
/* Average IO size for given device */ | |
@aviosz[this->sn, this->p] = avg(this->bs); | |
/* | |
* Each time we observe an error at completion through B_ERROR flag, | |
* increment count of errors for given device. This should always | |
* be zero, assuming healthy device. | |
*/ | |
@cterr[this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0); | |
} | |
/* | |
* Entry controlled by timer. By design, each device will be registered | |
* here about once per second when there is even a litte bit of IO. | |
*/ | |
io:::done | |
/ival_timer[args[0]->b_edev] > 0 && | |
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ { | |
this->sn = args[1]->dev_statname; | |
this->p = substr(args[1]->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
/* | |
* Measure operating latency range in uS for Reads and Writes, | |
* storing largest observed difference. | |
*/ | |
@rangerlat[this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, R] - | |
minlati[args[0]->b_edev, R]) / NSEC_PER_USEC); | |
@rangewlat[this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, W] - | |
minlati[args[0]->b_edev, W]) / NSEC_PER_USEC); | |
@avpending[this->sn, this->p] = avg(pend[args[0]->b_edev]); | |
@maxpending[this->sn, this->p] = max(pend[args[0]->b_edev]); | |
/* Reset various counters for next measurement period */ | |
minlati[args[0]->b_edev, R] = 0; | |
maxlati[args[0]->b_edev, R] = 0; | |
minlati[args[0]->b_edev, W] = 0; | |
maxlati[args[0]->b_edev, W] = 0; | |
iocnt[args[0]->b_edev, R] = 0; | |
iocnt[args[0]->b_edev, W] = 0; | |
miniosz[args[0]->b_edev] = 0; | |
maxiosz[args[0]->b_edev] = 0; | |
minratei[args[0]->b_edev] = 0; | |
maxratei[args[0]->b_edev] = 0; | |
ival_timer[args[0]->b_edev] = timestamp; | |
} | |
/* | |
* Count number of retries issued to a disk. These are a good | |
* indicator of potentially failing, or borderline device. | |
* Under normal circumstances we should not expect | |
* this to be a positive value. | |
*/ | |
::sd_set_retry_bp:entry | |
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" && | |
xlate <devinfo_t *>(args[1])->dev_pathname != "" / { | |
this->sn = xlate <devinfo_t *>(args[1])->dev_statname; | |
this->p = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
@ctretry[this->sn, this->p] = count(); | |
} | |
tick-10sec { | |
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */ | |
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ; | |
/* Normalize Data for correct per second reporting of rates, like IOPS */ | |
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */ | |
normalize(@iops, this->elapsed); | |
printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", | |
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, | |
@ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps, | |
@ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime, | |
@timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms, | |
@iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig, | |
@avpending, @maxpending, @cterr, @ctretry); | |
trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes); | |
trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz); | |
trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps); | |
trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime); | |
trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k); | |
trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig); | |
trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry); | |
trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms); | |
trunc(@time10ms); trunc(@timelt1ms); | |
ticks = timestamp ; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -Cs | |
#pragma D option quiet | |
#pragma D option dynvarsize=16M | |
/* | |
* CDDL HEADER START | |
* | |
* The contents of this file are subject to the terms of the | |
* Common Development and Distribution License (the "License"). | |
* You may not use this file except in compliance with the License. | |
* | |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
* or http://www.opensolaris.org/os/licensing. | |
* See the License for the specific language governing permissions | |
* and limitations under the License. | |
* | |
* When distributing Covered Code, include this CDDL HEADER in each | |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
* If applicable, add the following below this CDDL HEADER, with the | |
* fields enclosed by brackets "[]" replaced with your own identifying | |
* information: Portions Copyright [yyyy] [name of copyright owner] | |
* | |
* CDDL HEADER END | |
* | |
* Copyright (c) 2017 Sam Zaydel / RackTop Systems. | |
* | |
* bw-tput-iops-actv-time-hist-with-ts-csv.d | |
* This script is meant to be a more informative replacement for the | |
* iostat utility. It measures some features that iostat in my opinion | |
* is not useful for. For example, iostat does not offer distributions | |
* at all, whereas here we plot distributions of IO rates as observed | |
* for each disk. Also, iostat does not offer operating ranges for | |
* measurements, which here we define as max - min of some measurement. | |
* | |
* Currently, script is limited to scsi_vhci devices, because they are | |
* ones we normally use for actual data storage. Anything else is either | |
* a locally attached device, or something not used by system, i.e. | |
* USB storage media, etc. | |
*/ | |
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */ | |
unsigned long minratei[dev_t], maxratei[dev_t]; /* IO Rate min and max */ | |
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */ | |
unsigned long iocnt[dev_t, int]; | |
int pend[dev_t]; | |
hrtime_t start[dev_t, uint64_t]; | |
hrtime_t ival_timer[dev_t]; | |
hrtime_t ticks; | |
inline const int NSEC_PER_SEC = 1000000000; | |
inline const int NSEC_PER_MSEC = 1000000; | |
inline const int NSEC_PER_USEC = 1000; | |
inline const int SPA_MAXBLOCKSIZE = 128 << 10; | |
inline const int R = 0; | |
inline const int W = 1; | |
/* | |
* These are the Output Parameter definitions for collected metrics | |
* | |
* sdname == Name of device, i.e. sd0 | |
* mpxiowwn == mpxio device, like: 5000cca24500f698 | |
* actvtm == amount of real, busy time spent processing IO | |
* rangerlat, rangewlat == latency range: max - min for Reads and Writes | |
* totbytes == total number of bytes Read and Written during sample interval | |
* tput,maxtput == mean and maximum or burst throughput | |
* ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause | |
* aviosz,rangeiosz == IO size mean and range: max - min | |
* iops,maxiops == normalized IOPS mean and maximum | |
* avKBps,rangeKBps == normalized IO rate and IO rate range: max - min | |
* // Histogram of IO Rate distribution with 4 buckets // | |
* [ ratelt1MBps ] ==> 1 second interval with < 1,000KB/s bw | |
* [ rate10MBps ] ==> 1 second interval with < 10,000KB/s bw | |
* [ rate100MBps ] ==> 1 second interval with < 100,000KB/s bw | |
* [ rate1GBps ] ==> 1 second interval with < 1,000,000KB/s bw | |
* avtime,maxtime == average and maximum IO completion latency | |
* // Histogram of latency distribution with 6 buckets // | |
* [ timegt1000ms ] ==> >1 second (SLOW!) | |
* [ time100ms ] ==> >50ms && 100ms<= (not acceptable) | |
* [ time50ms ] ==> >25ms && 50ms<= (acceptable) | |
* [ time25ms ] ==> >10ms && 25ms<= (ok) | |
* [ time10ms ] ==> >1ms && 10ms<= (good) | |
* [ timelt1ms ] ==> 1ms< (probably cache) | |
* // Histogram of IO size distribution with 7 buckets // | |
* [ iosztiny ] ==> 4K< | |
* [ iosz4k ] ==> between 4K and 8K | |
* [ iosz8k ] ==> between 8K and 16K | |
* [ iosz16k ] ==> between 16K and 32K | |
* [ iosz32k ] ==> between 32K and 64K | |
* [ iosz64k ] ==> between 64K and 128K | |
* [ ioszbig ] ==> anything 128K or above | |
* avpending == Average measured IOs in processing per sample interval | |
* maxpending == Maximum measured IOs in processing per sample interval | |
* cterr == Counter tracking number of errors per sample interval | |
* ctretry == Counter tracking number of retries per sample interval | |
*/ | |
BEGIN { | |
printf("timestamp,sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n"); | |
interval = walltimestamp - (walltimestamp%1000000000); | |
} | |
/* | |
* Set interval timer initial value once for a given device. | |
* Each subsequent update will happen roughly once per second, | |
* or if no IO, whenever there is some IO that triggers io:::done | |
* probe, at which point we determine that timer expired. | |
*/ | |
io:::start | |
/ival_timer[args[0]->b_edev] == 0/ { | |
ival_timer[args[0]->b_edev] = timestamp; | |
} | |
io:::start { | |
start[args[0]->b_edev, args[0]->b_blkno] = timestamp; | |
/* Increment pending IOs by 1 */ | |
pend[args[0]->b_edev]++; | |
} | |
io:::done | |
/start[args[0]->b_edev, args[0]->b_blkno]/ { | |
this->sn = args[1]->dev_statname; | |
this->p = substr(args[1]->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
this->x = args[0]->b_bcount * 976562; | |
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno]; | |
start[args[0]->b_edev, args[0]->b_blkno] = 0; | |
/* Decrement pending IOs by 1, set to 0 if value < 0 */ | |
pend[args[0]->b_edev]--; | |
pend[args[0]->b_edev] = | |
pend[args[0]->b_edev] > 0 ? | |
pend[args[0]->b_edev] : 0; /* avoid underflow */ | |
/* Total Number of bytes per device */ | |
@totbytes[interval, this->sn, this->p] = sum(args[0]->b_bcount); | |
/* Total nanoseconds of active time per device */ | |
@actv_tm[interval, this->sn, this->p] = sum(this->elapsed); | |
/* | |
* Instead of converting nanoseconds to seconds we multiply | |
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds. | |
* In extreme observations, i.e. stalled IO, we may have huge | |
* this->elapsed values, in which case result will be 0 KB/s, even | |
* if there in fact was a non-zero value of bytes transferred. | |
*/ | |
this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC; | |
this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10; | |
/* Measure IO rate range in KB/s */ | |
@rangeKBps[interval, this->sn, this->p] = | |
max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]); | |
/* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */ | |
/* 1000KB/s <= */ | |
@ratelt1MBps[interval, this->sn, this->p] = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0); | |
/* > 1000KB/s && 10,000KB/s <= */ | |
@rate10MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x3e8 && | |
this->kb_per_sec <= 0x2710 ? 1 : 0); | |
/* > 10,000KB/s && 100,000KB/s <= */ | |
@rate100MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x2710 && | |
this->kb_per_sec <= 0x186a0 ? 1 : 0); | |
/* > 100,000KB/s && 1,000,000 KB/s <= */ | |
@rate1GBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x186a0 && | |
this->kb_per_sec <= 0xf4240 ? 1 : 0); | |
/* | |
* Collect minimum and maximum observed rate for later measurement | |
* of range for this metric. | |
*/ | |
minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ? | |
this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ? | |
minratei[args[0]->b_edev] : this->kb_per_sec; | |
maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ? | |
this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ? | |
maxratei[args[0]->b_edev] : this->kb_per_sec; | |
/* Actual Kbytes/sec per device */ | |
@avKBps[interval, this->sn, this->p] = avg(this->kb_per_sec); | |
/* Average and Maximum Latency per device */ | |
@avtime[interval, this->sn, this->p] = avg(this->elapsed); | |
@maxtime[interval, this->sn, this->p] = max(this->elapsed); | |
/* | |
* Latency histogram with buckets: | |
* >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms< | |
*/ | |
@timegt1000ms[interval, this->sn, this->p] = sum( | |
this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0); | |
@time100ms[interval, this->sn, this->p] = sum( | |
this->elapsed > 50 * NSEC_PER_MSEC && | |
this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0); | |
@time50ms[interval, this->sn, this->p] = sum( | |
this->elapsed > 25 * NSEC_PER_MSEC && | |
this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0); | |
@time25ms[interval, this->sn, this->p] = sum( | |
this->elapsed > 10 * NSEC_PER_MSEC && | |
this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0); | |
@time10ms[interval, this->sn, this->p] = sum( | |
this->elapsed > 1 * NSEC_PER_MSEC && | |
this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0); | |
@timelt1ms[interval, this->sn, this->p] = sum( | |
this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0); | |
/* | |
* Collect minimum and maximum io size for later measurement | |
* of range for this metric. | |
*/ | |
miniosz[args[0]->b_edev] = | |
args[0]->b_bcount < miniosz[args[0]->b_edev] ? | |
args[0]->b_bcount : miniosz[args[0]->b_edev]; | |
maxiosz[args[0]->b_edev] = | |
args[0]->b_bcount > maxiosz[args[0]->b_edev] ? | |
args[0]->b_bcount : maxiosz[args[0]->b_edev]; | |
/* Measure IO size range in Kilobytes */ | |
@rangeiosz[interval, this->sn, this->p] = | |
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10); | |
/* | |
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain | |
* avg. effective throughput, and maximum effective throughput. | |
* Maximum throughput is likely measuring cache effects. | |
*/ | |
@tput[interval, this->sn, this->p] = avg(this->x / this->elapsed); | |
@maxtput[interval, this->sn, this->p] = max(this->x / this->elapsed); | |
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0; | |
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0; | |
/* Count number of IOs by IO-type */ | |
@ctrd[interval, this->sn, this->p] = sum(args[0]->b_flags & B_READ ? 1 : 0); | |
@ctwr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_WRITE ? 1 : 0); | |
@iops[interval, this->sn, this->p] = | |
count(); /* Normalized to per second in tick-X probe */ | |
/* Maximum sampled IOPS per device */ | |
@maxiops[interval, this->sn, this->p] = | |
max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]); | |
/* | |
* Collect minimum and maximum latency for later measurement | |
* of range for this metric. | |
*/ | |
minlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed : | |
minlati[args[0]->b_edev, R]; | |
maxlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed : | |
maxlati[args[0]->b_edev, R]; | |
minlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed : | |
minlati[args[0]->b_edev, W]; | |
maxlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed : | |
maxlati[args[0]->b_edev, W]; | |
/* | |
* IOsize distribution not grouped by direction, i.e. no distinction | |
* is made between reads and writes. IO buckets double in size from | |
* previous bucket. i.e. 4, 8, 16, 32... | |
*/ | |
this->bs = args[0]->b_bcount ; | |
/* 4K< */ | |
@iosztiny[interval, this->sn, this->p] = | |
sum(this->bs < 0x1000 ? 1 : 0); | |
/* 4K to 8K< */ | |
@iosz4k[interval, this->sn, this->p] = | |
sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0); | |
/* 8K to 16K< */ | |
@iosz8k[interval, this->sn, this->p] = | |
sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0); | |
/* 16K to 32K< */ | |
@iosz16k[interval, this->sn, this->p] = | |
sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0); | |
/* 32K to 64K< */ | |
@iosz32k[interval, this->sn, this->p] = | |
sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0); | |
/* 64K to 128K< */ | |
@iosz64k[interval, this->sn, this->p] = | |
sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0); | |
/* >128K */ | |
@ioszbig[interval, this->sn, this->p] = | |
sum(this->bs >= 0x20000 ? 1 : 0); | |
/* Average IO size for given device */ | |
@aviosz[interval, this->sn, this->p] = avg(this->bs); | |
/* | |
* Each time we observe an error at completion through B_ERROR flag, | |
* increment count of errors for given device. This should always | |
* be zero, assuming healthy device. | |
*/ | |
@cterr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0); | |
} | |
/* | |
* Entry controlled by timer. By design, each device will be registered | |
* here about once per second when there is even a litte bit of IO. | |
*/ | |
io:::done | |
/ival_timer[args[0]->b_edev] > 0 && | |
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ { | |
this->sn = args[1]->dev_statname; | |
this->p = substr(args[1]->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
/* | |
* Measure operating latency range in uS for Reads and Writes, | |
* storing largest observed difference. | |
*/ | |
@rangerlat[interval, this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, R] - | |
minlati[args[0]->b_edev, R]) / NSEC_PER_USEC); | |
@rangewlat[interval, this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, W] - | |
minlati[args[0]->b_edev, W]) / NSEC_PER_USEC); | |
@avpending[interval, this->sn, this->p] = avg(pend[args[0]->b_edev]); | |
@maxpending[interval, this->sn, this->p] = max(pend[args[0]->b_edev]); | |
/* Reset various counters for next measurement period */ | |
minlati[args[0]->b_edev, R] = 0; | |
maxlati[args[0]->b_edev, R] = 0; | |
minlati[args[0]->b_edev, W] = 0; | |
maxlati[args[0]->b_edev, W] = 0; | |
iocnt[args[0]->b_edev, R] = 0; | |
iocnt[args[0]->b_edev, W] = 0; | |
miniosz[args[0]->b_edev] = 0; | |
maxiosz[args[0]->b_edev] = 0; | |
minratei[args[0]->b_edev] = 0; | |
maxratei[args[0]->b_edev] = 0; | |
ival_timer[args[0]->b_edev] = timestamp; | |
} | |
/* | |
* Count number of retries issued to a disk. These are a good | |
* indicator of potentially failing, or borderline device. | |
* Under normal circumstances we should not expect | |
* this to be a positive value. | |
*/ | |
::sd_set_retry_bp:entry | |
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" && | |
xlate <devinfo_t *>(args[1])->dev_pathname != "" / { | |
this->sn = xlate <devinfo_t *>(args[1])->dev_statname; | |
this->p = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16); | |
this->p = (strstr(this->p, "disk@") == 0 || | |
strstr(this->p, "disk@") == "") ? this->p : "NA"; | |
@ctretry[interval, this->sn, this->p] = count(); | |
} | |
tick-10sec { | |
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */ | |
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ; | |
/* Normalize Data for correct per second reporting of rates, like IOPS */ | |
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */ | |
normalize(@iops, this->elapsed); | |
printa("%ld,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", | |
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, | |
@ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps, | |
@ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime, | |
@timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms, | |
@iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig, | |
@avpending, @maxpending, @cterr, @ctretry); | |
trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes); | |
trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz); | |
trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps); | |
trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime); | |
trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k); | |
trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig); | |
trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry); | |
trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms); | |
trunc(@time10ms); trunc(@timelt1ms); | |
ticks = timestamp ; | |
interval = walltimestamp - (walltimestamp%1000000000); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -Cs | |
#pragma D option quiet | |
#pragma D option dynvarsize=10M | |
/* | |
* CDDL HEADER START | |
* | |
* The contents of this file are subject to the terms of the | |
* Common Development and Distribution License (the "License"). | |
* You may not use this file except in compliance with the License. | |
* | |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
* or http://www.opensolaris.org/os/licensing. | |
* See the License for the specific language governing permissions | |
* and limitations under the License. | |
* | |
* When distributing Covered Code, include this CDDL HEADER in each | |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
* If applicable, add the following below this CDDL HEADER, with the | |
* fields enclosed by brackets "[]" replaced with your own identifying | |
* information: Portions Copyright [yyyy] [name of copyright owner] | |
* | |
* CDDL HEADER END | |
* | |
* Copyright (c) 2017 Sam Zaydel / RackTop Systems. | |
* | |
* bw-tput-iops-actv-time-hist.d | |
* This script is meant to be a more informative replacement for the | |
* iostat utility. It measures some features that iostat in my opinion | |
* is not useful for. For example, iostat does not offer distributions | |
* at all, whereas here we plot distributions of bandwidth as observed | |
* for each disk. Also, iostat does not offer operating ranges for | |
* measurements, which here we define as max - min of some measurement. | |
*/ | |
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */ | |
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */ | |
unsigned long long bufcnt[dev_t], ts[dev_t]; | |
unsigned long long iocnt[dev_t, int]; | |
self unsigned long int tottime[dev_t]; | |
inline const int NSEC_PER_SEC = 1000000000; | |
inline const int SPA_MAXBLOCKSIZE = 128 << 10; | |
inline const int R = 0; | |
inline const int W = 1; | |
io:::start /ts[args[0]->b_edev] == 0/ { | |
ts[args[0]->b_edev] = timestamp; | |
/* | |
* We need to seed minimum value with something other than zero. | |
* Because this value is never signed, zero is by definition smallest | |
* value will result in range potentially spanning 0 => SPA_MAXBLOCKSIZE. | |
* As soon as any IO SPA_MAXBLOCKSIZE< is encountered, this value will | |
* update accordingly. | |
*/ | |
miniosz[args[0]->b_edev] = SPA_MAXBLOCKSIZE; | |
} | |
io:::start { | |
start[args[0]->b_edev, args[0]->b_blkno] = timestamp; | |
} | |
io:::done /start[args[0]->b_edev, args[0]->b_blkno]/ { | |
this->sn = args[1]->dev_statname; | |
this->p = args[1]->dev_pathname; | |
this->x = args[0]->b_bcount * 976562; | |
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno]; | |
bufcnt[args[0]->b_edev] += args[0]->b_bcount ; /* sum of bytes */ | |
self->tottime[args[0]->b_edev] += this->elapsed; | |
start[args[0]->b_edev, args[0]->b_blkno] = 0; | |
/* | |
* Collect minimum and maximum io size for later measurement | |
* of range for this metric. | |
*/ | |
miniosz[args[0]->b_edev] = | |
args[0]->b_bcount < miniosz[args[0]->b_edev] ? | |
args[0]->b_bcount : miniosz[args[0]->b_edev]; | |
maxiosz[args[0]->b_edev] = | |
args[0]->b_bcount > maxiosz[args[0]->b_edev] ? | |
args[0]->b_bcount : maxiosz[args[0]->b_edev]; | |
/* | |
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain | |
* avg. effective throughput, and maximum effective throughput. | |
* Maximum throughput is likely measuring cache effects. | |
*/ | |
@tput[this->sn, this->p] = avg(this->x / this->elapsed); | |
@maxtput[this->sn, this->p] = max(this->x / this->elapsed); | |
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0; | |
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_READ ? 0 : 1; | |
/* | |
* Collect minimum and maximum latency for later measurement | |
* of range for this metric. | |
*/ | |
minlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed : | |
minlati[args[0]->b_edev, R]; | |
maxlati[args[0]->b_edev, R] = | |
args[0]->b_flags & B_READ && | |
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed : | |
maxlati[args[0]->b_edev, R]; | |
minlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed : | |
minlati[args[0]->b_edev, W]; | |
maxlati[args[0]->b_edev, W] = | |
(args[0]->b_flags & B_READ) == 0 && | |
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed : | |
maxlati[args[0]->b_edev, W]; | |
} | |
io:::done | |
/timestamp >= ts[args[0]->b_edev] + NSEC_PER_SEC/ { | |
this->sn = args[1]->dev_statname; | |
this->p = args[1]->dev_pathname; | |
this->delta = timestamp - ts[args[0]->b_edev]; | |
/* | |
* Instead of converting nanoseconds to seconds we multiply | |
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds. | |
*/ | |
this->b_nsec = (bufcnt[args[0]->b_edev]) * NSEC_PER_SEC; | |
this->kb_per_sec = (this->b_nsec / this->delta) >> 10; | |
this->ioct = iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]; | |
this->iops = (this->ioct * NSEC_PER_SEC) / this->delta; | |
/* Actual Kbytes/sec per device */ | |
@bw[this->sn, this->p] = quantize(this->kb_per_sec); | |
@avbw[this->sn, this->p] = avg(this->kb_per_sec); | |
@maxbw[this->sn, this->p] = max(this->kb_per_sec); | |
/* Total Number of bytes per device */ | |
@totbytes[this->sn, this->p] = sum(bufcnt[args[0]->b_edev]); | |
/* Total nanoseconds of active time per device */ | |
@actv_tm[this->sn, this->p] = sum(self->tottime[args[0]->b_edev]); | |
/* Average IOPS per device */ | |
@aviops[this->sn, this->p] = avg(this->iops); | |
@maxiops[this->sn, this->p] = max(this->iops); | |
/* Average IO size per device */ | |
@aviosz[this->sn, this->p] = avg(args[0]->b_bcount); | |
/* Count number of IOs by IO-type */ | |
@ctrd[this->sn, this->p] = sum(iocnt[args[0]->b_edev, R]); | |
@ctwr[this->sn, this->p] = sum(iocnt[args[0]->b_edev, W]); | |
/* Measure operating latency range in uS for Reads and Writes */ | |
@rangerlat[this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, R] - | |
minlati[args[0]->b_edev, R]) / 1000); /* largest difference stored */ | |
@rangewlat[this->sn, this->p] = | |
max((maxlati[args[0]->b_edev, W] - | |
minlati[args[0]->b_edev, W]) / 1000); /* largest difference stored */ | |
@rangeiosz[this->sn, this->p] = | |
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10); | |
/* Reset various counters for next measurement period */ | |
minlati[args[0]->b_edev, R] = 0; | |
maxlati[args[0]->b_edev, R] = 0; | |
minlati[args[0]->b_edev, W] = 0; | |
maxlati[args[0]->b_edev, W] = 0; | |
iocnt[args[0]->b_edev, R] = 0; | |
iocnt[args[0]->b_edev, W] = 0; | |
bufcnt[args[0]->b_edev] = 0; | |
self->tottime[args[0]->b_edev] = 0; | |
ts[args[0]->b_edev] = timestamp; | |
} | |
END { | |
normalize(@actv_tm, 1000000); /* from nanoseconds to milliseconds */ | |
normalize(@totbytes, 1024); /* from bytes to kilobytes */ | |
printa("\t %s (%s)\n\t Active Time: %@d(ms) | Lat. Range[max-min] Read: %@d(uS) / Write: %@d(uS)\n\t Volume: %@d(KB) | Tput Avg: %@d(KB/s) / Max: %@d(KB/s)\n\t IOs Read: %@d / Write: %@d | IO Size Avg: %@d(b) / Range[max-min]: %@d(KB)\n\t IOPS Avg: %@d | Max: %@d | Bandwidth Avg: %@d(KB/s) | Max: %@d(KB/s)\n\t Bandwidth Histogram(KB/s)%@d\n", | |
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput, | |
@ctrd, @ctwr, @aviosz, @rangeiosz, @aviops, @maxiops, @avbw, @maxbw, @bw); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Identify devices to which SYNC CACHE commands are being sent | |
dtrace -qn '::sd_send_scsi_SYNCHRONIZE_CACHE:entry {print(stringof(args[0]->un_sd->sd_inq->inq_vid))}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma D option quiet | |
BEGIN | |
{ | |
start = timestamp; | |
} | |
io:::start | |
{ | |
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp; | |
} | |
io:::done | |
/ts[args[0]->b_edev, args[0]->b_lblkno]/ | |
{ | |
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000; | |
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? | |
"read " : "write "; | |
@q[this->name] = quantize(this->delta); | |
@a[this->name] = avg(this->delta); | |
@v[this->name] = stddev(this->delta); | |
@i[this->name] = count(); | |
@b[this->name] = sum(args[0]->b_bcount); | |
ts[args[0]->b_edev, args[0]->b_lblkno] = 0; | |
} | |
END | |
{ | |
printa(@q); | |
normalize(@i, (timestamp - start) / 1000000000); | |
normalize(@b, (timestamp - start) / 1000000000 * 1024); | |
printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev", | |
"iops", "throughput"); | |
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b); | |
} | |
/* | |
# dtrace -s rw.d -c 'sleep 60' | |
read | |
value ------------- Distribution ------------- count | |
32 | 0 | |
64 | 23 | |
128 |@ 655 | |
256 |@@@@ 1638 | |
512 |@@ 743 | |
1024 |@ 380 | |
2048 |@@@ 1341 | |
4096 |@@@@@@@@@@@@ 5295 | |
8192 |@@@@@@@@@@@ 5033 | |
16384 |@@@ 1297 | |
32768 |@@ 684 | |
65536 |@ 400 | |
131072 | 225 | |
262144 | 206 | |
524288 | 127 | |
1048576 | 19 | |
2097152 | 0 | |
write | |
value ------------- Distribution ------------- count | |
32 | 0 | |
64 | 47 | |
128 | 469 | |
256 | 591 | |
512 | 327 | |
1024 | 924 | |
2048 |@ 6734 | |
4096 |@@@@@@@ 43416 | |
8192 |@@@@@@@@@@@@@@@@@ 102013 | |
16384 |@@@@@@@@@@ 60992 | |
32768 |@@@ 20312 | |
65536 |@ 6789 | |
131072 | 860 | |
262144 | 208 | |
524288 | 153 | |
1048576 | 36 | |
2097152 | 0 | |
avg latency stddev iops throughput | |
write 19442us 32468us 4064/s 261889k/s | |
read 23733us 88206us 301/s 13113k/s | |
*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get a quick sense of the size(s) of IO, quantized, grouped by (device, Read/Write Ops). | |
dtrace -n 'io:::start {@[args[1]->dev_pathname, args[0]->b_flags & B_READ ? "READ" : "WRITE"] = quantize(args[0]->b_bcount);}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
int activ[string]; | |
BEGIN { | |
start = timestamp; | |
} | |
io:::start { | |
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp; | |
} | |
io:::done | |
/ts[args[0]->b_edev, args[0]->b_lblkno]/ { | |
this->delta = | |
(timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000; | |
this->name = | |
(args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? | |
"read" : "write"; | |
/* Convert bytes to KB, to measure microseconds per KB */ | |
this->kb = (args[0]->b_bcount>>10) ? (args[0]->b_bcount>>10) > 0 : 1; | |
@q[this->name] = quantize(this->delta); | |
/* Above graph should support this mean value. Largest count on histogram | |
* is expected to hover in the three buckets closest to the mean value. | |
*/ | |
@a[this->name] = avg(this->delta); | |
@v[this->name] = stddev(this->delta); | |
@i[this->name] = count(); | |
@b[this->name] = sum(args[0]->b_bcount); | |
/* Time spent per KB of IO. This is an unusual metric, but may prove | |
* useful at some point. | |
*/ | |
@kbt[this->name] = quantize(this->delta / this->kb); | |
ts[args[0]->b_edev, args[0]->b_lblkno] = 0; | |
} | |
END { | |
this->elapsed = (timestamp - start) / 1000000000; | |
printa("\tOP: %s\n\ttime spent/KB (us): %@d\n", @kbt); | |
printa("\tOP: %s\n\tlatency (us): %@d\n", @q); | |
/* When there is very little IO of a given type, normalization can create | |
* an unacceptable degree of error. We are normalizing over some period of | |
* time, and if we have lots of IO for the given command over a short | |
* interval but the script runs for a much longer period of time, | |
* we will normalize over that time period, which can create incorrect | |
* and perhaps confusing results. | |
*/ | |
normalize(@i, this->elapsed); | |
normalize(@b, this->elapsed * 1024); | |
printf("%-30s %11s %11s %11s %11s\n", "", "avg.latency", "stddev", | |
"IOPS", "throughput"); | |
printa("%-30s %@9uus %@9uus %@9u/s %@8uKB/s\n", @a, @v, @i, @b); | |
}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Is IO aligned on 4K boundary? This is critical for SSDs, and could be impacting | |
// performance to a lesser degree of mechanical devices. | |
dtrace -qn ' | |
io:::start { | |
@[args[0]->b_flags & B_WRITE ? "W" : "R"] = quantize(args[0]->b_blkno % 8); | |
} | |
END { | |
printa("OP: (%s) %@d\n", @) | |
}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn 'BEGIN { | |
x[123] = 0; | |
} | |
io:::start { | |
x[123] = 1; | |
start[arg0] = timestamp; | |
} | |
io:::done /this->start = start[arg0]/ { | |
@delta[args[0]->b_flags & B_WRITE ? "WRITE" : "READ", args[1]->dev_pathname] = quantize((timestamp - this->start) / 1000); self->ts = 0; | |
start[arg0] = 0; | |
} | |
tick-30sec /x[123]/ { | |
printa("%12s (us) Device: %s %@d\n", @delta); | |
trunc(@delta); | |
x[123] = 0; | |
}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Measure throughput per device and get some useful statistics | |
// in a form of a summary that looks like this: | |
// +=======================================+ | |
// |=============== Summary ===============| | |
// +=======================================+ | |
// Start Time: 2017 Jul 13 18:35:46 | |
// Human Time: 3.972 seconds | |
// Active (All drives): 5.460 seconds | |
// Spent 0% of elapsed time Reading | |
// Spent 100% of elapsed time Writing | |
// Average Write throughput: 76993KB/sec | |
// Burst Write throughput: 468351KB/sec | |
// ========================================= | |
// | |
dtrace -qn ' | |
inline const int NSEC_IN_SEC = 1000000000; | |
BEGIN { | |
starttime = walltimestamp; | |
runtime = timestamp; | |
busytime = 0; | |
readtime = 0; | |
writetime = 0; | |
} | |
io:::start { | |
start[args[0]->b_edev, args[0]->b_blkno] = timestamp; | |
} | |
io:::done | |
/start[args[0]->b_edev, args[0]->b_blkno]/ { | |
this->devp = strtok(args[1]->dev_pathname, ":"); | |
this->elapsed = timestamp - | |
start[args[0]->b_edev, args[0]->b_blkno]; | |
/* | |
* Odd looking calculation below is to go from bytes and nanoseconds | |
* to kilobytes(KB) and seconds. bytes/ns => KB/sec. | |
*/ | |
@cumtp[args[0]->b_flags & B_READ ? "Read" : "Write"] = | |
avg((args[0]->b_bcount * 976562) / this->elapsed); | |
@cumtp_max[args[0]->b_flags & B_READ ? "Read" : "Write"] = | |
max((args[0]->b_bcount * 976562) / this->elapsed); | |
@tp[args[1]->dev_statname, this->devp] = | |
quantize((args[0]->b_bcount * 976562) / this->elapsed); | |
readtime += args[0]->b_flags & B_READ ? this->elapsed : 0; | |
writetime += args[0]->b_flags & B_WRITE ? this->elapsed : 0; | |
start[args[0]->b_edev, args[0]->b_blkno] = 0; | |
} | |
END { | |
runtime = timestamp - runtime; | |
/* | |
* Small amount of rounding-off error is possible here, | |
* so the two percentages may not add-up to 100% exactly. | |
*/ | |
pctrd = readtime * 100 / (readtime + writetime); | |
pctwr = writetime * 100 / (readtime + writetime); | |
printf("+=======================================+\n"); | |
printf("|=============== Summary ===============|\n"); | |
printf("+=======================================+\n"); | |
printf("Start Time: %Y\n", starttime); | |
printf("Human Time: %3d.%03d seconds\n", | |
runtime / NSEC_IN_SEC, | |
(runtime / 1000) % 1000); | |
printf("Active (All drives): %3d.%03d seconds\n", | |
(readtime+writetime) / NSEC_IN_SEC, | |
((readtime+writetime) / 1000) % 1000); | |
printf("Spent %3d%% of elapsed time Reading\n", pctrd); | |
printf("Spent %3d%% of elapsed time Writing\n", pctwr); | |
printa("Average %-5s throughput: %@dKB/sec\n", @cumtp); | |
printa("Burst %-5s throughput: %@dKB/sec\n", @cumtp_max); | |
printf("=========================================\n\n"); | |
printa(" %s (%s)\n\t KB/s%@d\n", @tp); | |
}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -s | |
#pragma D option quiet | |
BEGIN | |
{ | |
start = timestamp; | |
} | |
io:::start | |
{ | |
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp; | |
} | |
io:::done | |
/ts[args[0]->b_edev, args[0]->b_lblkno]/ | |
{ | |
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000; | |
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? | |
"read " : "write "; | |
@q[this->name] = quantize(this->delta); | |
@a[this->name] = avg(this->delta); | |
@v[this->name] = stddev(this->delta); | |
@i[this->name] = count(); | |
@b[this->name] = sum(args[0]->b_bcount); | |
ts[args[0]->b_edev, args[0]->b_lblkno] = 0; | |
} | |
END | |
{ | |
printa(@q); | |
normalize(@i, (timestamp - start) / 1000000000); | |
normalize(@b, (timestamp - start) / 1000000000 * 1024); | |
printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev", | |
"iops", "throughput"); | |
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -s | |
/* | |
* sdretry.d | |
* | |
* Example script from Chapter 4 of the book: DTrace: Dynamic Tracing in | |
* Oracle Solaris, Mac OS X, and FreeBSD", by Brendan Gregg and Jim Mauro, | |
* Prentice Hall, 2011. ISBN-10: 0132091518. http://dtracebook.com. | |
* | |
* See the book for the script description and warnings. Many of these are | |
* provided as example solutions, and will need changes to work on your OS. | |
*/ | |
#pragma D option quiet | |
dtrace:::BEGIN | |
{ | |
printf("Tracing... output every 10 seconds.\n"); | |
} | |
fbt::sd_set_retry_bp:entry | |
{ | |
@[xlate <devinfo_t *>(args[1])->dev_statname, | |
xlate <devinfo_t *>(args[1])->dev_major, | |
xlate <devinfo_t *>(args[1])->dev_minor] = count(); | |
} | |
tick-10sec | |
{ | |
printf("\n%Y:\n", walltimestamp); | |
printf("%28s %-3s,%-4s %s\n", "DEVICE", "MAJ", "MIN", "RETRIES"); | |
printa("%28s %-03d,%-4d %@d\n", @); | |
trunc(@); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment