Created September 13, 2015 22:14
| diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c | |
| index 0fb32ce..6bbde84 100644 | |
| --- a/cmd/zdb/zdb.c | |
| +++ b/cmd/zdb/zdb.c | |
| @@ -2403,6 +2403,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| dmu_object_type_t type; | |
| boolean_t is_metadata; | |
| + if (bp == NULL) | |
| + return (0); | |
| + | |
| if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { | |
| char blkbuf[BP_SPRINTF_LEN]; | |
| snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); | |
| @@ -2892,7 +2895,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| avl_index_t where; | |
| zdb_ddt_entry_t *zdde, zdde_search; | |
| - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| return (0); | |
| if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { | |
| diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c | |
| index ff04e3a..1ed01b4 100644 | |
| --- a/cmd/zfs/zfs_main.c | |
| +++ b/cmd/zfs/zfs_main.c | |
| @@ -247,8 +247,9 @@ get_usage(zfs_help_t idx) | |
| return (gettext("\tpromote <clone-filesystem>\n")); | |
| case HELP_RECEIVE: | |
| return (gettext("\treceive [-vnFu] <filesystem|volume|" | |
| - "snapshot>\n" | |
| - "\treceive [-vnFu] [-d | -e] <filesystem>\n")); | |
| + "snapshot>\n" | |
| + "\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] " | |
| + "<filesystem>\n")); | |
| case HELP_RENAME: | |
| return (gettext("\trename [-f] <filesystem|volume|snapshot> " | |
| "<filesystem|volume|snapshot>\n" | |
| @@ -751,7 +752,7 @@ zfs_do_create(int argc, char **argv) | |
| nomem(); | |
| break; | |
| case 'o': | |
| - if (parseprop(props, optarg)) | |
| + if (parseprop(props, optarg) != 0) | |
| goto error; | |
| break; | |
| case 's': | |
| @@ -3590,7 +3591,7 @@ zfs_do_snapshot(int argc, char **argv) | |
| while ((c = getopt(argc, argv, "ro:")) != -1) { | |
| switch (c) { | |
| case 'o': | |
| - if (parseprop(props, optarg)) | |
| + if (parseprop(props, optarg) != 0) | |
| return (1); | |
| break; | |
| case 'r': | |
| @@ -3849,10 +3850,19 @@ zfs_do_receive(int argc, char **argv) | |
| { | |
| int c, err; | |
| recvflags_t flags = { 0 }; | |
| + nvlist_t *props; | |
| + nvpair_t *nvp = NULL; | |
| + | |
| + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) | |
| + nomem(); | |
| /* check options */ | |
| - while ((c = getopt(argc, argv, ":denuvF")) != -1) { | |
| + while ((c = getopt(argc, argv, ":o:denuvF")) != -1) { | |
| switch (c) { | |
| + case 'o': | |
| + if (parseprop(props, optarg) != 0) | |
| + return (1); | |
| + break; | |
| case 'd': | |
| flags.isprefix = B_TRUE; | |
| break; | |
| @@ -3897,6 +3907,13 @@ zfs_do_receive(int argc, char **argv) | |
| usage(B_FALSE); | |
| } | |
| + while ((nvp = nvlist_next_nvpair(props, nvp))) { | |
| + if (strcmp(nvpair_name(nvp), "origin") != 0) { | |
| + (void) fprintf(stderr, gettext("invalid option")); | |
| + usage(B_FALSE); | |
| + } | |
| + } | |
| + | |
| if (isatty(STDIN_FILENO)) { | |
| (void) fprintf(stderr, | |
| gettext("Error: Backup stream can not be read " | |
| @@ -3905,7 +3922,7 @@ zfs_do_receive(int argc, char **argv) | |
| return (1); | |
| } | |
| - err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); | |
| + err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); | |
| return (err != 0); | |
| } | |
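
For illustration (not part of the patch): a minimal sketch of how a libzfs consumer might build the props nvlist and call the updated zfs_receive() entry point, mirroring the CLI handling above. The dataset names and error handling are assumptions made for the sketch.

#include <libzfs.h>
#include <libnvpair.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>

/* Receive a stream from stdin as a clone of a hypothetical origin snapshot. */
static int
receive_as_clone(libzfs_handle_t *hdl, const char *target)
{
	recvflags_t flags = { 0 };
	nvlist_t *props;
	nvpair_t *nvp = NULL;
	int err;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
		return (ENOMEM);
	/* Equivalent of "-o origin=<snapshot>" on the command line. */
	if (nvlist_add_string(props, "origin", "tank/base@snap1") != 0) {
		nvlist_free(props);
		return (ENOMEM);
	}

	/* Same check as zfs_do_receive(): only "origin" is accepted. */
	while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) {
		if (strcmp(nvpair_name(nvp), "origin") != 0) {
			(void) fprintf(stderr, "invalid option: %s\n",
			    nvpair_name(nvp));
			nvlist_free(props);
			return (EINVAL);
		}
	}

	err = zfs_receive(hdl, target, props, &flags, STDIN_FILENO, NULL);
	nvlist_free(props);
	return (err);
}
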
| diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c | |
| index 9dcc38b..21b9704 100644 | |
| --- a/cmd/ztest/ztest.c | |
| +++ b/cmd/ztest/ztest.c | |
| @@ -3584,7 +3584,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) | |
| */ | |
| n = ztest_random(regions) * stride + ztest_random(width); | |
| s = 1 + ztest_random(2 * width - 1); | |
| - dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); | |
| + dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| /* | |
| * Pick a random index and compute the offsets into packobj and bigobj. | |
| @@ -5703,8 +5704,10 @@ ztest_run(ztest_shared_t *zs) | |
| * Right before closing the pool, kick off a bunch of async I/O; | |
| * spa_close() should wait for it to complete. | |
| */ | |
| - for (uint64_t object = 1; object < 50; object++) | |
| - dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); | |
| + for (uint64_t object = 1; object < 50; object++) { | |
| + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| + } | |
| spa_close(spa, FTAG); | |
| diff --git a/include/libzfs.h b/include/libzfs.h | |
| index eb550f6..6cc0ef8 100644 | |
| --- a/include/libzfs.h | |
| +++ b/include/libzfs.h | |
| @@ -651,8 +651,8 @@ typedef struct recvflags { | |
| boolean_t nomount; | |
| } recvflags_t; | |
| -extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, | |
| - int, avl_tree_t *); | |
| +extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, | |
| + recvflags_t *, int, avl_tree_t *); | |
| typedef enum diff_flags { | |
| ZFS_DIFF_PARSEABLE = 0x1, | |
| diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c | |
| index 0418091..7488e4e 100644 | |
| --- a/lib/libzfs/libzfs_pool.c | |
| +++ b/lib/libzfs/libzfs_pool.c | |
| @@ -3398,7 +3398,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, | |
| } | |
| static int | |
| -zbookmark_compare(const void *a, const void *b) | |
| +zbookmark_mem_compare(const void *a, const void *b) | |
| { | |
| return (memcmp(a, b, sizeof (zbookmark_phys_t))); | |
| } | |
| @@ -3461,7 +3461,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) | |
| zc.zc_nvlist_dst_size; | |
| count -= zc.zc_nvlist_dst_size; | |
| - qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare); | |
| + qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); | |
| verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); | |
| diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c | |
| index d84e303..b155fc8 100644 | |
| --- a/lib/libzfs/libzfs_sendrecv.c | |
| +++ b/lib/libzfs/libzfs_sendrecv.c | |
| @@ -55,8 +55,9 @@ | |
| /* in libzfs_dataset.c */ | |
| extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); | |
| -static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, | |
| - int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); | |
| +static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, | |
| + recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, | |
| + uint64_t *); | |
| static const zio_cksum_t zero_cksum = { 0 }; | |
| @@ -2465,7 +2466,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, | |
| * zfs_receive_one() will take care of it (ie, | |
| * recv_skip() and return 0). | |
| */ | |
| - error = zfs_receive_impl(hdl, destname, flags, fd, | |
| + error = zfs_receive_impl(hdl, destname, NULL, flags, fd, | |
| sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, | |
| action_handlep); | |
| if (error == ENODATA) { | |
| @@ -2598,9 +2599,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) | |
| */ | |
| static int | |
| zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | |
| - recvflags_t *flags, dmu_replay_record_t *drr, | |
| - dmu_replay_record_t *drr_noswap, const char *sendfs, | |
| - nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, | |
| + const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, | |
| + dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, | |
| + avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, | |
| uint64_t *action_handlep) | |
| { | |
| zfs_cmd_t zc = { 0 }; | |
| @@ -2756,10 +2757,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | |
| } | |
| if (flags->verbose) | |
| (void) printf("found clone origin %s\n", zc.zc_string); | |
| + } else if (originsnap) { | |
| + (void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN); | |
| + if (flags->verbose) | |
| + (void) printf("using provided clone origin %s\n", | |
| + zc.zc_string); | |
| } | |
| stream_wantsnewfs = (drrb->drr_fromguid == NULL || | |
| - (drrb->drr_flags & DRR_FLAG_CLONE)); | |
| + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap); | |
| if (stream_wantsnewfs) { | |
| /* | |
| @@ -3137,9 +3143,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | |
| } | |
| static int | |
| -zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |
| - int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, | |
| - char **top_zfs, int cleanup_fd, uint64_t *action_handlep) | |
| +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, | |
| + const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, | |
| + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, | |
| + uint64_t *action_handlep) | |
| { | |
| int err; | |
| dmu_replay_record_t drr, drr_noswap; | |
| @@ -3158,6 +3165,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |
| "(%s) does not exist"), tosnap); | |
| return (zfs_error(hdl, EZFS_NOENT, errbuf)); | |
| } | |
| + if (originsnap && | |
| + !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { | |
| + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " | |
| + "(%s) does not exist"), originsnap); | |
| + return (zfs_error(hdl, EZFS_NOENT, errbuf)); | |
| + } | |
| /* read in the BEGIN record */ | |
| if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, | |
| @@ -3230,14 +3243,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |
| *cp = '\0'; | |
| sendfs = nonpackage_sendfs; | |
| } | |
| - return (zfs_receive_one(hdl, infd, tosnap, flags, | |
| - &drr, &drr_noswap, sendfs, stream_nv, stream_avl, | |
| - top_zfs, cleanup_fd, action_handlep)); | |
| + return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, | |
| + &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, | |
| + cleanup_fd, action_handlep)); | |
| } else { | |
| assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == | |
| DMU_COMPOUNDSTREAM); | |
| - return (zfs_receive_package(hdl, infd, tosnap, flags, | |
| - &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); | |
| + return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, | |
| + &zcksum, top_zfs, cleanup_fd, action_handlep)); | |
| } | |
| } | |
| @@ -3248,18 +3261,24 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |
| * (-1 will override -2). | |
| */ | |
| int | |
| -zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |
| - int infd, avl_tree_t *stream_avl) | |
| +zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, | |
| + recvflags_t *flags, int infd, avl_tree_t *stream_avl) | |
| { | |
| char *top_zfs = NULL; | |
| int err; | |
| int cleanup_fd; | |
| uint64_t action_handle = 0; | |
| + char *originsnap = NULL; | |
| + if (props) { | |
| + err = nvlist_lookup_string(props, "origin", &originsnap); | |
| + if (err && err != ENOENT) | |
| + return (err); | |
| + } | |
| cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); | |
| VERIFY(cleanup_fd >= 0); | |
| - err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, | |
| + err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, | |
| stream_avl, &top_zfs, cleanup_fd, &action_handle); | |
| VERIFY(0 == close(cleanup_fd)); | |
| diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h | |
| index b1154fb..6216006 100644 | |
| --- a/include/sys/zfs_context.h | |
| +++ b/include/sys/zfs_context.h | |
| @@ -116,8 +116,18 @@ extern int aok; | |
| /* | |
| * DTrace SDT probes have different signatures in userland than they do in | |
| - * kernel. If they're being used in kernel code, re-define them out of | |
| + * the kernel. If they're being used in kernel code, re-define them out of | |
| * existence for their counterparts in libzpool. | |
| + * | |
| + * Here's an example of how to use the set-error probes in userland: | |
| + * zfs$target:::set-error /arg0 == EBUSY/ {stack();} | |
| + * | |
| + * Here's an example of how to use DTRACE_PROBE probes in userland: | |
| + * If there is a probe declared as follows: | |
| + * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); | |
| + * Then you can use it as follows: | |
| + * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ | |
| + * {printf("%u %p\n", arg1, arg2);} | |
| */ | |
| #ifdef DTRACE_PROBE | |
| diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 | |
| index c8bcf8c..1e19be8 100644 | |
| --- a/man/man8/zfs.8 | |
| +++ b/man/man8/zfs.8 | |
| @@ -175,11 +175,13 @@ | |
| .Nm | |
| .Cm receive | |
| .Op Fl Fnuv | |
| +.Op Fl o Sy origin Ns = Ns Ar snapshot | |
| .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | |
| .Nm | |
| .Cm receive | |
| .Op Fl Fnuv | |
| .Op Fl d Ns | Ns Fl e | |
| +.Op Fl o Sy origin Ns = Ns Ar snapshot | |
| .Ar filesystem | |
| .Nm | |
| .Cm allow | |
| @@ -2635,12 +2637,14 @@ origin, etc. | |
| .Nm | |
| .Cm receive | |
| .Op Fl Fnuv | |
| +.Op Fl o Sy origin Ns = Ns Ar snapshot | |
| .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | |
| .br | |
| .Nm | |
| .Cm receive | |
| .Op Fl Fnuv | |
| .Op Fl d Ns | Ns Fl e | |
| +.Op Fl o Sy origin Ns = Ns Ar snapshot | |
| .Ar filesystem | |
| .Xc | |
| Creates a snapshot whose contents are as specified in the stream provided on | |
| @@ -2730,6 +2734,10 @@ snapshot as described in the paragraph above. | |
| Do not actually receive the stream. This can be useful in conjunction with the | |
| .Fl v | |
| option to verify the name the receive operation would use. | |
| +.It Fl o Sy origin Ns = Ns Ar snapshot | |
| +Forces the stream to be received as a clone of the given snapshot. | |
| +This is only valid if the stream is an incremental stream whose source | |
| +is the same as the provided origin. | |
| .It Fl u | |
| File system that is associated with the received stream is not mounted. | |
| .It Fl v | |
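
For context (not part of the patch), a hypothetical invocation of the new flag: if tank/fs@a already exists on the receiving side, an incremental stream from @a to @b can be received into a new filesystem that becomes a clone of it:

    zfs send -i tank/fs@a tank/fs@b | zfs receive -o origin=tank/fs@a tank/newfs

The dataset names are made up; the constraint described above still applies, i.e. the stream's source snapshot must match the supplied origin.
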
| diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files | |
| index 40caa53..789d562 100644 | |
| --- a/usr/src/uts/common/Makefile.files | |
| +++ b/usr/src/uts/common/Makefile.files | |
| @@ -21,7 +21,9 @@ | |
| # | |
| # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. | |
| -# Copyright (c) 2013 by Delphix. All rights reserved. | |
| +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. | |
| +# Copyright (c) 2012 Joyent, Inc. All rights reserved. | |
| +# Copyright (c) 2011, 2014 by Delphix. All rights reserved. | |
| # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. | |
| # Copyright 2014 Nexenta Systems, Inc. All rights reserved. | |
| # | |
| @@ -1328,6 +1330,7 @@ ZFS_COMMON_OBJS += \ | |
| bplist.o \ | |
| bpobj.o \ | |
| bptree.o \ | |
| + bqueue.o \ | |
| dbuf.o \ | |
| ddt.o \ | |
| ddt_zap.o \ | |
| diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c | |
| index 5f7d76f..b2b9887 100644 | |
| --- a/module/zfs/bptree.c | |
| +++ b/module/zfs/bptree.c | |
| @@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| int err; | |
| struct bptree_args *ba = arg; | |
| - if (BP_IS_HOLE(bp)) | |
| + if (bp == NULL || BP_IS_HOLE(bp)) | |
| return (0); | |
| err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); | |
| diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c | |
| new file mode 100644 | |
| index 0000000..1ddc697 | |
| --- /dev/null | |
| +++ b/module/zfs/bqueue.c | |
| @@ -0,0 +1,111 @@ | |
| +/* | |
| + * CDDL HEADER START | |
| + * | |
| + * This file and its contents are supplied under the terms of the | |
| + * Common Development and Distribution License ("CDDL"), version 1.0. | |
| + * You may only use this file in accordance with the terms of version | |
| + * 1.0 of the CDDL. | |
| + * | |
| + * A full copy of the text of the CDDL should have accompanied this | |
| + * source. A copy of the CDDL is also available via the Internet at | |
| + * http://www.illumos.org/license/CDDL. | |
| + * | |
| + * CDDL HEADER END | |
| + */ | |
| +/* | |
| + * Copyright (c) 2014 by Delphix. All rights reserved. | |
| + */ | |
| + | |
| +#include <sys/bqueue.h> | |
| +#include <sys/zfs_context.h> | |
| + | |
| +static inline bqueue_node_t * | |
| +obj2node(bqueue_t *q, void *data) | |
| +{ | |
| + return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); | |
| +} | |
| + | |
| +/* | |
| + * Initialize a blocking queue. The maximum capacity of the queue is set to | |
| + * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, | |
| + * and offset should give its offset from the start of the struct. Return 0 on | |
| + * success, or -1 on failure. | |
| + */ | |
| +int | |
| +bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) | |
| +{ | |
| + list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), | |
| + node_offset + offsetof(bqueue_node_t, bqn_node)); | |
| + cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); | |
| + cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); | |
| + mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); | |
| + q->bq_node_offset = node_offset; | |
| + q->bq_size = 0; | |
| + q->bq_maxsize = size; | |
| + return (0); | |
| +} | |
| + | |
| +/* | |
| + * Destroy a blocking queue. This function asserts that there are no | |
| + * elements in the queue, and no one is blocked on the condition | |
| + * variables. | |
| + */ | |
| +void | |
| +bqueue_destroy(bqueue_t *q) | |
| +{ | |
| + ASSERT0(q->bq_size); | |
| + cv_destroy(&q->bq_add_cv); | |
| + cv_destroy(&q->bq_pop_cv); | |
| + mutex_destroy(&q->bq_lock); | |
| + list_destroy(&q->bq_list); | |
| +} | |
| + | |
| +/* | |
| + * Add data to q, consuming size units of capacity. If there is insufficient | |
| + * capacity to consume size units, block until capacity exists. Asserts size is | |
| + * > 0. | |
| + */ | |
| +void | |
| +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) | |
| +{ | |
| + ASSERT3U(item_size, >, 0); | |
| + ASSERT3U(item_size, <, q->bq_maxsize); | |
| + mutex_enter(&q->bq_lock); | |
| + obj2node(q, data)->bqn_size = item_size; | |
| + while (q->bq_size + item_size > q->bq_maxsize) { | |
| + cv_wait(&q->bq_add_cv, &q->bq_lock); | |
| + } | |
| + q->bq_size += item_size; | |
| + list_insert_tail(&q->bq_list, data); | |
| + cv_signal(&q->bq_pop_cv); | |
| + mutex_exit(&q->bq_lock); | |
| +} | |
| +/* | |
| + * Take the first element off of q. If there are no elements on the queue, wait | |
| + * until one is put there. Return the removed element. | |
| + */ | |
| +void * | |
| +bqueue_dequeue(bqueue_t *q) | |
| +{ | |
| + void *ret; | |
| + uint64_t item_size; | |
| + mutex_enter(&q->bq_lock); | |
| + while (q->bq_size == 0) { | |
| + cv_wait(&q->bq_pop_cv, &q->bq_lock); | |
| + } | |
| + ret = list_remove_head(&q->bq_list); | |
| + item_size = obj2node(q, ret)->bqn_size; | |
| + q->bq_size -= item_size; | |
| + mutex_exit(&q->bq_lock); | |
| + cv_signal(&q->bq_add_cv); | |
| + return (ret); | |
| +} | |
| + | |
| +/* | |
| + * Returns true if the space used is 0. | |
| + */ | |
| +boolean_t | |
| +bqueue_empty(bqueue_t *q) | |
| +{ | |
| + return (q->bq_size == 0); | |
| +} | |
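
For illustration (not part of the patch): the producer/consumer pattern bqueue is built for, mirroring how dmu_send.c uses it further down. The record type, queue size, and thread setup are assumptions made for this sketch.

#include <sys/bqueue.h>
#include <sys/zfs_context.h>

typedef struct demo_record {
	boolean_t eos_marker;	/* marks the last record in the stream */
	uint64_t payload;
	bqueue_node_t ln;	/* required embedded queue node */
} demo_record_t;

static void
demo_producer(void *arg)
{
	bqueue_t *q = arg;

	for (uint64_t i = 0; i < 100; i++) {
		demo_record_t *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
		rec->payload = i;
		/* Blocks if the queue already holds 1MB worth of records. */
		bqueue_enqueue(q, rec, sizeof (*rec));
	}
	demo_record_t *eos = kmem_zalloc(sizeof (*eos), KM_SLEEP);
	eos->eos_marker = B_TRUE;
	bqueue_enqueue(q, eos, 1);
}

static void
demo_consumer(bqueue_t *q)
{
	demo_record_t *rec = bqueue_dequeue(q);

	while (!rec->eos_marker) {
		/* ... process rec->payload ... */
		kmem_free(rec, sizeof (*rec));
		rec = bqueue_dequeue(q);
	}
	kmem_free(rec, sizeof (*rec));
}

static void
demo(void)
{
	bqueue_t q;

	VERIFY0(bqueue_init(&q, 1 << 20, offsetof(demo_record_t, ln)));
	(void) thread_create(NULL, 0, demo_producer, &q, 0, curproc,
	    TS_RUN, minclsyspri);
	demo_consumer(&q);
	bqueue_destroy(&q);
}
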
| diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c | |
| index 7914800..623d57a 100644 | |
| --- a/module/zfs/dbuf.c | |
| +++ b/module/zfs/dbuf.c | |
| @@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) | |
| return (abuf); | |
| } | |
| +/* | |
| + * Calculate which level n block references the data at the level 0 offset | |
| + * provided. | |
| + */ | |
| uint64_t | |
| -dbuf_whichblock(dnode_t *dn, uint64_t offset) | |
| +dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) | |
| { | |
| - if (dn->dn_datablkshift) { | |
| - return (offset >> dn->dn_datablkshift); | |
| + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { | |
| + /* | |
| + * The level n blkid is equal to the level 0 blkid divided by | |
| + * the number of level 0s in a level n block. | |
| + * | |
| + * The level 0 blkid is offset >> datablkshift = | |
| + * offset / 2^datablkshift. | |
| + * | |
| + * The number of level 0s in a level n is the number of block | |
| + * pointers in an indirect block, raised to the power of level. | |
| + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = | |
| + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). | |
| + * | |
| + * Thus, the level n blkid is: offset / | |
| + * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) | |
| + * = offset / 2^(datablkshift + level * | |
| + * (indblkshift - SPA_BLKPTRSHIFT)) | |
| + * = offset >> (datablkshift + level * | |
| + * (indblkshift - SPA_BLKPTRSHIFT)) | |
| + */ | |
| + return (offset >> (dn->dn_datablkshift + level * | |
| + (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); | |
| } else { | |
| ASSERT3U(offset, <, dn->dn_datablksz); | |
| return (0); | |
| @@ -1715,6 +1739,12 @@ dbuf_clear(dmu_buf_impl_t *db) | |
| dbuf_rele(parent, db); | |
| } | |
| +/* | |
| + * Note: While bpp will always be updated if the function returns success, | |
| + * parentp will not be updated if the dnode does not have dn_dbuf filled in; | |
| + * this happens when the dnode is the meta-dnode, or a userused or groupused | |
| + * object. | |
| + */ | |
| static int | |
| dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | |
| dmu_buf_impl_t **parentp, blkptr_t **bpp) | |
| @@ -1755,7 +1785,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | |
| } else if (level < nlevels-1) { | |
| /* this block is referenced from an indirect block */ | |
| int err = dbuf_hold_impl(dn, level+1, | |
| - blkid >> epbs, fail_sparse, NULL, parentp); | |
| + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); | |
| if (err) | |
| return (err); | |
| err = dbuf_read(*parentp, NULL, | |
| @@ -1930,11 +1960,96 @@ dbuf_destroy(dmu_buf_impl_t *db) | |
| arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); | |
| } | |
| +typedef struct dbuf_prefetch_arg { | |
| + spa_t *dpa_spa; /* The spa to issue the prefetch in. */ | |
| + zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ | |
| + int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ | |
| + int dpa_curlevel; /* The current level that we're reading */ | |
| + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ | |
| + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ | |
| + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ | |
| +} dbuf_prefetch_arg_t; | |
| + | |
| +/* | |
| + * Actually issue the prefetch read for the block given. | |
| + */ | |
| +static void | |
| +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) | |
| +{ | |
| + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| + return; | |
| + | |
| + arc_flags_t aflags = | |
| + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | |
| + | |
| + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); | |
| + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); | |
| + ASSERT(dpa->dpa_zio != NULL); | |
| + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, | |
| + dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | |
| + &aflags, &dpa->dpa_zb); | |
| +} | |
| + | |
| +/* | |
| + * Called when an indirect block above our prefetch target is read in. This | |
| + * will either read in the next indirect block down the tree or issue the actual | |
| + * prefetch if the next block down is our target. | |
| + */ | |
| +static void | |
| +dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) | |
| +{ | |
| + dbuf_prefetch_arg_t *dpa = private; | |
| + | |
| + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); | |
| + ASSERT3S(dpa->dpa_curlevel, >, 0); | |
| + if (zio != NULL) { | |
| + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); | |
| + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); | |
| + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); | |
| + } | |
| + | |
| + dpa->dpa_curlevel--; | |
| + | |
| + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> | |
| + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); | |
| + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + | |
| + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); | |
| + if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { | |
| + kmem_free(dpa, sizeof (*dpa)); | |
| + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { | |
| + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); | |
| + dbuf_issue_final_prefetch(dpa, bp); | |
| + kmem_free(dpa, sizeof (*dpa)); | |
| + } else { | |
| + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; | |
| + zbookmark_phys_t zb; | |
| + | |
| + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); | |
| + | |
| + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, | |
| + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); | |
| + | |
| + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, | |
| + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, | |
| + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | |
| + &iter_aflags, &zb); | |
| + } | |
| + (void) arc_buf_remove_ref(abuf, private); | |
| +} | |
| + | |
| +/* | |
| + * Issue prefetch reads for the given block on the given level. If the indirect | |
| + * blocks above that block are not in memory, we will read them in | |
| + * asynchronously. As a result, this call never blocks waiting for a read to | |
| + * complete. | |
| + */ | |
| void | |
| -dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | |
| +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, | |
| + arc_flags_t aflags) | |
| { | |
| - dmu_buf_impl_t *db = NULL; | |
| - blkptr_t *bp = NULL; | |
| + blkptr_t bp; | |
| + int epbs, nlevels, curlevel; | |
| + uint64_t curblkid; | |
| ASSERT(blkid != DMU_BONUS_BLKID); | |
| ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); | |
| @@ -1942,35 +2057,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | |
| if (dnode_block_freed(dn, blkid)) | |
| return; | |
| - /* dbuf_find() returns with db_mtx held */ | |
| - if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { | |
| + /* | |
| + * This dnode hasn't been written to disk yet, so there's nothing to | |
| + * prefetch. | |
| + */ | |
| + nlevels = dn->dn_phys->dn_nlevels; | |
| + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) | |
| + return; | |
| + | |
| + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; | |
| + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) | |
| + return; | |
| + | |
| + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, | |
| + level, blkid); | |
| + if (db != NULL) { | |
| + mutex_exit(&db->db_mtx); | |
| /* | |
| - * This dbuf is already in the cache. We assume that | |
| - * it is already CACHED, or else about to be either | |
| - * read or filled. | |
| + * This dbuf already exists. It is either CACHED, or | |
| + * (we assume) about to be read or filled. | |
| */ | |
| - mutex_exit(&db->db_mtx); | |
| return; | |
| } | |
| - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { | |
| - if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { | |
| - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; | |
| - arc_flags_t aflags = | |
| - ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | |
| - zbookmark_phys_t zb; | |
| + /* | |
| + * Find the closest ancestor (indirect block) of the target block | |
| + * that is present in the cache. In this indirect block, we will | |
| + * find the bp that is at curlevel, curblkid. | |
| + */ | |
| + curlevel = level; | |
| + curblkid = blkid; | |
| + while (curlevel < nlevels - 1) { | |
| + int parent_level = curlevel + 1; | |
| + uint64_t parent_blkid = curblkid >> epbs; | |
| + dmu_buf_impl_t *db; | |
| + | |
| + if (dbuf_hold_impl(dn, parent_level, parent_blkid, | |
| + FALSE, TRUE, FTAG, &db) == 0) { | |
| + blkptr_t *bpp = db->db_buf->b_data; | |
| + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; | |
| + dbuf_rele(db, FTAG); | |
| + break; | |
| + } | |
| - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, | |
| - dn->dn_object, 0, blkid); | |
| + curlevel = parent_level; | |
| + curblkid = parent_blkid; | |
| + } | |
| - (void) arc_read(NULL, dn->dn_objset->os_spa, | |
| - bp, NULL, NULL, prio, | |
| - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | |
| - &aflags, &zb); | |
| - } | |
| - if (db) | |
| - dbuf_rele(db, NULL); | |
| + if (curlevel == nlevels - 1) { | |
| + /* No cached indirect blocks found. */ | |
| + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); | |
| + bp = dn->dn_phys->dn_blkptr[curblkid]; | |
| } | |
| + if (BP_IS_HOLE(&bp)) | |
| + return; | |
| + | |
| + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); | |
| + | |
| + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, | |
| + ZIO_FLAG_CANFAIL); | |
| + | |
| + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); | |
| + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; | |
| + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, | |
| + dn->dn_object, level, blkid); | |
| + dpa->dpa_curlevel = curlevel; | |
| + dpa->dpa_prio = prio; | |
| + dpa->dpa_aflags = aflags; | |
| + dpa->dpa_spa = dn->dn_objset->os_spa; | |
| + dpa->dpa_epbs = epbs; | |
| + dpa->dpa_zio = pio; | |
| + | |
| + /* | |
| + * If we have the indirect just above us, no need to do the asynchronous | |
| + * prefetch chain; we'll just run the last step ourselves. If we're at | |
| + * a higher level, though, we want to issue the prefetches for all the | |
| + * indirect blocks asynchronously, so we can go on with whatever we were | |
| + * doing. | |
| + */ | |
| + if (curlevel == level) { | |
| + ASSERT3U(curblkid, ==, blkid); | |
| + dbuf_issue_final_prefetch(dpa, &bp); | |
| + kmem_free(dpa, sizeof (*dpa)); | |
| + } else { | |
| + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; | |
| + zbookmark_phys_t zb; | |
| + | |
| + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, | |
| + dn->dn_object, curlevel, curblkid); | |
| + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, | |
| + &bp, dbuf_prefetch_indirect_done, dpa, prio, | |
| + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | |
| + &iter_aflags, &zb); | |
| + } | |
| + /* | |
| + * We use pio here instead of dpa_zio since it's possible that | |
| + * dpa may have already been freed. | |
| + */ | |
| + zio_nowait(pio); | |
| } | |
| /* | |
| @@ -1978,7 +2162,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | |
| * Note: dn_struct_rwlock must be held. | |
| */ | |
| int | |
| -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |
| +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, | |
| + boolean_t fail_sparse, boolean_t fail_uncached, | |
| void *tag, dmu_buf_impl_t **dbp) | |
| { | |
| dmu_buf_impl_t *db, *parent = NULL; | |
| @@ -1996,6 +2181,9 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |
| blkptr_t *bp = NULL; | |
| int err; | |
| + if (fail_uncached) | |
| + return (SET_ERROR(ENOENT)); | |
| + | |
| ASSERT3P(parent, ==, NULL); | |
| err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); | |
| if (fail_sparse) { | |
| @@ -2012,6 +2200,11 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |
| db = dbuf_create(dn, level, blkid, parent, bp); | |
| } | |
| + if (fail_uncached && db->db_state != DB_CACHED) { | |
| + mutex_exit(&db->db_mtx); | |
| + return (SET_ERROR(ENOENT)); | |
| + } | |
| + | |
| if (db->db_buf && refcount_is_zero(&db->db_holds)) { | |
| arc_buf_add_ref(db->db_buf, db); | |
| if (db->db_buf->b_data == NULL) { | |
| @@ -2067,16 +2260,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |
| dmu_buf_impl_t * | |
| dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) | |
| { | |
| - dmu_buf_impl_t *db; | |
| - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); | |
| - return (err ? NULL : db); | |
| + return (dbuf_hold_level(dn, 0, blkid, tag)); | |
| } | |
| dmu_buf_impl_t * | |
| dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) | |
| { | |
| dmu_buf_impl_t *db; | |
| - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); | |
| + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); | |
| return (err ? NULL : db); | |
| } | |
| @@ -2429,8 +2620,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) | |
| if (parent == NULL) { | |
| mutex_exit(&db->db_mtx); | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| - (void) dbuf_hold_impl(dn, db->db_level+1, | |
| - db->db_blkid >> epbs, FALSE, db, &parent); | |
| + parent = dbuf_hold_level(dn, db->db_level + 1, | |
| + db->db_blkid >> epbs, db); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| mutex_enter(&db->db_mtx); | |
| db->db_parent = parent; | |
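
To make the new dbuf_whichblock() arithmetic concrete, here is a small standalone check (illustrative, not part of the patch); the shift values are typical but chosen purely for the example.

#include <stdint.h>
#include <assert.h>

#define	SPA_BLKPTRSHIFT	7	/* blkptr_t is 128 bytes */

static uint64_t
whichblock(uint64_t offset, int level, int datablkshift, int indblkshift)
{
	return (offset >> (datablkshift +
	    level * (indblkshift - SPA_BLKPTRSHIFT)));
}

int
main(void)
{
	/* 128K data blocks, 16K indirect blocks => 128 blkptrs per indirect. */
	assert(whichblock(0, 2, 17, 14) == 0);
	/* Each level-2 indirect block spans 2^31 bytes (2 GiB) of file data. */
	assert(whichblock((1ULL << 31) - 1, 2, 17, 14) == 0);
	assert(whichblock(1ULL << 31, 2, 17, 14) == 1);
	return (0);
}

In words: with 128K data blocks and 16K indirect blocks (128 block pointers each), every level-2 indirect block covers 128K * 128 * 128 = 2 GiB of file data.
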
| diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c | |
| index 43bcfee..582542e 100644 | |
| --- a/module/zfs/dmu.c | |
| +++ b/module/zfs/dmu.c | |
| @@ -138,7 +138,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, | |
| err = dnode_hold(os, object, FTAG, &dn); | |
| if (err) | |
| return (err); | |
| - blkid = dbuf_whichblock(dn, offset); | |
| + blkid = dbuf_whichblock(dn, 0, offset); | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| db = dbuf_hold(dn, blkid, tag); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| @@ -421,7 +421,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, | |
| dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); | |
| zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
| - blkid = dbuf_whichblock(dn, offset); | |
| + blkid = dbuf_whichblock(dn, 0, offset); | |
| for (i = 0; i < nblks; i++) { | |
| dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); | |
| if (db == NULL) { | |
| @@ -522,17 +522,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) | |
| } | |
| /* | |
| - * Issue prefetch i/os for the given blocks. | |
| + * Issue prefetch i/os for the given blocks. If level is greater than 0, the | |
| + * indirect blocks prefetched will be those that point to the blocks containing | |
| + * the data starting at offset, and continuing to offset + len. | |
| * | |
| - * Note: The assumption is that we *know* these blocks will be needed | |
| - * almost immediately. Therefore, the prefetch i/os will be issued at | |
| - * ZIO_PRIORITY_SYNC_READ | |
| - * | |
| - * Note: indirect blocks and other metadata will be read synchronously, | |
| - * causing this function to block if they are not already cached. | |
| + * Note that if the indirect blocks above the blocks being prefetched are not in | |
| + * cache, they will be asynchronously read in. | |
| */ | |
| void | |
| -dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |
| +dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, | |
| + uint64_t len, zio_priority_t pri) | |
| { | |
| dnode_t *dn; | |
| uint64_t blkid; | |
| @@ -548,8 +547,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |
| return; | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); | |
| - dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); | |
| + blkid = dbuf_whichblock(dn, level, | |
| + object * sizeof (dnode_phys_t)); | |
| + dbuf_prefetch(dn, level, blkid, pri, 0); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| return; | |
| } | |
| @@ -564,18 +564,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |
| return; | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| - if (dn->dn_datablkshift) { | |
| - int blkshift = dn->dn_datablkshift; | |
| - nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - | |
| - P2ALIGN(offset, 1 << blkshift)) >> blkshift; | |
| + /* | |
| + * offset + len - 1 is the last byte we want to prefetch for, and offset | |
| + * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the | |
| + * last block we want to prefetch, and dbuf_whichblock(dn, level, | |
| + * offset) is the first. Then the number we need to prefetch is the | |
| + * last - first + 1. | |
| + */ | |
| + if (level > 0 || dn->dn_datablkshift != 0) { | |
| + nblks = dbuf_whichblock(dn, level, offset + len - 1) - | |
| + dbuf_whichblock(dn, level, offset) + 1; | |
| } else { | |
| nblks = (offset < dn->dn_datablksz); | |
| } | |
| if (nblks != 0) { | |
| - blkid = dbuf_whichblock(dn, offset); | |
| + blkid = dbuf_whichblock(dn, level, offset); | |
| for (int i = 0; i < nblks; i++) | |
| - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); | |
| + dbuf_prefetch(dn, level, blkid + i, pri, 0); | |
| } | |
| rw_exit(&dn->dn_struct_rwlock); | |
| @@ -1325,7 +1331,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, | |
| DB_DNODE_ENTER(dbuf); | |
| dn = DB_DNODE(dbuf); | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| - blkid = dbuf_whichblock(dn, offset); | |
| + blkid = dbuf_whichblock(dn, 0, offset); | |
| VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| DB_DNODE_EXIT(dbuf); | |
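
A short sketch (not part of the patch) of calls against the new dmu_prefetch() signature; the object number, ranges, and priorities are hypothetical.

#include <sys/dmu.h>
#include <sys/zio.h>

/* Hypothetical helper showing the new (level, offset, len, priority) form. */
static void
prefetch_example(objset_t *os)
{
	/* Prefetch the level-0 data blocks covering bytes [0, 1M) of object 42. */
	dmu_prefetch(os, 42, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ);

	/* Prefetch only the level-1 indirect blocks pointing at that range. */
	dmu_prefetch(os, 42, 1, 0, 1ULL << 20, ZIO_PRIORITY_ASYNC_READ);
}
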
| diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c | |
| index 91415d0..7665d1c 100644 | |
| --- a/module/zfs/dmu_diff.c | |
| +++ b/module/zfs/dmu_diff.c | |
| @@ -115,7 +115,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| if (issig(JUSTLOOKING) && issig(FORREAL)) | |
| return (SET_ERROR(EINTR)); | |
| - if (zb->zb_object != DMU_META_DNODE_OBJECT) | |
| + if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) | |
| return (0); | |
| if (BP_IS_HOLE(bp)) { | |
| diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c | |
| index 808864a..6ca021e 100644 | |
| --- a/module/zfs/dmu_object.c | |
| +++ b/module/zfs/dmu_object.c | |
| @@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) | |
| return (0); | |
| } | |
| +/* | |
| + * Return (in *objectp) the next object which is allocated (or a hole) | |
| + * after *object, taking into account only objects that may have been modified | |
| + * after the specified txg. | |
| + */ | |
| int | |
| dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) | |
| { | |
| diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c | |
| index 7d728c6..ca09d10 100644 | |
| --- a/module/zfs/dmu_send.c | |
| +++ b/module/zfs/dmu_send.c | |
| @@ -52,13 +52,38 @@ | |
| #include <sys/blkptr.h> | |
| #include <sys/dsl_bookmark.h> | |
| #include <sys/zfeature.h> | |
| +#include <sys/bqueue.h> | |
| /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ | |
| int zfs_send_corrupt_data = B_FALSE; | |
| +int zfs_send_queue_length = 16 * 1024 * 1024; | |
| +int zfs_recv_queue_length = 16 * 1024 * 1024; | |
| static char *dmu_recv_tag = "dmu_recv_tag"; | |
| static const char *recv_clone_name = "%recv"; | |
| +#define BP_SPAN(datablkszsec, indblkshift, level) \ | |
| + (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ | |
| + (level) * (indblkshift - SPA_BLKPTRSHIFT))) | |
| + | |
| +struct send_thread_arg { | |
| + bqueue_t q; | |
| + dsl_dataset_t *ds; /* Dataset to traverse */ | |
| + uint64_t fromtxg; /* Traverse from this txg */ | |
| + int flags; /* flags to pass to traverse_dataset */ | |
| + int error_code; | |
| + boolean_t cancel; | |
| +}; | |
| + | |
| +struct send_block_record { | |
| + boolean_t eos_marker; /* Marks the end of the stream */ | |
| + blkptr_t bp; | |
| + zbookmark_phys_t zb; | |
| + uint8_t indblkshift; | |
| + uint16_t datablkszsec; | |
| + bqueue_node_t ln; | |
| +}; | |
| + | |
| static int | |
| dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) | |
| { | |
| @@ -434,58 +459,115 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) | |
| return (B_FALSE); | |
| } | |
| -#define BP_SPAN(dnp, level) \ | |
| - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ | |
| - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) | |
| +/* | |
| + * This is the callback function to traverse_dataset that acts as the worker | |
| + * thread for dmu_send_impl. | |
| + */ | |
| +/*ARGSUSED*/ | |
| +static int | |
| +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) | |
| +{ | |
| + struct send_thread_arg *sta = arg; | |
| + struct send_block_record *record; | |
| + uint64_t record_size; | |
| + int err = 0; | |
| -/* ARGSUSED */ | |
| + if (sta->cancel) | |
| + return (SET_ERROR(EINTR)); | |
| + | |
| + if (bp == NULL) { | |
| + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); | |
| + return (0); | |
| + } else if (zb->zb_level < 0) { | |
| + return (0); | |
| + } | |
| + | |
| + record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); | |
| + record->eos_marker = B_FALSE; | |
| + record->bp = *bp; | |
| + record->zb = *zb; | |
| + record->indblkshift = dnp->dn_indblkshift; | |
| + record->datablkszsec = dnp->dn_datablkszsec; | |
| + record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; | |
| + bqueue_enqueue(&sta->q, record, record_size); | |
| + | |
| + return (err); | |
| +} | |
| + | |
| +/* | |
| + * This function kicks off the traverse_dataset. It also handles setting the | |
| + * error code of the thread in case something goes wrong, and pushes the End of | |
| + * Stream record when the traverse_dataset call has finished. If there is no | |
| + * dataset to traverse, the thread immediately pushes End of Stream marker. | |
| + */ | |
| +static void | |
| +send_traverse_thread(void *arg) | |
| +{ | |
| + struct send_thread_arg *st_arg = arg; | |
| + int err; | |
| + struct send_block_record *data; | |
| + | |
| + if (st_arg->ds != NULL) { | |
| + err = traverse_dataset(st_arg->ds, st_arg->fromtxg, | |
| + st_arg->flags, send_cb, arg); | |
| + if (err != EINTR) | |
| + st_arg->error_code = err; | |
| + } | |
| + data = kmem_zalloc(sizeof (*data), KM_SLEEP); | |
| + data->eos_marker = B_TRUE; | |
| + bqueue_enqueue(&st_arg->q, data, 1); | |
| +} | |
| + | |
| +/* | |
| + * This function actually handles figuring out what kind of record needs to be | |
| + * dumped, reading the data (which has hopefully been prefetched), and calling | |
| + * the appropriate helper function. | |
| + */ | |
| static int | |
| -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) | |
| +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) | |
| { | |
| - dmu_sendarg_t *dsp = arg; | |
| + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); | |
| + const blkptr_t *bp = &data->bp; | |
| + const zbookmark_phys_t *zb = &data->zb; | |
| + uint8_t indblkshift = data->indblkshift; | |
| + uint16_t dblkszsec = data->datablkszsec; | |
| + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; | |
| dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; | |
| int err = 0; | |
| - if (issig(JUSTLOOKING) && issig(FORREAL)) | |
| - return (SET_ERROR(EINTR)); | |
| + ASSERT3U(zb->zb_level, >=, 0); | |
| if (zb->zb_object != DMU_META_DNODE_OBJECT && | |
| DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { | |
| return (0); | |
| - } else if (zb->zb_level == ZB_ZIL_LEVEL) { | |
| - /* | |
| - * If we are sending a non-snapshot (which is allowed on | |
| - * read-only pools), it may have a ZIL, which must be ignored. | |
| - */ | |
| - return (0); | |
| } else if (BP_IS_HOLE(bp) && | |
| zb->zb_object == DMU_META_DNODE_OBJECT) { | |
| - uint64_t span = BP_SPAN(dnp, zb->zb_level); | |
| + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); | |
| uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; | |
| - err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); | |
| + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); | |
| } else if (BP_IS_HOLE(bp)) { | |
| - uint64_t span = BP_SPAN(dnp, zb->zb_level); | |
| - err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); | |
| + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); | |
| + uint64_t offset = zb->zb_blkid * span; | |
| + err = dump_free(dsa, zb->zb_object, offset, span); | |
| } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { | |
| return (0); | |
| } else if (type == DMU_OT_DNODE) { | |
| - dnode_phys_t *blk; | |
| - int i; | |
| int blksz = BP_GET_LSIZE(bp); | |
| arc_flags_t aflags = ARC_FLAG_WAIT; | |
| arc_buf_t *abuf; | |
| + ASSERT0(zb->zb_level); | |
| + | |
| if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, | |
| ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
| &aflags, zb) != 0) | |
| return (SET_ERROR(EIO)); | |
| - blk = abuf->b_data; | |
| - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { | |
| - uint64_t dnobj = (zb->zb_blkid << | |
| - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; | |
| - err = dump_dnode(dsp, dnobj, blk+i); | |
| + dnode_phys_t *blk = abuf->b_data; | |
| + uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); | |
| + for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { | |
| + err = dump_dnode(dsa, dnobj + i, blk + i); | |
| if (err != 0) | |
| break; | |
| } | |
| @@ -500,20 +582,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| &aflags, zb) != 0) | |
| return (SET_ERROR(EIO)); | |
| - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); | |
| + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); | |
| (void) arc_buf_remove_ref(abuf, &abuf); | |
| - } else if (backup_do_embed(dsp, bp)) { | |
| + } else if (backup_do_embed(dsa, bp)) { | |
| /* it's an embedded level-0 block of a regular object */ | |
| - int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; | |
| - err = dump_write_embedded(dsp, zb->zb_object, | |
| + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; | |
| + ASSERT0(zb->zb_level); | |
| + err = dump_write_embedded(dsa, zb->zb_object, | |
| zb->zb_blkid * blksz, blksz, bp); | |
| - } else { /* it's a level-0 block of a regular object */ | |
| + } else { | |
| + /* it's a level-0 block of a regular object */ | |
| arc_flags_t aflags = ARC_FLAG_WAIT; | |
| arc_buf_t *abuf; | |
| - int blksz = BP_GET_LSIZE(bp); | |
| + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; | |
| uint64_t offset; | |
| - ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); | |
| ASSERT0(zb->zb_level); | |
| if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, | |
| ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
| @@ -534,20 +617,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| offset = zb->zb_blkid * blksz; | |
| - if (!(dsp->dsa_featureflags & | |
| + if (!(dsa->dsa_featureflags & | |
| DMU_BACKUP_FEATURE_LARGE_BLOCKS) && | |
| blksz > SPA_OLD_MAXBLOCKSIZE) { | |
| char *buf = abuf->b_data; | |
| while (blksz > 0 && err == 0) { | |
| int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); | |
| - err = dump_write(dsp, type, zb->zb_object, | |
| + err = dump_write(dsa, type, zb->zb_object, | |
| offset, n, NULL, buf); | |
| offset += n; | |
| buf += n; | |
| blksz -= n; | |
| } | |
| } else { | |
| - err = dump_write(dsp, type, zb->zb_object, | |
| + err = dump_write(dsa, type, zb->zb_object, | |
| offset, blksz, bp, abuf->b_data); | |
| } | |
| (void) arc_buf_remove_ref(abuf, &abuf); | |
| @@ -558,11 +641,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| } | |
| /* | |
| - * Releases dp using the specified tag. | |
| + * Pop the new data off the queue, and free the old data. | |
| + */ | |
| +static struct send_block_record * | |
| +get_next_record(bqueue_t *bq, struct send_block_record *data) | |
| +{ | |
| + struct send_block_record *tmp = bqueue_dequeue(bq); | |
| + kmem_free(data, sizeof (*data)); | |
| + return (tmp); | |
| +} | |
| + | |
| +/* | |
| + * Actually do the bulk of the work in a zfs send. | |
| + * | |
| + * Note: Releases dp using the specified tag. | |
| */ | |
| static int | |
| -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, | |
| +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, | |
| + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, | |
| boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) | |
| { | |
| objset_t *os; | |
| @@ -571,8 +667,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| int err; | |
| uint64_t fromtxg = 0; | |
| uint64_t featureflags = 0; | |
| + struct send_thread_arg to_arg; | |
| - err = dmu_objset_from_ds(ds, &os); | |
| + err = dmu_objset_from_ds(to_ds, &os); | |
| if (err != 0) { | |
| dsl_pool_rele(dp, tag); | |
| return (err); | |
| @@ -598,35 +695,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| } | |
| #endif | |
| - if (large_block_ok && ds->ds_large_blocks) | |
| + if (large_block_ok && to_ds->ds_large_blocks) | |
| featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; | |
| if (embedok && | |
| spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { | |
| featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; | |
| if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) | |
| featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; | |
| - } else { | |
| - embedok = B_FALSE; | |
| } | |
| DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, | |
| featureflags); | |
| drr->drr_u.drr_begin.drr_creation_time = | |
| - dsl_dataset_phys(ds)->ds_creation_time; | |
| + dsl_dataset_phys(to_ds)->ds_creation_time; | |
| drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); | |
| if (is_clone) | |
| drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; | |
| - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; | |
| - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) | |
| + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; | |
| + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) | |
| drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; | |
| - if (fromzb != NULL) { | |
| - drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; | |
| - fromtxg = fromzb->zbm_creation_txg; | |
| + if (ancestor_zb != NULL) { | |
| + drr->drr_u.drr_begin.drr_fromguid = | |
| + ancestor_zb->zbm_guid; | |
| + fromtxg = ancestor_zb->zbm_creation_txg; | |
| } | |
| - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); | |
| - if (!ds->ds_is_snapshot) { | |
| + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); | |
| + if (!to_ds->ds_is_snapshot) { | |
| (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", | |
| sizeof (drr->drr_u.drr_begin.drr_toname)); | |
| } | |
| @@ -639,16 +735,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| dsp->dsa_proc = curproc; | |
| dsp->dsa_os = os; | |
| dsp->dsa_off = off; | |
| - dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; | |
| + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; | |
| dsp->dsa_pending_op = PENDING_NONE; | |
| - dsp->dsa_incremental = (fromzb != NULL); | |
| + dsp->dsa_incremental = (ancestor_zb != NULL); | |
| dsp->dsa_featureflags = featureflags; | |
| - mutex_enter(&ds->ds_sendstream_lock); | |
| - list_insert_head(&ds->ds_sendstreams, dsp); | |
| - mutex_exit(&ds->ds_sendstream_lock); | |
| + mutex_enter(&to_ds->ds_sendstream_lock); | |
| + list_insert_head(&to_ds->ds_sendstreams, dsp); | |
| + mutex_exit(&to_ds->ds_sendstream_lock); | |
| - dsl_dataset_long_hold(ds, FTAG); | |
| + dsl_dataset_long_hold(to_ds, FTAG); | |
| dsl_pool_rele(dp, tag); | |
| if (dump_record(dsp, NULL, 0) != 0) { | |
| @@ -656,8 +752,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| goto out; | |
| } | |
| - err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, | |
| - backup_cb, dsp); | |
| + err = bqueue_init(&to_arg.q, zfs_send_queue_length, | |
| + offsetof(struct send_block_record, ln)); | |
| + to_arg.error_code = 0; | |
| + to_arg.cancel = B_FALSE; | |
| + to_arg.ds = to_ds; | |
| + to_arg.fromtxg = fromtxg; | |
| + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; | |
| + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, | |
| + TS_RUN, minclsyspri); | |
| + | |
| + struct send_block_record *to_data; | |
| + to_data = bqueue_dequeue(&to_arg.q); | |
| + | |
| + while (!to_data->eos_marker && err == 0) { | |
| + err = do_dump(dsp, to_data); | |
| + to_data = get_next_record(&to_arg.q, to_data); | |
| + if (issig(JUSTLOOKING) && issig(FORREAL)) | |
| + err = EINTR; | |
| + } | |
| + | |
| + if (err != 0) { | |
| + to_arg.cancel = B_TRUE; | |
| + while (!to_data->eos_marker) { | |
| + to_data = get_next_record(&to_arg.q, to_data); | |
| + } | |
| + } | |
| + kmem_free(to_data, sizeof (*to_data)); | |
| + | |
| + bqueue_destroy(&to_arg.q); | |
| + | |
| + if (err == 0 && to_arg.error_code != 0) | |
| + err = to_arg.error_code; | |
| + | |
| + if (err != 0) | |
| + goto out; | |
| if (dsp->dsa_pending_op != PENDING_NONE) | |
| if (dump_record(dsp, NULL, 0) != 0) | |
| @@ -674,20 +803,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, | |
| drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; | |
| drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; | |
| - if (dump_record(dsp, NULL, 0) != 0) { | |
| + if (dump_record(dsp, NULL, 0) != 0) | |
| err = dsp->dsa_err; | |
| - goto out; | |
| - } | |
| out: | |
| - mutex_enter(&ds->ds_sendstream_lock); | |
| - list_remove(&ds->ds_sendstreams, dsp); | |
| - mutex_exit(&ds->ds_sendstream_lock); | |
| + mutex_enter(&to_ds->ds_sendstream_lock); | |
| + list_remove(&to_ds->ds_sendstreams, dsp); | |
| + mutex_exit(&to_ds->ds_sendstream_lock); | |
| kmem_free(drr, sizeof (dmu_replay_record_t)); | |
| kmem_free(dsp, sizeof (dmu_sendarg_t)); | |
| - dsl_dataset_long_rele(ds, FTAG); | |
| + dsl_dataset_long_rele(to_ds, FTAG); | |
| return (err); | |
| } | |
| @@ -1110,7 +1237,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) | |
| * If it's a non-clone incremental, we are missing the | |
| * target fs, so fail the recv. | |
| */ | |
| - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) | |
| + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || | |
| + drba->drba_origin)) | |
| return (SET_ERROR(ENOENT)); | |
| /* Open the parent of tofs */ | |
| @@ -1292,21 +1420,57 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, | |
| &drba, 5, ZFS_SPACE_CHECK_NORMAL)); | |
| } | |
| -struct restorearg { | |
| +struct receive_record_arg { | |
| + dmu_replay_record_t header; | |
| + void *payload; /* Pointer to a buffer containing the payload */ | |
| + /* | |
| + * If the record is a write, pointer to the arc_buf_t containing the | |
| + * payload. | |
| + */ | |
| + arc_buf_t *write_buf; | |
| + int payload_size; | |
| + boolean_t eos_marker; /* Marks the end of the stream */ | |
| + bqueue_node_t node; | |
| +}; | |
| + | |
| +struct receive_writer_arg { | |
| objset_t *os; | |
| - int err; | |
| boolean_t byteswap; | |
| - vnode_t *vp; | |
| - uint64_t voff; | |
| - int bufsize; /* amount of memory allocated for buf */ | |
| + bqueue_t q; | |
| + /* | |
| + * These three args are used to signal to the main thread that we're | |
| + * done. | |
| + */ | |
| + kmutex_t mutex; | |
| + kcondvar_t cv; | |
| + boolean_t done; | |
| + int err; | |
| + /* A map from guid to dataset to help handle dedup'd streams. */ | |
| + avl_tree_t *guid_to_ds_map; | |
| +}; | |
| - dmu_replay_record_t *drr; | |
| - dmu_replay_record_t *next_drr; | |
| - char *buf; | |
| +struct receive_arg { | |
| + objset_t *os; | |
| + vnode_t *vp; /* The vnode to read the stream from */ | |
| + uint64_t voff; /* The current offset in the stream */ | |
| + /* | |
| + * A record that has had its payload read in, but hasn't yet been handed | |
| + * off to the worker thread. | |
| + */ | |
| + struct receive_record_arg *rrd; | |
| + /* A record that has had its header read in, but not its payload. */ | |
| + struct receive_record_arg *next_rrd; | |
| zio_cksum_t cksum; | |
| zio_cksum_t prev_cksum; | |
| + int err; | |
| + boolean_t byteswap; | |
| + /* Sorted list of objects not to issue prefetches for. */ | |
| + list_t ignore_obj_list; | |
| +}; | |
| - avl_tree_t *guid_to_ds_map; | |
| +struct receive_ign_obj_node { | |
| + list_node_t node; | |
| + uint64_t object; | |
| }; | |
| typedef struct guid_map_entry { | |
| @@ -1345,13 +1509,12 @@ free_guid_map_onexit(void *arg) | |
| } | |
| static int | |
| -restore_read(struct restorearg *ra, int len, void *buf) | |
| +receive_read(struct receive_arg *ra, int len, void *buf) | |
| { | |
| int done = 0; | |
| /* some things will require 8-byte alignment, so everything must */ | |
| ASSERT0(len % 8); | |
| - ASSERT3U(len, <=, ra->bufsize); | |
| while (done < len) { | |
| ssize_t resid; | |
| @@ -1470,7 +1633,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) | |
| } | |
| static int | |
| -restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, | |
| + void *data) | |
| { | |
| dmu_object_info_t doi; | |
| dmu_tx_t *tx; | |
| @@ -1484,12 +1648,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || | |
| P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || | |
| drro->drr_blksz < SPA_MINBLOCKSIZE || | |
| - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || | |
| + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || | |
| drro->drr_bonuslen > DN_MAX_BONUSLEN) { | |
| return (SET_ERROR(EINVAL)); | |
| } | |
| - err = dmu_object_info(ra->os, drro->drr_object, &doi); | |
| + err = dmu_object_info(rwa->os, drro->drr_object, &doi); | |
| if (err != 0 && err != ENOENT) | |
| return (SET_ERROR(EINVAL)); | |
| @@ -1508,14 +1672,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| if (drro->drr_blksz != doi.doi_data_block_size || | |
| nblkptr < doi.doi_nblkptr) { | |
| - err = dmu_free_long_range(ra->os, drro->drr_object, | |
| + err = dmu_free_long_range(rwa->os, drro->drr_object, | |
| 0, DMU_OBJECT_END); | |
| if (err != 0) | |
| return (SET_ERROR(EINVAL)); | |
| } | |
| } | |
| - tx = dmu_tx_create(ra->os); | |
| + tx = dmu_tx_create(rwa->os); | |
| dmu_tx_hold_bonus(tx, object); | |
| err = dmu_tx_assign(tx, TXG_WAIT); | |
| if (err != 0) { | |
| @@ -1525,7 +1689,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| if (object == DMU_NEW_OBJECT) { | |
| /* currently free, want to be allocated */ | |
| - err = dmu_object_claim(ra->os, drro->drr_object, | |
| + err = dmu_object_claim(rwa->os, drro->drr_object, | |
| drro->drr_type, drro->drr_blksz, | |
| drro->drr_bonustype, drro->drr_bonuslen, tx); | |
| } else if (drro->drr_type != doi.doi_type || | |
| @@ -1533,7 +1697,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| drro->drr_bonustype != doi.doi_bonus_type || | |
| drro->drr_bonuslen != doi.doi_bonus_size) { | |
| /* currently allocated, but with different properties */ | |
| - err = dmu_object_reclaim(ra->os, drro->drr_object, | |
| + err = dmu_object_reclaim(rwa->os, drro->drr_object, | |
| drro->drr_type, drro->drr_blksz, | |
| drro->drr_bonustype, drro->drr_bonuslen, tx); | |
| } | |
| @@ -1542,20 +1706,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| return (SET_ERROR(EINVAL)); | |
| } | |
| - dmu_object_set_checksum(ra->os, drro->drr_object, | |
| + dmu_object_set_checksum(rwa->os, drro->drr_object, | |
| drro->drr_checksumtype, tx); | |
| - dmu_object_set_compress(ra->os, drro->drr_object, | |
| + dmu_object_set_compress(rwa->os, drro->drr_object, | |
| drro->drr_compress, tx); | |
| if (data != NULL) { | |
| dmu_buf_t *db; | |
| - VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); | |
| + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); | |
| dmu_buf_will_dirty(db, tx); | |
| ASSERT3U(db->db_size, >=, drro->drr_bonuslen); | |
| bcopy(data, db->db_data, drro->drr_bonuslen); | |
| - if (ra->byteswap) { | |
| + if (rwa->byteswap) { | |
| dmu_object_byteswap_t byteswap = | |
| DMU_OT_BYTESWAP(drro->drr_bonustype); | |
| dmu_ot_byteswap[byteswap].ob_func(db->db_data, | |
| @@ -1569,7 +1733,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) | |
| /* ARGSUSED */ | |
| static int | |
| -restore_freeobjects(struct restorearg *ra, | |
| +receive_freeobjects(struct receive_writer_arg *rwa, | |
| struct drr_freeobjects *drrfo) | |
| { | |
| uint64_t obj; | |
| @@ -1579,13 +1743,13 @@ restore_freeobjects(struct restorearg *ra, | |
| for (obj = drrfo->drr_firstobj; | |
| obj < drrfo->drr_firstobj + drrfo->drr_numobjs; | |
| - (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { | |
| + (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { | |
| int err; | |
| - if (dmu_object_info(ra->os, obj, NULL) != 0) | |
| + if (dmu_object_info(rwa->os, obj, NULL) != 0) | |
| continue; | |
| - err = dmu_free_long_object(ra->os, obj); | |
| + err = dmu_free_long_object(rwa->os, obj); | |
| if (err != 0) | |
| return (err); | |
| } | |
| @@ -1593,7 +1757,8 @@ restore_freeobjects(struct restorearg *ra, | |
| } | |
| static int | |
| -restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) | |
| +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, | |
| + arc_buf_t *abuf) | |
| { | |
| dmu_tx_t *tx; | |
| int err; | |
| @@ -1602,10 +1767,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) | |
| !DMU_OT_IS_VALID(drrw->drr_type)) | |
| return (SET_ERROR(EINVAL)); | |
| - if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) | |
| + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) | |
| return (SET_ERROR(EINVAL)); | |
| - tx = dmu_tx_create(ra->os); | |
| + tx = dmu_tx_create(rwa->os); | |
| dmu_tx_hold_write(tx, drrw->drr_object, | |
| drrw->drr_offset, drrw->drr_length); | |
| @@ -1614,7 +1779,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) | |
| dmu_tx_abort(tx); | |
| return (err); | |
| } | |
| - if (ra->byteswap) { | |
| + if (rwa->byteswap) { | |
| dmu_object_byteswap_t byteswap = | |
| DMU_OT_BYTESWAP(drrw->drr_type); | |
| dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, | |
| @@ -1622,7 +1787,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) | |
| } | |
| dmu_buf_t *bonus; | |
| - if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) | |
| + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) | |
| return (SET_ERROR(EINVAL)); | |
| dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); | |
| dmu_tx_commit(tx); | |
| @@ -1638,7 +1803,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) | |
| * data from the stream to fulfill this write. | |
| */ | |
| static int | |
| -restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) | |
| +receive_write_byref(struct receive_writer_arg *rwa, | |
| + struct drr_write_byref *drrwbr) | |
| { | |
| dmu_tx_t *tx; | |
| int err; | |
| @@ -1657,14 +1823,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) | |
| */ | |
| if (drrwbr->drr_toguid != drrwbr->drr_refguid) { | |
| gmesrch.guid = drrwbr->drr_refguid; | |
| - if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, | |
| + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, | |
| &where)) == NULL) { | |
| return (SET_ERROR(EINVAL)); | |
| } | |
| if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) | |
| return (SET_ERROR(EINVAL)); | |
| } else { | |
| - ref_os = ra->os; | |
| + ref_os = rwa->os; | |
| } | |
| err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, | |
| @@ -1672,7 +1838,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) | |
| if (err != 0) | |
| return (err); | |
| - tx = dmu_tx_create(ra->os); | |
| + tx = dmu_tx_create(rwa->os); | |
| dmu_tx_hold_write(tx, drrwbr->drr_object, | |
| drrwbr->drr_offset, drrwbr->drr_length); | |
| @@ -1681,7 +1847,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) | |
| dmu_tx_abort(tx); | |
| return (err); | |
| } | |
| - dmu_write(ra->os, drrwbr->drr_object, | |
| + dmu_write(rwa->os, drrwbr->drr_object, | |
| drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); | |
| dmu_buf_rele(dbp, FTAG); | |
| dmu_tx_commit(tx); | |
| @@ -1689,7 +1855,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) | |
| } | |
| static int | |
| -restore_write_embedded(struct restorearg *ra, | |
| +receive_write_embedded(struct receive_writer_arg *rwa, | |
| struct drr_write_embedded *drrwnp, void *data) | |
| { | |
| dmu_tx_t *tx; | |
| @@ -1706,7 +1872,7 @@ restore_write_embedded(struct restorearg *ra, | |
| if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) | |
| return (EINVAL); | |
| - tx = dmu_tx_create(ra->os); | |
| + tx = dmu_tx_create(rwa->os); | |
| dmu_tx_hold_write(tx, drrwnp->drr_object, | |
| drrwnp->drr_offset, drrwnp->drr_length); | |
| @@ -1716,36 +1882,37 @@ restore_write_embedded(struct restorearg *ra, | |
| return (err); | |
| } | |
| - dmu_write_embedded(ra->os, drrwnp->drr_object, | |
| + dmu_write_embedded(rwa->os, drrwnp->drr_object, | |
| drrwnp->drr_offset, data, drrwnp->drr_etype, | |
| drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, | |
| - ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); | |
| + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); | |
| dmu_tx_commit(tx); | |
| return (0); | |
| } | |
| static int | |
| -restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) | |
| +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, | |
| + void *data) | |
| { | |
| dmu_tx_t *tx; | |
| dmu_buf_t *db, *db_spill; | |
| int err; | |
| if (drrs->drr_length < SPA_MINBLOCKSIZE || | |
| - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) | |
| + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) | |
| return (SET_ERROR(EINVAL)); | |
| - if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) | |
| + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) | |
| return (SET_ERROR(EINVAL)); | |
| - VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); | |
| + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); | |
| if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { | |
| dmu_buf_rele(db, FTAG); | |
| return (err); | |
| } | |
| - tx = dmu_tx_create(ra->os); | |
| + tx = dmu_tx_create(rwa->os); | |
| dmu_tx_hold_spill(tx, db->db_object); | |
| @@ -1772,7 +1939,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) | |
| /* ARGSUSED */ | |
| static int | |
| -restore_free(struct restorearg *ra, struct drr_free *drrf) | |
| +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) | |
| { | |
| int err; | |
| @@ -1780,11 +1947,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf) | |
| drrf->drr_offset + drrf->drr_length < drrf->drr_offset) | |
| return (SET_ERROR(EINVAL)); | |
| - if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) | |
| + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) | |
| return (SET_ERROR(EINVAL)); | |
| - err = dmu_free_long_range(ra->os, drrf->drr_object, | |
| + err = dmu_free_long_range(rwa->os, drrf->drr_object, | |
| drrf->drr_offset, drrf->drr_length); | |
| + | |
| return (err); | |
| } | |
| @@ -1799,7 +1967,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) | |
| } | |
| static void | |
| -restore_cksum(struct restorearg *ra, int len, void *buf) | |
| +receive_cksum(struct receive_arg *ra, int len, void *buf) | |
| { | |
| if (ra->byteswap) { | |
| fletcher_4_incremental_byteswap(buf, len, &ra->cksum); | |
| @@ -1809,30 +1977,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf) | |
| } | |
| /* | |
| - * If len != 0, read payload into buf. | |
| - * Read next record's header into ra->next_drr. | |
| + * Read the payload into a buffer of size len, and update the current record's | |
| + * payload field. | |
| + * Allocate ra->next_rrd and read the next record's header into | |
| + * ra->next_rrd->header. | |
| * Verify checksum of payload and next record. | |
| */ | |
| static int | |
| -restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) | |
| +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) | |
| { | |
| int err; | |
| if (len != 0) { | |
| - ASSERT3U(len, <=, ra->bufsize); | |
| - err = restore_read(ra, len, buf); | |
| + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); | |
| + ra->rrd->payload = buf; | |
| + ra->rrd->payload_size = len; | |
| + err = receive_read(ra, len, ra->rrd->payload); | |
| if (err != 0) | |
| return (err); | |
| - restore_cksum(ra, len, buf); | |
| + receive_cksum(ra, len, ra->rrd->payload); | |
| } | |
| ra->prev_cksum = ra->cksum; | |
| - err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); | |
| - if (err != 0) | |
| + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); | |
| + err = receive_read(ra, sizeof (ra->next_rrd->header), | |
| + &ra->next_rrd->header); | |
| + if (err != 0) { | |
| + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); | |
| + ra->next_rrd = NULL; | |
| return (err); | |
| - if (ra->next_drr->drr_type == DRR_BEGIN) | |
| + } | |
| + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { | |
| + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); | |
| + ra->next_rrd = NULL; | |
| return (SET_ERROR(EINVAL)); | |
| + } | |
| /* | |
| * Note: checksum is of everything up to but not including the | |
| @@ -1840,107 +2020,180 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) | |
| */ | |
| ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), | |
| ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); | |
| - restore_cksum(ra, | |
| + receive_cksum(ra, | |
| offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), | |
| - ra->next_drr); | |
| + &ra->next_rrd->header); | |
| - zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; | |
| - zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; | |
| + zio_cksum_t cksum_orig = | |
| + ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; | |
| + zio_cksum_t *cksump = | |
| + &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; | |
| if (ra->byteswap) | |
| - byteswap_record(ra->next_drr); | |
| + byteswap_record(&ra->next_rrd->header); | |
| if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && | |
| - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) | |
| + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { | |
| + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); | |
| + ra->next_rrd = NULL; | |
| return (SET_ERROR(ECKSUM)); | |
| + } | |
| - restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); | |
| + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); | |
| return (0); | |
| } | |
| +/* | |
| + * Issue the prefetch reads for any necessary indirect blocks. | |
| + * | |
| + * We use the object ignore list to tell us whether or not to issue prefetches | |
| + * for a given object. We do this for both correctness (in case the blocksize | |
| + * of an object has changed) and performance (if the object doesn't exist, don't | |
| + * needlessly try to issue prefetches). We also trim the list as we go through | |
| + * the stream to prevent it from growing to an unbounded size. | |
| + * | |
| + * The object numbers within will always be in sorted order, and any write | |
| + * records we see will also be in sorted order, but they're not sorted with | |
| + * respect to each other (i.e. we can get several object records before | |
| + * receiving each object's write records). As a result, once we've reached a | |
| + * given object number, we can safely remove any reference to lower object | |
| + * numbers in the ignore list. In practice, we receive up to 32 object records | |
| + * before receiving write records, so the list can have up to 32 nodes in it. | |
| + */ | |
| +/* ARGSUSED */ | |
| +static void | |
| +receive_read_prefetch(struct receive_arg *ra, | |
| + uint64_t object, uint64_t offset, uint64_t length) | |
| +{ | |
| + struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); | |
| + while (node != NULL && node->object < object) { | |
| + VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); | |
| + kmem_free(node, sizeof (*node)); | |
| + node = list_head(&ra->ignore_obj_list); | |
| + } | |
| + if (node == NULL || node->object > object) { | |
| + dmu_prefetch(ra->os, object, 1, offset, length, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| + } | |
| +} | |
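Editor's note: the trim-then-test pattern in receive_read_prefetch() above depends on both the ignore list and the incoming write records being sorted by object number. The following standalone sketch (plain userspace C; the skip_node type and function names are illustrative only, not part of this patch) shows the same idea in isolation:

	#include <stdio.h>
	#include <stdlib.h>

	/* Illustrative analogue of receive_ign_obj_node: one skipped object. */
	struct skip_node {
		struct skip_node *next;
		unsigned long long object;
	};

	/*
	 * Return 1 if a prefetch should be issued for 'object'. As a side
	 * effect, drop list entries for lower object numbers: the stream is
	 * sorted, so they can never match again.
	 */
	static int
	should_prefetch(struct skip_node **head, unsigned long long object)
	{
		while (*head != NULL && (*head)->object < object) {
			struct skip_node *dead = *head;
			*head = dead->next;
			free(dead);
		}
		return (*head == NULL || (*head)->object > object);
	}

	int
	main(void)
	{
		struct skip_node *head = NULL, **tail = &head;
		unsigned long long skip[] = { 5, 9 };		/* sorted skip list */
		unsigned long long writes[] = { 4, 5, 9, 12 };	/* sorted stream */

		for (int i = 0; i < 2; i++) {
			struct skip_node *n = malloc(sizeof (*n));
			n->next = NULL;
			n->object = skip[i];
			*tail = n;
			tail = &n->next;
		}
		for (int i = 0; i < 4; i++)
			printf("object %llu: prefetch=%d\n", writes[i],
			    should_prefetch(&head, writes[i]));
		return (0);
	}

Running it prints prefetch=1 for objects 4 and 12 and prefetch=0 for 5 and 9, and the list is empty afterwards, mirroring how the real ignore list shrinks as the stream advances.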
| + | |
| +/* | |
| + * Read records off the stream, issuing any necessary prefetches. | |
| + */ | |
| static int | |
| -restore_process_record(struct restorearg *ra) | |
| +receive_read_record(struct receive_arg *ra) | |
| { | |
| int err; | |
| - switch (ra->drr->drr_type) { | |
| + switch (ra->rrd->header.drr_type) { | |
| case DRR_OBJECT: | |
| { | |
| - struct drr_object *drro = &ra->drr->drr_u.drr_object; | |
| - err = restore_read_payload_and_next_header(ra, | |
| - P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); | |
| - if (err != 0) | |
| + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; | |
| + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); | |
| + void *buf = kmem_zalloc(size, KM_SLEEP); | |
| + dmu_object_info_t doi; | |
| + err = receive_read_payload_and_next_header(ra, size, buf); | |
| + if (err != 0) { | |
| + kmem_free(buf, size); | |
| return (err); | |
| - return (restore_object(ra, drro, ra->buf)); | |
| + } | |
| + err = dmu_object_info(ra->os, drro->drr_object, &doi); | |
| + /* | |
| + * See receive_read_prefetch for an explanation why we're | |
| + * storing this object in the ignore_obj_list. | |
| + */ | |
| + if (err == ENOENT || | |
| + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { | |
| + struct receive_ign_obj_node *node = | |
| + kmem_zalloc(sizeof (*node), | |
| + KM_SLEEP); | |
| + node->object = drro->drr_object; | |
| +#ifdef ZFS_DEBUG | |
| + struct receive_ign_obj_node *last_object = | |
| + list_tail(&ra->ignore_obj_list); | |
| + uint64_t last_objnum = (last_object != NULL ? | |
| + last_object->object : 0); | |
| + ASSERT3U(node->object, >, last_objnum); | |
| +#endif | |
| + list_insert_tail(&ra->ignore_obj_list, node); | |
| + err = 0; | |
| + } | |
| + return (err); | |
| } | |
| case DRR_FREEOBJECTS: | |
| { | |
| - struct drr_freeobjects *drrfo = | |
| - &ra->drr->drr_u.drr_freeobjects; | |
| - err = restore_read_payload_and_next_header(ra, 0, NULL); | |
| - if (err != 0) | |
| - return (err); | |
| - return (restore_freeobjects(ra, drrfo)); | |
| + err = receive_read_payload_and_next_header(ra, 0, NULL); | |
| + return (err); | |
| } | |
| case DRR_WRITE: | |
| { | |
| - struct drr_write *drrw = &ra->drr->drr_u.drr_write; | |
| + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; | |
| arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), | |
| drrw->drr_length); | |
| - err = restore_read_payload_and_next_header(ra, | |
| + err = receive_read_payload_and_next_header(ra, | |
| drrw->drr_length, abuf->b_data); | |
| - if (err != 0) | |
| - return (err); | |
| - err = restore_write(ra, drrw, abuf); | |
| - /* if restore_write() is successful, it consumes the arc_buf */ | |
| - if (err != 0) | |
| + if (err != 0) { | |
| dmu_return_arcbuf(abuf); | |
| + return (err); | |
| + } | |
| + ra->rrd->write_buf = abuf; | |
| + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, | |
| + drrw->drr_length); | |
| return (err); | |
| } | |
| case DRR_WRITE_BYREF: | |
| { | |
| - struct drr_write_byref *drrwbr = | |
| - &ra->drr->drr_u.drr_write_byref; | |
| - err = restore_read_payload_and_next_header(ra, 0, NULL); | |
| - if (err != 0) | |
| - return (err); | |
| - return (restore_write_byref(ra, drrwbr)); | |
| + struct drr_write_byref *drrwb = | |
| + &ra->rrd->header.drr_u.drr_write_byref; | |
| + err = receive_read_payload_and_next_header(ra, 0, NULL); | |
| + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, | |
| + drrwb->drr_length); | |
| + return (err); | |
| } | |
| case DRR_WRITE_EMBEDDED: | |
| { | |
| struct drr_write_embedded *drrwe = | |
| - &ra->drr->drr_u.drr_write_embedded; | |
| - err = restore_read_payload_and_next_header(ra, | |
| - P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); | |
| - if (err != 0) | |
| + &ra->rrd->header.drr_u.drr_write_embedded; | |
| + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); | |
| + void *buf = kmem_zalloc(size, KM_SLEEP); | |
| + | |
| + err = receive_read_payload_and_next_header(ra, size, buf); | |
| + if (err != 0) { | |
| + kmem_free(buf, size); | |
| return (err); | |
| - return (restore_write_embedded(ra, drrwe, ra->buf)); | |
| + } | |
| + | |
| + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, | |
| + drrwe->drr_length); | |
| + return (err); | |
| } | |
| case DRR_FREE: | |
| { | |
| - struct drr_free *drrf = &ra->drr->drr_u.drr_free; | |
| - err = restore_read_payload_and_next_header(ra, 0, NULL); | |
| - if (err != 0) | |
| - return (err); | |
| - return (restore_free(ra, drrf)); | |
| + /* | |
| + * It might be beneficial to prefetch indirect blocks here, but | |
| + * we don't really have the data to decide for sure. | |
| + */ | |
| + err = receive_read_payload_and_next_header(ra, 0, NULL); | |
| + return (err); | |
| } | |
| case DRR_END: | |
| { | |
| - struct drr_end *drre = &ra->drr->drr_u.drr_end; | |
| + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; | |
| if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) | |
| return (SET_ERROR(EINVAL)); | |
| return (0); | |
| } | |
| case DRR_SPILL: | |
| { | |
| - struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; | |
| - err = restore_read_payload_and_next_header(ra, | |
| - drrs->drr_length, ra->buf); | |
| + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; | |
| + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); | |
| + err = receive_read_payload_and_next_header(ra, drrs->drr_length, | |
| + buf); | |
| if (err != 0) | |
| - return (err); | |
| - return (restore_spill(ra, drrs, ra->buf)); | |
| + kmem_free(buf, drrs->drr_length); | |
| + return (err); | |
| } | |
| default: | |
| return (SET_ERROR(EINVAL)); | |
| @@ -1948,6 +2201,118 @@ restore_process_record(struct restorearg *ra) | |
| } | |
| /* | |
| + * Commit the records to the pool. | |
| + */ | |
| +static int | |
| +receive_process_record(struct receive_writer_arg *rwa, | |
| + struct receive_record_arg *rrd) | |
| +{ | |
| + int err; | |
| + | |
| + switch (rrd->header.drr_type) { | |
| + case DRR_OBJECT: | |
| + { | |
| + struct drr_object *drro = &rrd->header.drr_u.drr_object; | |
| + err = receive_object(rwa, drro, rrd->payload); | |
| + kmem_free(rrd->payload, rrd->payload_size); | |
| + rrd->payload = NULL; | |
| + return (err); | |
| + } | |
| + case DRR_FREEOBJECTS: | |
| + { | |
| + struct drr_freeobjects *drrfo = | |
| + &rrd->header.drr_u.drr_freeobjects; | |
| + return (receive_freeobjects(rwa, drrfo)); | |
| + } | |
| + case DRR_WRITE: | |
| + { | |
| + struct drr_write *drrw = &rrd->header.drr_u.drr_write; | |
| + err = receive_write(rwa, drrw, rrd->write_buf); | |
| + /* if receive_write() is successful, it consumes the arc_buf */ | |
| + if (err != 0) | |
| + dmu_return_arcbuf(rrd->write_buf); | |
| + rrd->write_buf = NULL; | |
| + rrd->payload = NULL; | |
| + return (err); | |
| + } | |
| + case DRR_WRITE_BYREF: | |
| + { | |
| + struct drr_write_byref *drrwbr = | |
| + &rrd->header.drr_u.drr_write_byref; | |
| + return (receive_write_byref(rwa, drrwbr)); | |
| + } | |
| + case DRR_WRITE_EMBEDDED: | |
| + { | |
| + struct drr_write_embedded *drrwe = | |
| + &rrd->header.drr_u.drr_write_embedded; | |
| + err = receive_write_embedded(rwa, drrwe, rrd->payload); | |
| + kmem_free(rrd->payload, rrd->payload_size); | |
| + rrd->payload = NULL; | |
| + return (err); | |
| + } | |
| + case DRR_FREE: | |
| + { | |
| + struct drr_free *drrf = &rrd->header.drr_u.drr_free; | |
| + return (receive_free(rwa, drrf)); | |
| + } | |
| + case DRR_SPILL: | |
| + { | |
| + struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; | |
| + err = receive_spill(rwa, drrs, rrd->payload); | |
| + kmem_free(rrd->payload, rrd->payload_size); | |
| + rrd->payload = NULL; | |
| + return (err); | |
| + } | |
| + default: | |
| + return (SET_ERROR(EINVAL)); | |
| + } | |
| +} | |
| + | |
| +/* | |
| + * dmu_recv_stream's worker thread; pull records off the queue, and then call | |
| + * receive_process_record. When we're done, signal the main thread and exit. | |
| + */ | |
| +static void | |
| +receive_writer_thread(void *arg) | |
| +{ | |
| + struct receive_writer_arg *rwa = arg; | |
| + struct receive_record_arg *rrd; | |
| + for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; | |
| + rrd = bqueue_dequeue(&rwa->q)) { | |
| + /* | |
| + * If there's an error, the main thread will stop putting things | |
| + * on the queue, but we need to clear everything in it before we | |
| + * can exit. | |
| + */ | |
| + if (rwa->err == 0) { | |
| + rwa->err = receive_process_record(rwa, rrd); | |
| + } else if (rrd->write_buf != NULL) { | |
| + dmu_return_arcbuf(rrd->write_buf); | |
| + rrd->write_buf = NULL; | |
| + rrd->payload = NULL; | |
| + } else if (rrd->payload != NULL) { | |
| + kmem_free(rrd->payload, rrd->payload_size); | |
| + rrd->payload = NULL; | |
| + } | |
| + kmem_free(rrd, sizeof (*rrd)); | |
| + } | |
| + kmem_free(rrd, sizeof (*rrd)); | |
| + mutex_enter(&rwa->mutex); | |
| + rwa->done = B_TRUE; | |
| + cv_signal(&rwa->cv); | |
| + mutex_exit(&rwa->mutex); | |
| +} | |
| + | |
| +/* | |
| + * Read in the stream's records, one by one, and apply them to the pool. There | |
| + * are two threads involved; the thread that calls this function will spin up a | |
| + * worker thread, read the records off the stream one by one, and issue | |
| + * prefetches for any necessary indirect blocks. It will then push the records | |
| + * onto an internal blocking queue. The worker thread will pull the records off | |
| + * the queue, and actually write the data into the DMU. This way, the worker | |
| + * thread doesn't have to wait for reads to complete, since everything it needs | |
| + * (the indirect blocks) will be prefetched. | |
| + * | |
| * NB: callers *must* call dmu_recv_end() if this succeeds. | |
| */ | |
| int | |
| @@ -1955,17 +2320,16 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, | |
| int cleanup_fd, uint64_t *action_handlep) | |
| { | |
| int err = 0; | |
| - struct restorearg ra = { 0 }; | |
| + struct receive_arg ra = { 0 }; | |
| + struct receive_writer_arg rwa = { 0 }; | |
| int featureflags; | |
| ra.byteswap = drc->drc_byteswap; | |
| ra.cksum = drc->drc_cksum; | |
| ra.vp = vp; | |
| ra.voff = *voffp; | |
| - ra.bufsize = SPA_MAXBLOCKSIZE; | |
| - ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP); | |
| - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); | |
| - ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP); | |
| + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), | |
| + offsetof(struct receive_ign_obj_node, node)); | |
| /* these were verified in dmu_recv_begin */ | |
| ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, | |
| @@ -1996,48 +2360,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, | |
| } | |
| if (*action_handlep == 0) { | |
| - ra.guid_to_ds_map = | |
| + rwa.guid_to_ds_map = | |
| kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); | |
| - avl_create(ra.guid_to_ds_map, guid_compare, | |
| + avl_create(rwa.guid_to_ds_map, guid_compare, | |
| sizeof (guid_map_entry_t), | |
| offsetof(guid_map_entry_t, avlnode)); | |
| err = zfs_onexit_add_cb(minor, | |
| - free_guid_map_onexit, ra.guid_to_ds_map, | |
| + free_guid_map_onexit, rwa.guid_to_ds_map, | |
| action_handlep); | |
| if (ra.err != 0) | |
| goto out; | |
| } else { | |
| err = zfs_onexit_cb_data(minor, *action_handlep, | |
| - (void **)&ra.guid_to_ds_map); | |
| + (void **)&rwa.guid_to_ds_map); | |
| if (ra.err != 0) | |
| goto out; | |
| } | |
| - drc->drc_guid_to_ds_map = ra.guid_to_ds_map; | |
| + drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; | |
| } | |
| - err = restore_read_payload_and_next_header(&ra, 0, NULL); | |
| - if (err != 0) | |
| + err = receive_read_payload_and_next_header(&ra, 0, NULL); | |
| + if (err) | |
| goto out; | |
| - for (;;) { | |
| - void *tmp; | |
| + (void) bqueue_init(&rwa.q, zfs_recv_queue_length, | |
| + offsetof(struct receive_record_arg, node)); | |
| + cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); | |
| + mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); | |
| + rwa.os = ra.os; | |
| + rwa.byteswap = drc->drc_byteswap; | |
| + | |
| + (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, | |
| + TS_RUN, minclsyspri); | |
| + /* | |
| + * We're reading rwa.err without locks, which is safe since we are the | |
| + * only reader, and the worker thread is the only writer. It's ok if we | |
| + * miss a write for an iteration or two of the loop, since the writer | |
| + * thread will keep freeing records we send it until we send it an eos | |
| + * marker. | |
| + * | |
| + * We can leave this loop in 3 ways: First, if rwa.err is | |
| + * non-zero. In that case, the writer thread will free the rrd we just | |
| + * pushed. Second, if we're interrupted; in that case, either it's the | |
| + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd | |
| + * has been handed off to the writer thread who will free it. Finally, | |
| + * if receive_read_record fails or we're at the end of the stream, then | |
| + * we free ra.rrd and exit. | |
| + */ | |
| + while (rwa.err == 0) { | |
| if (issig(JUSTLOOKING) && issig(FORREAL)) { | |
| err = SET_ERROR(EINTR); | |
| break; | |
| } | |
| - tmp = ra.next_drr; | |
| - ra.next_drr = ra.drr; | |
| - ra.drr = tmp; | |
| + ASSERT3P(ra.rrd, ==, NULL); | |
| + ra.rrd = ra.next_rrd; | |
| + ra.next_rrd = NULL; | |
| + /* Allocates and loads header into ra.next_rrd */ | |
| + err = receive_read_record(&ra); | |
| - /* process ra.drr, read in ra.next_drr */ | |
| - err = restore_process_record(&ra); | |
| - if (err != 0) | |
| - break; | |
| - if (ra.drr->drr_type == DRR_END) | |
| + if (ra.rrd->header.drr_type == DRR_END || err != 0) { | |
| + kmem_free(ra.rrd, sizeof (*ra.rrd)); | |
| + ra.rrd = NULL; | |
| break; | |
| + } | |
| + | |
| + bqueue_enqueue(&rwa.q, ra.rrd, | |
| + sizeof (struct receive_record_arg) + ra.rrd->payload_size); | |
| + ra.rrd = NULL; | |
| + } | |
| + if (ra.next_rrd == NULL) | |
| + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); | |
| + ra.next_rrd->eos_marker = B_TRUE; | |
| + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); | |
| + | |
| + mutex_enter(&rwa.mutex); | |
| + while (!rwa.done) { | |
| + cv_wait(&rwa.cv, &rwa.mutex); | |
| } | |
| + mutex_exit(&rwa.mutex); | |
| + | |
| + cv_destroy(&rwa.cv); | |
| + mutex_destroy(&rwa.mutex); | |
| + bqueue_destroy(&rwa.q); | |
| + if (err == 0) | |
| + err = rwa.err; | |
| out: | |
| if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) | |
| @@ -2051,10 +2459,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, | |
| dmu_recv_cleanup_ds(drc); | |
| } | |
| - kmem_free(ra.drr, sizeof (*ra.drr)); | |
| - kmem_free(ra.buf, ra.bufsize); | |
| - kmem_free(ra.next_drr, sizeof (*ra.next_drr)); | |
| *voffp = ra.voff; | |
| + for (struct receive_ign_obj_node *n = | |
| + list_remove_head(&ra.ignore_obj_list); n != NULL; | |
| + n = list_remove_head(&ra.ignore_obj_list)) { | |
| + kmem_free(n, sizeof (*n)); | |
| + } | |
| + list_destroy(&ra.ignore_obj_list); | |
| return (err); | |
| } | |
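Editor's note: to make the receive pipeline described above dmu_recv_stream() concrete, here is a minimal userspace analogue of the reader/writer handoff (a sketch only: it uses pthreads instead of the kernel thread and bqueue primitives, has no bound on queue size, and the struct rec type is invented for illustration). The reader enqueues records and finally an end-of-stream marker; the writer drains the queue until it sees that marker, just as receive_writer_thread() does:

	/* build: cc -pthread sketch.c */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Illustrative record; eos_marker mirrors receive_record_arg's marker. */
	struct rec {
		struct rec *next;
		int eos_marker;
		int payload;
	};

	static struct rec *q_head, *q_tail;
	static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;

	static void
	enqueue(struct rec *r)
	{
		pthread_mutex_lock(&q_lock);
		r->next = NULL;
		if (q_tail == NULL)
			q_head = r;
		else
			q_tail->next = r;
		q_tail = r;
		pthread_cond_signal(&q_cv);
		pthread_mutex_unlock(&q_lock);
	}

	static struct rec *
	dequeue(void)
	{
		pthread_mutex_lock(&q_lock);
		while (q_head == NULL)
			pthread_cond_wait(&q_cv, &q_lock);
		struct rec *r = q_head;
		q_head = r->next;
		if (q_head == NULL)
			q_tail = NULL;
		pthread_mutex_unlock(&q_lock);
		return (r);
	}

	/* Writer: drain the queue until the eos marker arrives, then exit. */
	static void *
	writer(void *arg)
	{
		struct rec *r;
		for (r = dequeue(); !r->eos_marker; r = dequeue()) {
			printf("applying record %d\n", r->payload);
			free(r);
		}
		free(r);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;
		pthread_create(&tid, NULL, writer, NULL);

		/* Reader: push a few records, then the end-of-stream marker. */
		for (int i = 0; i < 3; i++) {
			struct rec *r = calloc(1, sizeof (*r));
			r->payload = i;
			enqueue(r);
		}
		struct rec *eos = calloc(1, sizeof (*eos));
		eos->eos_marker = 1;
		enqueue(eos);

		pthread_join(tid, NULL);
		return (0);
	}

The eos marker is what lets the writer keep freeing whatever is still queued after an error without any extra signalling from the reader.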
| diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c | |
| index 8e49d28..9222b54 100644 | |
| --- a/module/zfs/dmu_traverse.c | |
| +++ b/module/zfs/dmu_traverse.c | |
| @@ -157,7 +157,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, | |
| * If we already visited this bp & everything below, | |
| * don't bother doing it again. | |
| */ | |
| - if (zbookmark_is_before(dnp, zb, td->td_resume)) | |
| + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) | |
| return (RESUME_SKIP_ALL); | |
| /* | |
| @@ -422,6 +422,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, | |
| int j, err = 0; | |
| zbookmark_phys_t czb; | |
| + if (td->td_flags & TRAVERSE_PRE) { | |
| + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, | |
| + ZB_DNODE_BLKID); | |
| + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, | |
| + td->td_arg); | |
| + if (err == TRAVERSE_VISIT_NO_CHILDREN) | |
| + return (0); | |
| + if (err != 0) | |
| + return (err); | |
| + } | |
| + | |
| for (j = 0; j < dnp->dn_nblkptr; j++) { | |
| SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); | |
| err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); | |
| @@ -429,10 +440,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, | |
| break; | |
| } | |
| - if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { | |
| + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { | |
| SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); | |
| err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); | |
| } | |
| + | |
| + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { | |
| + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, | |
| + ZB_DNODE_BLKID); | |
| + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, | |
| + td->td_arg); | |
| + if (err == TRAVERSE_VISIT_NO_CHILDREN) | |
| + return (0); | |
| + if (err != 0) | |
| + return (err); | |
| + } | |
| return (err); | |
| } | |
| @@ -445,6 +467,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | |
| ASSERT(pfd->pd_bytes_fetched >= 0); | |
| + if (bp == NULL) | |
| + return (0); | |
| if (pfd->pd_cancel) | |
| return (SET_ERROR(EINTR)); | |
| diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c | |
| index 6007815..50885ac 100644 | |
| --- a/module/zfs/dmu_tx.c | |
| +++ b/module/zfs/dmu_tx.c | |
| @@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
| dmu_buf_impl_t *db; | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); | |
| + err = dbuf_hold_impl(dn, 0, start, | |
| + FALSE, FALSE, FTAG, &db); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| if (err) { | |
| @@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
| blkoff = P2PHASE(blkid, epb); | |
| tochk = MIN(epb - blkoff, nblks); | |
| - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); | |
| + err = dbuf_hold_impl(dn, 1, blkid >> epbs, | |
| + FALSE, FALSE, FTAG, &dbuf); | |
| if (err) { | |
| txh->txh_tx->tx_err = err; | |
| break; | |
| diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c | |
| index 3cc3d67..70930ae 100644 | |
| --- a/module/zfs/dmu_zfetch.c | |
| +++ b/module/zfs/dmu_zfetch.c | |
| @@ -291,7 +291,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) | |
| fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); | |
| for (i = 0; i < fetchsz; i++) { | |
| - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); | |
| + dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, | |
| + ARC_FLAG_PREFETCH); | |
| } | |
| return (fetchsz); | |
| diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c | |
| index 397cc5c..17edd95 100644 | |
| --- a/module/zfs/dnode.c | |
| +++ b/module/zfs/dnode.c | |
| @@ -1111,7 +1111,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, | |
| drop_struct_lock = TRUE; | |
| } | |
| - blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); | |
| + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); | |
| db = dbuf_hold(mdn, blk, FTAG); | |
| if (drop_struct_lock) | |
| @@ -1408,7 +1408,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) | |
| goto fail; | |
| /* resize the old block */ | |
| - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); | |
| + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); | |
| if (err == 0) | |
| dbuf_new_size(db, size, tx); | |
| else if (err != ENOENT) | |
| @@ -1581,8 +1581,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) | |
| ASSERT3U(blkoff + head, ==, blksz); | |
| if (len < head) | |
| head = len; | |
| - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, | |
| - FTAG, &db) == 0) { | |
| + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), | |
| + TRUE, FALSE, FTAG, &db) == 0) { | |
| caddr_t data; | |
| /* don't dirty if it isn't on disk and isn't dirty */ | |
| @@ -1619,8 +1619,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) | |
| if (tail) { | |
| if (len < tail) | |
| tail = len; | |
| - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), | |
| - TRUE, FTAG, &db) == 0) { | |
| + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), | |
| + TRUE, FALSE, FTAG, &db) == 0) { | |
| /* don't dirty if not on disk and not dirty */ | |
| if (db->db_last_dirty || | |
| (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { | |
| @@ -1849,7 +1849,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) | |
| */ | |
| static int | |
| dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
| - int lvl, uint64_t blkfill, uint64_t txg) | |
| + int lvl, uint64_t blkfill, uint64_t txg) | |
| { | |
| dmu_buf_impl_t *db = NULL; | |
| void *data = NULL; | |
| @@ -1871,8 +1871,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
| epb = dn->dn_phys->dn_nblkptr; | |
| data = dn->dn_phys->dn_blkptr; | |
| } else { | |
| - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); | |
| - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); | |
| + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); | |
| + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); | |
| if (error) { | |
| if (error != ENOENT) | |
| return (error); | |
| diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c | |
| index 0633604..0787885 100644 | |
| --- a/module/zfs/dnode_sync.c | |
| +++ b/module/zfs/dnode_sync.c | |
| @@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| err = dbuf_hold_impl(dn, db->db_level-1, | |
| - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); | |
| + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| if (err == ENOENT) | |
| continue; | |
| @@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, | |
| continue; | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, | |
| - i, B_TRUE, FTAG, &subdb)); | |
| + i, TRUE, FALSE, FTAG, &subdb)); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| ASSERT3P(bp, ==, subdb->db_blkptr); | |
| @@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, | |
| continue; | |
| rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
| VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, | |
| - TRUE, FTAG, &db)); | |
| + TRUE, FALSE, FTAG, &db)); | |
| rw_exit(&dn->dn_struct_rwlock); | |
| free_children(db, blkid, nblks, tx); | |
| diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c | |
| index a11926b..8f419d6 100644 | |
| --- a/module/zfs/dsl_dataset.c | |
| +++ b/module/zfs/dsl_dataset.c | |
| @@ -530,6 +530,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, | |
| const char *snapname; | |
| uint64_t obj; | |
| int err = 0; | |
| + dsl_dataset_t *ds; | |
| err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); | |
| if (err != 0) | |
| @@ -538,36 +539,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, | |
| ASSERT(dsl_pool_config_held(dp)); | |
| obj = dsl_dir_phys(dd)->dd_head_dataset_obj; | |
| if (obj != 0) | |
| - err = dsl_dataset_hold_obj(dp, obj, tag, dsp); | |
| + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); | |
| else | |
| err = SET_ERROR(ENOENT); | |
| /* we may be looking for a snapshot */ | |
| if (err == 0 && snapname != NULL) { | |
| - dsl_dataset_t *ds; | |
| + dsl_dataset_t *snap_ds; | |
| if (*snapname++ != '@') { | |
| - dsl_dataset_rele(*dsp, tag); | |
| + dsl_dataset_rele(ds, tag); | |
| dsl_dir_rele(dd, FTAG); | |
| return (SET_ERROR(ENOENT)); | |
| } | |
| dprintf("looking for snapshot '%s'\n", snapname); | |
| - err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); | |
| + err = dsl_dataset_snap_lookup(ds, snapname, &obj); | |
| if (err == 0) | |
| - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); | |
| - dsl_dataset_rele(*dsp, tag); | |
| + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); | |
| + dsl_dataset_rele(ds, tag); | |
| if (err == 0) { | |
| - mutex_enter(&ds->ds_lock); | |
| - if (ds->ds_snapname[0] == 0) | |
| - (void) strlcpy(ds->ds_snapname, snapname, | |
| - sizeof (ds->ds_snapname)); | |
| - mutex_exit(&ds->ds_lock); | |
| - *dsp = ds; | |
| + mutex_enter(&snap_ds->ds_lock); | |
| + if (snap_ds->ds_snapname[0] == 0) | |
| + (void) strlcpy(snap_ds->ds_snapname, snapname, | |
| + sizeof (snap_ds->ds_snapname)); | |
| + mutex_exit(&snap_ds->ds_lock); | |
| + ds = snap_ds; | |
| } | |
| } | |
| - | |
| + if (err == 0) | |
| + *dsp = ds; | |
| dsl_dir_rele(dd, FTAG); | |
| return (err); | |
| } | |
| diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c | |
| index 7f90469..40e34cb 100644 | |
| --- a/module/zfs/dsl_destroy.c | |
| +++ b/module/zfs/dsl_destroy.c | |
| @@ -552,7 +552,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| struct killarg *ka = arg; | |
| dmu_tx_t *tx = ka->tx; | |
| - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| return (0); | |
| if (zb->zb_level == ZB_ZIL_LEVEL) { | |
| diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c | |
| index 0894ebd..ba397b1 100644 | |
| --- a/module/zfs/dsl_scan.c | |
| +++ b/module/zfs/dsl_scan.c | |
| @@ -575,7 +575,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, | |
| * If we already visited this bp & everything below (in | |
| * a prior txg sync), don't bother doing it again. | |
| */ | |
| - if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) | |
| + if (zbookmark_subtree_completed(dnp, zb, | |
| + &scn->scn_phys.scn_bookmark)) | |
| return (B_TRUE); | |
| /* | |
| diff --git a/module/zfs/spa.c b/module/zfs/spa.c | |
| index e9d2432..5e664d7 100644 | |
| --- a/module/zfs/spa.c | |
| +++ b/module/zfs/spa.c | |
| @@ -1874,7 +1874,7 @@ static int | |
| spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) | |
| { | |
| - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | |
| return (0); | |
| /* | |
| * Note: normally this routine will not be called if | |
| diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c | |
| index 94790b9..e0fe6ac 100644 | |
| --- a/module/zfs/space_map.c | |
| +++ b/module/zfs/space_map.c | |
| @@ -76,8 +76,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) | |
| mutex_exit(sm->sm_lock); | |
| if (end > bufsize) { | |
| - dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, | |
| - end - bufsize); | |
| + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, | |
| + end - bufsize, ZIO_PRIORITY_SYNC_READ); | |
| } | |
| mutex_enter(sm->sm_lock); | |
| diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h | |
| new file mode 100644 | |
| index 0000000..63722df | |
| --- /dev/null | |
| +++ b/include/sys/bqueue.h | |
| @@ -0,0 +1,54 @@ | |
| +/* | |
| + * CDDL HEADER START | |
| + * | |
| + * This file and its contents are supplied under the terms of the | |
| + * Common Development and Distribution License ("CDDL"), version 1.0. | |
| + * You may only use this file in accordance with the terms of version | |
| + * 1.0 of the CDDL. | |
| + * | |
| + * A full copy of the text of the CDDL should have accompanied this | |
| + * source. A copy of the CDDL is also available via the Internet at | |
| + * http://www.illumos.org/license/CDDL. | |
| + * | |
| + * CDDL HEADER END | |
| + */ | |
| +/* | |
| + * Copyright (c) 2014 by Delphix. All rights reserved. | |
| + */ | |
| + | |
| +#ifndef _BQUEUE_H | |
| +#define _BQUEUE_H | |
| + | |
| +#ifdef __cplusplus | |
| +extern "C" { | |
| +#endif | |
| + | |
| +#include <sys/zfs_context.h> | |
| + | |
| +typedef struct bqueue { | |
| + list_t bq_list; | |
| + kmutex_t bq_lock; | |
| + kcondvar_t bq_add_cv; | |
| + kcondvar_t bq_pop_cv; | |
| + uint64_t bq_size; | |
| + uint64_t bq_maxsize; | |
| + size_t bq_node_offset; | |
| +} bqueue_t; | |
| + | |
| +typedef struct bqueue_node { | |
| + list_node_t bqn_node; | |
| + uint64_t bqn_size; | |
| +} bqueue_node_t; | |
| + | |
| + | |
| +int bqueue_init(bqueue_t *, uint64_t, size_t); | |
| +void bqueue_destroy(bqueue_t *); | |
| +void bqueue_enqueue(bqueue_t *, void *, uint64_t); | |
| +void *bqueue_dequeue(bqueue_t *); | |
| +boolean_t bqueue_empty(bqueue_t *); | |
| + | |
| +#ifdef __cplusplus | |
| +} | |
| +#endif | |
| + | |
| +#endif /* _BQUEUE_H */ | |
| diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h | |
| index 2e07185..482ccb0 100644 | |
| --- a/include/sys/dbuf.h | |
| +++ b/include/sys/dbuf.h | |
| @@ -245,8 +245,7 @@ typedef struct dbuf_hash_table { | |
| kmutex_t hash_mutexes[DBUF_MUTEXES]; | |
| } dbuf_hash_table_t; | |
| - | |
| -uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); | |
| +uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); | |
| dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); | |
| void dbuf_create_bonus(struct dnode *dn); | |
| @@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); | |
| dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); | |
| dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, | |
| void *tag); | |
| -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, | |
| +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, | |
| + boolean_t fail_sparse, boolean_t fail_uncached, | |
| void *tag, dmu_buf_impl_t **dbp); | |
| -void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); | |
| +void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, | |
| + zio_priority_t prio, arc_flags_t aflags); | |
| void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); | |
| boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, | |
| diff --git a/include/sys/dmu.h b/include/sys/dmu.h | |
| index 9864129..29386be 100644 | |
| --- a/include/sys/dmu.h | |
| +++ b/include/sys/dmu.h | |
| @@ -46,6 +46,7 @@ | |
| #include <sys/inttypes.h> | |
| #include <sys/cred.h> | |
| #include <sys/fs/zfs.h> | |
| +#include <sys/zio_priority.h> | |
| #ifdef __cplusplus | |
| extern "C" { | |
| @@ -739,8 +740,8 @@ extern int zfs_max_recordsize; | |
| /* | |
| * Asynchronously try to read in the data. | |
| */ | |
| -void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, | |
| - uint64_t len); | |
| +void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, | |
| + uint64_t len, enum zio_priority pri); | |
| typedef struct dmu_object_info { | |
| /* All sizes are in bytes unless otherwise indicated. */ | |
| diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h | |
| index 7d490ec..81757b5 100644 | |
| --- a/include/sys/dsl_dataset.h | |
| +++ b/include/sys/dsl_dataset.h | |
| @@ -20,7 +20,7 @@ | |
| */ | |
| /* | |
| * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
| - * Copyright (c) 2013 by Delphix. All rights reserved. | |
| + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. | |
| * Copyright (c) 2013, Joyent, Inc. All rights reserved. | |
| * Copyright (c) 2013 Steven Hartland. All rights reserved. | |
| * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | |
| diff --git a/include/sys/zio.h b/include/sys/zio.h | |
| index 198dc92..d160c94 100644 | |
| --- a/include/sys/zio.h | |
| +++ b/include/sys/zio.h | |
| @@ -30,6 +30,7 @@ | |
| #ifndef _ZIO_H | |
| #define _ZIO_H | |
| +#include <sys/zio_priority.h> | |
| #include <sys/zfs_context.h> | |
| #include <sys/spa.h> | |
| #include <sys/txg.h> | |
| @@ -144,17 +145,6 @@ enum zio_compress { | |
| #define ZIO_FAILURE_MODE_CONTINUE 1 | |
| #define ZIO_FAILURE_MODE_PANIC 2 | |
| -typedef enum zio_priority { | |
| - ZIO_PRIORITY_SYNC_READ, | |
| - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ | |
| - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ | |
| - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ | |
| - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ | |
| - ZIO_PRIORITY_NUM_QUEUEABLE, | |
| - | |
| - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ | |
| -} zio_priority_t; | |
| - | |
| enum zio_flag { | |
| /* | |
| * Flags inherited by gang, ddt, and vdev children, | |
| @@ -258,6 +248,7 @@ extern const char *zio_type_name[ZIO_TYPES]; | |
| * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. | |
| * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. | |
| * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. | |
| + * dnode visit bookmarks are <objset, object id of dnode, -3, 0>. | |
| * | |
| * Note: this structure is called a bookmark because its original purpose | |
| * was to remember where to resume a pool-wide traverse. | |
| @@ -290,6 +281,9 @@ typedef struct zbookmark_phys { | |
| #define ZB_ZIL_OBJECT (0ULL) | |
| #define ZB_ZIL_LEVEL (-2LL) | |
| +#define ZB_DNODE_LEVEL (-3LL) | |
| +#define ZB_DNODE_BLKID (0ULL) | |
| + | |
| #define ZB_IS_ZERO(zb) \ | |
| ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ | |
| (zb)->zb_level == 0 && (zb)->zb_blkid == 0) | |
| @@ -592,8 +586,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, | |
| extern void spa_handle_ignored_writes(spa_t *spa); | |
| /* zbookmark_phys functions */ | |
| -boolean_t zbookmark_is_before(const struct dnode_phys *dnp, | |
| - const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); | |
| +boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, | |
| + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); | |
| +int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, | |
| + uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); | |
| #ifdef __cplusplus | |
| } | |
| diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h | |
| index a921a2f..0c293ab 100644 | |
| --- a/include/sys/zio_checksum.h | |
| +++ b/include/sys/zio_checksum.h | |
| @@ -44,7 +44,7 @@ typedef struct zio_checksum_info { | |
| zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ | |
| int ci_correctable; /* number of correctable bits */ | |
| int ci_eck; /* uses zio embedded checksum? */ | |
| - int ci_dedup; /* strong enough for dedup? */ | |
| + boolean_t ci_dedup; /* strong enough for dedup? */ | |
| char *ci_name; /* descriptive name */ | |
| } zio_checksum_info_t; | |
| diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h | |
| new file mode 100644 | |
| index 0000000..e33b958 | |
| --- /dev/null | |
| +++ b/include/sys/zio_priority.h | |
| @@ -0,0 +1,40 @@ | |
| +/* | |
| + * CDDL HEADER START | |
| + * | |
| + * This file and its contents are supplied under the terms of the | |
| + * Common Development and Distribution License ("CDDL"), version 1.0. | |
| + * You may only use this file in accordance with the terms of version | |
| + * 1.0 of the CDDL. | |
| + * | |
| + * A full copy of the text of the CDDL should have accompanied this | |
| + * source. A copy of the CDDL is also available via the Internet at | |
| + * http://www.illumos.org/license/CDDL. | |
| + * | |
| + * CDDL HEADER END | |
| + */ | |
| +/* | |
| + * Copyright (c) 2014 by Delphix. All rights reserved. | |
| + */ | |
| +#ifndef _ZIO_PRIORITY_H | |
| +#define _ZIO_PRIORITY_H | |
| + | |
| +#ifdef __cplusplus | |
| +extern "C" { | |
| +#endif | |
| + | |
| +typedef enum zio_priority { | |
| + ZIO_PRIORITY_SYNC_READ, | |
| + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ | |
| + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ | |
| + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ | |
| + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ | |
| + ZIO_PRIORITY_NUM_QUEUEABLE, | |
| + | |
| + ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ | |
| +} zio_priority_t; | |
| + | |
| +#ifdef __cplusplus | |
| +} | |
| +#endif | |
| + | |
| +#endif /* _ZIO_PRIORITY_H */ | |
| diff --git a/module/zfs/zap.c b/module/zfs/zap.c | |
| index 20f3496..2ea6b12 100644 | |
| --- a/module/zfs/zap.c | |
| +++ b/module/zfs/zap.c | |
| @@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, | |
| newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); | |
| tbl->zt_nextblk = newblk; | |
| ASSERT0(tbl->zt_blks_copied); | |
| - dmu_prefetch(zap->zap_objset, zap->zap_object, | |
| - tbl->zt_blk << bs, tbl->zt_numblks << bs); | |
| + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, | |
| + tbl->zt_blk << bs, tbl->zt_numblks << bs, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| } | |
| /* | |
| @@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn) | |
| if (zap_idx_to_blk(zap, idx, &blk) != 0) | |
| return; | |
| bs = FZAP_BLOCK_SHIFT(zap); | |
| - dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); | |
| + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| } | |
| /* | |
| @@ -1285,9 +1287,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) | |
| } else { | |
| int b; | |
| - dmu_prefetch(zap->zap_objset, zap->zap_object, | |
| + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, | |
| zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, | |
| - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); | |
| + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; | |
| b++) { | |
| diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c | |
| index 8cf83b3..3559b9b 100644 | |
| --- a/module/zfs/zfs_vfsops.c | |
| +++ b/module/zfs/zfs_vfsops.c | |
| @@ -20,7 +20,7 @@ | |
| */ | |
| /* | |
| * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
| - * Copyright (c) 2013 by Delphix. All rights reserved. | |
| + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. | |
| */ | |
| /* Portions Copyright 2010 Robert Milkowski */ | |
| @@ -949,7 +949,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) | |
| error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, | |
| &sa_obj); | |
| if (error) | |
| - return (error); | |
| + goto out; | |
| } else { | |
| /* | |
| * Pre SA versions file systems should never touch | |
| diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c | |
| index bf75097..fe740a5 100644 | |
| --- a/module/zfs/zfs_vnops.c | |
| +++ b/module/zfs/zfs_vnops.c | |
| @@ -2404,7 +2404,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, | |
| /* Prefetch znode */ | |
| if (prefetch) | |
| - dmu_prefetch(os, objnum, 0, 0); | |
| + dmu_prefetch(os, objnum, 0, 0, 0, | |
| + ZIO_PRIORITY_SYNC_READ); | |
| skip_entry: | |
| /* | |
| diff --git a/module/zfs/zio.c b/module/zfs/zio.c | |
| index aa0f294..f92ecb4 100644 | |
| --- a/module/zfs/zio.c | |
| +++ b/module/zfs/zio.c | |
| @@ -67,6 +67,9 @@ extern vmem_t *zio_alloc_arena; | |
| #define ZIO_PIPELINE_CONTINUE 0x100 | |
| #define ZIO_PIPELINE_STOP 0x101 | |
| +#define BP_SPANB(indblkshift, level) \ | |
| + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) | |
| +#define COMPARE_META_LEVEL 0x80000000ul | |
| /* | |
| * The following actions directly effect the spa's sync-to-convergence logic. | |
| * The values below define the sync pass when we start performing the action. | |
| @@ -3316,37 +3319,127 @@ static zio_pipe_stage_t *zio_pipeline[] = { | |
| zio_done | |
| }; | |
| -/* dnp is the dnode for zb1->zb_object */ | |
| -boolean_t | |
| -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, | |
| - const zbookmark_phys_t *zb2) | |
| -{ | |
| - uint64_t zb1nextL0, zb2thisobj; | |
| - ASSERT(zb1->zb_objset == zb2->zb_objset); | |
| - ASSERT(zb2->zb_level == 0); | |
| - /* The objset_phys_t isn't before anything. */ | |
| - if (dnp == NULL) | |
| - return (B_FALSE); | |
| - zb1nextL0 = (zb1->zb_blkid + 1) << | |
| - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); | |
| +/* | |
| + * Compare two zbookmark_phys_t's to see which we would reach first in a | |
| + * pre-order traversal of the object tree. | |
| + * | |
| + * This is simple in every case aside from the meta-dnode object. For all other | |
| + * objects, we traverse them in order (object 1 before object 2, and so on). | |
| + * However, all of these objects are traversed while traversing object 0, since | |
| + * the data it points to is the list of objects. Thus, we need to convert to a | |
| + * canonical representation so we can compare meta-dnode bookmarks to | |
| + * non-meta-dnode bookmarks. | |
| + * | |
| + * We do this by calculating "equivalents" for each field of the zbookmark. | |
| + * zbookmarks outside of the meta-dnode use their own object and level, and | |
| + * calculate the level 0 equivalent (the first L0 blkid that is contained in the | |
| + * blocks this bookmark refers to) by multiplying their blkid by their span | |
| + * (the number of L0 blocks contained within one block at their level). | |
| + * zbookmarks inside the meta-dnode calculate their object equivalent | |
| + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use | |
| + * level + 1<<31 (any value larger than a level could ever be) for their level. | |
| + * This causes them to always compare before a bookmark in their object | |
| + * equivalent, compare appropriately to bookmarks in other objects, and to | |
| + * compare appropriately to other bookmarks in the meta-dnode. | |
| + */ | |
| +int | |
| +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, | |
| + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) | |
| +{ | |
| + /* | |
| + * These variables represent the "equivalent" values for the zbookmark, | |
| + * after converting zbookmarks inside the meta dnode to their | |
| + * normal-object equivalents. | |
| + */ | |
| + uint64_t zb1obj, zb2obj; | |
| + uint64_t zb1L0, zb2L0; | |
| + uint64_t zb1level, zb2level; | |
| - zb2thisobj = zb2->zb_object ? zb2->zb_object : | |
| - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); | |
| + if (zb1->zb_object == zb2->zb_object && | |
| + zb1->zb_level == zb2->zb_level && | |
| + zb1->zb_blkid == zb2->zb_blkid) | |
| + return (0); | |
| + | |
| + /* | |
| + * BP_SPANB calculates the span in blocks. | |
| + */ | |
| + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); | |
| + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); | |
| if (zb1->zb_object == DMU_META_DNODE_OBJECT) { | |
| - uint64_t nextobj = zb1nextL0 * | |
| - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; | |
| - return (nextobj <= zb2thisobj); | |
| + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); | |
| + zb1L0 = 0; | |
| + zb1level = zb1->zb_level + COMPARE_META_LEVEL; | |
| + } else { | |
| + zb1obj = zb1->zb_object; | |
| + zb1level = zb1->zb_level; | |
| } | |
| - if (zb1->zb_object < zb2thisobj) | |
| - return (B_TRUE); | |
| - if (zb1->zb_object > zb2thisobj) | |
| - return (B_FALSE); | |
| - if (zb2->zb_object == DMU_META_DNODE_OBJECT) | |
| + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { | |
| + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); | |
| + zb2L0 = 0; | |
| + zb2level = zb2->zb_level + COMPARE_META_LEVEL; | |
| + } else { | |
| + zb2obj = zb2->zb_object; | |
| + zb2level = zb2->zb_level; | |
| + } | |
| + | |
| + /* Now that we have a canonical representation, do the comparison. */ | |
| + if (zb1obj != zb2obj) | |
| + return (zb1obj < zb2obj ? -1 : 1); | |
| + else if (zb1L0 != zb2L0) | |
| + return (zb1L0 < zb2L0 ? -1 : 1); | |
| + else if (zb1level != zb2level) | |
| + return (zb1level > zb2level ? -1 : 1); | |
| + /* | |
| + * This can (theoretically) happen if the bookmarks have the same object | |
| + * and level, but different blkids, if the block sizes are not the same. | |
| + * There is presently no way to change the indirect block sizes | |
| + */ | |
| + return (0); | |
| +} | |
| + | |
| +/* | |
| + * This function checks the following: given that last_block is the place that | |
| + * our traversal stopped last time, does that guarantee that we've visited | |
| + * every node under subtree_root? Therefore, we can't just use the raw output | |
| + * of zbookmark_compare. We have to pass in a modified version of | |
| + * subtree_root; by incrementing the block id, and then checking whether | |
| + * last_block is before or equal to that, we can tell whether or not having | |
| + * visited last_block implies that all of subtree_root's children have been | |
| + * visited. | |
| + */ | |
| +boolean_t | |
| +zbookmark_subtree_completed(const dnode_phys_t *dnp, | |
| + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) | |
| +{ | |
| + zbookmark_phys_t mod_zb = *subtree_root; | |
| + mod_zb.zb_blkid++; | |
| + ASSERT(last_block->zb_level == 0); | |
| + | |
| + /* The objset_phys_t isn't before anything. */ | |
| + if (dnp == NULL) | |
| return (B_FALSE); | |
| - return (zb1nextL0 <= zb2->zb_blkid); | |
| + | |
| + /* | |
| + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the | |
| + * data block size in sectors, because that variable is only used if | |
| + * the bookmark refers to a block in the meta-dnode. Since we don't | |
| + * know without examining it what object it refers to, and there's no | |
| + * harm in passing in this value in other cases, we always pass it in. | |
| + * | |
| + * We pass in 0 for the indirect block size shift because zb2 must be | |
| + * level 0. The indirect block size is only used to calculate the span | |
| + * of the bookmark, but since the bookmark must be level 0, the span is | |
| + * always 1, so the math works out. | |
| + * | |
| + * If you make changes to how the zbookmark_compare code works, be sure | |
| + * to make sure that this code still works afterwards. | |
| + */ | |
| + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, | |
| + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, | |
| + last_block) <= 0); | |
| } | |
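
The comment block introducing zbookmark_compare above is easier to follow with concrete numbers. The sketch below reimplements the canonical-representation idea as a standalone program so it can be compiled and run outside the ZFS tree; the struct, the constants, and the geometry chosen in main() (128K data and indirect blocks, 512-byte dnodes, hence 256 dnodes per meta-dnode block) are local stand-ins and assumptions for illustration, not the ZFS definitions. It shows a meta-dnode level-0 block sorting between the objects whose dnodes it holds, and a level-1 block sorting before its first level-0 child, which is the pre-order property zbookmark_subtree_completed relies on when it increments the subtree root's blkid and compares the result against the last visited block.

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins so this sketch compiles on its own (not the ZFS headers). */
#define SPA_MINBLOCKSHIFT	9	/* 512-byte sectors */
#define SPA_BLKPTRSHIFT		7	/* 128-byte block pointers */
#define DNODE_SHIFT		9	/* 512-byte dnodes */
#define DMU_META_DNODE_OBJECT	0
#define COMPARE_META_LEVEL	0x80000000ul
#define BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))

/* Simplified zbookmark: just the fields the comparison uses. */
typedef struct zb {
	uint64_t zb_object;
	int64_t zb_level;
	uint64_t zb_blkid;
} zb_t;

/*
 * Same shape as zbookmark_compare() in the hunk: convert each bookmark to
 * (object, L0-equivalent blkid, level) and compare lexicographically.
 * dbss = data block size in 512-byte sectors, ibs = indirect block shift.
 */
static int
zb_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
    const zb_t *zb1, const zb_t *zb2)
{
	uint64_t zb1obj, zb2obj, zb1L0, zb2L0, zb1level, zb2level;

	if (zb1->zb_object == zb2->zb_object &&
	    zb1->zb_level == zb2->zb_level &&
	    zb1->zb_blkid == zb2->zb_blkid)
		return (0);

	/* First L0 blkid covered by each bookmark. */
	zb1L0 = zb1->zb_blkid * BP_SPANB(ibs1, zb1->zb_level);
	zb2L0 = zb2->zb_blkid * BP_SPANB(ibs2, zb2->zb_level);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		/* Object equivalent: first dnode held by this block. */
		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb1L0 = 0;
		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
	} else {
		zb1obj = zb1->zb_object;
		zb1level = zb1->zb_level;
	}
	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb2L0 = 0;
		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
	} else {
		zb2obj = zb2->zb_object;
		zb2level = zb2->zb_level;
	}

	if (zb1obj != zb2obj)
		return (zb1obj < zb2obj ? -1 : 1);
	if (zb1L0 != zb2L0)
		return (zb1L0 < zb2L0 ? -1 : 1);
	if (zb1level != zb2level)
		return (zb1level > zb2level ? -1 : 1);
	return (0);
}

int
main(void)
{
	/* Assumed geometry: 128K data blocks (256 sectors), ibs = 17. */
	uint16_t dbss = 256;
	uint8_t ibs = 17;
	zb_t meta = { DMU_META_DNODE_OBJECT, 0, 1 };	/* dnodes 256..511 */
	zb_t obj10 = { 10, 0, 0 };
	zb_t obj300 = { 300, 0, 0 };
	zb_t parent = { 300, 1, 0 };	/* spans L0 blkids 0..1023 */
	zb_t child = { 300, 0, 0 };

	printf("meta block vs object 10:  %d\n",
	    zb_compare(dbss, ibs, dbss, ibs, &meta, &obj10));
	printf("meta block vs object 300: %d\n",
	    zb_compare(dbss, ibs, dbss, ibs, &meta, &obj300));
	printf("L1 parent vs first L0 child: %d\n",
	    zb_compare(dbss, ibs, dbss, ibs, &parent, &child));
	return (0);
}

Compiled and run, this prints 1, -1, and -1: the meta-dnode block holding dnodes 256 through 511 sorts after everything in object 10 but before object 300's data, and a level-1 block sorts before its first level-0 child, exactly the pre-order traversal order the comment describes.
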
| diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c | |
| index 8088810..101566a 100644 | |
| --- a/module/zfs/zvol.c | |
| +++ b/module/zfs/zvol.c | |
| @@ -255,7 +255,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
| zvol_extent_t *ze; | |
| int bs = ma->ma_zv->zv_volblocksize; | |
| - if (BP_IS_HOLE(bp) || | |
| + if (bp == NULL || BP_IS_HOLE(bp) || | |
| zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) | |
| return (0); | |