Created
January 30, 2024 18:23
-
-
Save cbodley/141b3e77af295e300b7f8aa6725bf88f to your computer and use it in GitHub Desktop.
pseudo-design for data change set
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <set> | |
#include <string> | |
#include <vector> | |
#include "common/ceph_time.h" | |
namespace data_changes { | |
// A data change event signals the presence of new entries in a bucket index log | |
// (bilog) shard. Peer zones must process that log shard at some point after | |
// those entries are made visible to RGWOp_BILog_List requests. | |
// | |
// This ordering requirement is critical to providing reliable replication, but | |
// creates a challenge for crash consistency: | |
// * If we write to the datalog before the bilog, peer zones may poll for those | |
// entries before they're visible. | |
// * If we write to the datalog after the bilog, we may crash before writing to | |
// the datalog. | |
// In both cases, peer zones will miss the updates and fail to replicate the | |
// changes. | |
// | |
// To address both, we'll record the datalog event before the bilog entry, but | |
// include an expiration time sufficiently far in the future to cover both. Peer | |
// zones must not retire a data change event until they've processed it after | |
// that expiration time. | |
struct Event { | |
std::string key; | |
ceph::real_time at; | |
}; | |
inline constexpr auto event_expires_after = std::chrono::seconds(60); | |
namespace cls { | |
struct Entry { | |
std::string key; // {bucket-instance-id}:{shard-id}:{gen} | |
// Instead of tracking the set of all unapplied events {t1,t2,...}, use the | |
// half-open interval [at, expiry) as a coarse-grained representation. | |
ceph::real_time at; // timestamp of oldest change not applied | |
ceph::real_time expiry; // timestamp when the entry expires | |
}; | |
struct Index { | |
// Track the set of unapplied events. | |
std::set<Entry, EventKeyLess> entries_by_key; // sort by key for add/trim | |
std::set<Entry, EventAtLess> entries_by_time; // sort by time for list | |
}; | |
// Add a change entry to the index or extend the expiry of an existing one. | |
int add(Index& index, std::string key, | |
ceph::real_time at, ceph::real_time expiry); | |
// Update the timestamps for applied changes, removing when `at >= expiry`. | |
int trim(Index& index, const std::vector<Event>& updates); | |
// Return the list of keys for the oldest N changes. | |
int list(const Index& index, uint32_t max_entries, | |
std::vector<std::string>& changed_keys); | |
} // namespace cls | |
// Client-side Writer interface. | |
class Writer { | |
public: | |
// Write a data change event unless this key has been written recently. | |
int add(std::string key, ceph::real_time at); | |
private: | |
// Because data change events have an expiration time in the future, we can | |
// choose to omit repeated events on the same key unless they would extend | |
// the expiration time by a significant amount (more than some small fraction | |
// of the expiration window). | |
static constexpr auto cache_expires_after = std::chrono::seconds(10); | |
// Track the set of events that are more recent than `cache_expires_after`. | |
std::set<Event, EventKeyLess> entries_by_key; // sort by key for updates | |
std::set<Event, EventAtLess> entries_by_time; // sort by time for expiration | |
}; | |
} // namespace data_changes |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment