Last active
February 2, 2024 16:18
-
-
Save dinvlad/a280c5a44165b24960b3442e5205ab30 to your computer and use it in GitHub Desktop.
Retries with exponential backoff and jitter for idempotent background Google Cloud Functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// One can also use the Google Cloud Firestore library here,
// with a slight change in semantics.
import { firestore } from 'firebase-admin';
import { EventContext, runWith } from 'firebase-functions';
import { inspect, promisify } from 'util';
/** Name of the Firestore collection that holds one record per in-flight or failed event. */
const eventCollection = 'function-events';
/** Processing state stored in the `status` field of an event record. */
enum EventStatus {
  /** A handler invocation has claimed the event and has not recorded an outcome yet. */
  RUNNING = 'running',
  /** The last handler invocation threw; the event is eligible for another attempt. */
  FAILED = 'failed',
}
/** Upper bound (ms) of the jittered delay before the first retry. */
export const baseRetryDelayMs = 1000;
/** Multiplier applied to the delay bound for every further attempt. */
export const retryDelayFactor = 2;
/** Events older than this (ms, measured from the event timestamp) are dropped instead of retried. */
export const maxRetryAgeMs = 2 * 60 * 60 * 1000;
/** Timeout (s) assumed for the wrapped function; the retry delay never exceeds the time left in it. */
export const functionTimeoutSec = 60;
// Promise-based sleep used for both the write pacing and the retry backoff.
const pause = promisify(setTimeout);

// Renders any thrown value (Error or not) as a string that Firestore can store.
const describeFailure = (thrown: unknown): string => {
  if (thrown instanceof Error) {
    return thrown.stack || `${thrown.name}: ${thrown.message}`;
  }
  return `Non-Error value thrown: ${inspect(thrown)}`;
};

/**
 * Wraps a background function handler to enable automatic retries with
 * exponential backoff and jitter
 * (https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter).
 *
 * The handler **must** be idempotent (safe to retry). Additionally, you must
 * ensure that the wrapped function is well-tested, and manually enable
 * retries for it. Please see
 * https://cloud.google.com/functions/docs/bestpractices/retries
 * for more details.
 *
 * The wrapper also prevents double-firing of a function for the same event,
 * which can happen with background functions because they don't guarantee
 * once-only delivery.
 *
 * Firestore is used as a transactional store for the events. A record is
 * deleted once the handler succeeds, so the collection also serves as a
 * journal of failures.
 *
 * NOTE(review): if the instance dies while the handler is running, the record
 * stays in the `running` state and later deliveries of the same event are
 * treated as duplicates. Consider adding a lease timestamp if that matters.
 *
 * @param event - Event payload; a JSON round-tripped copy is stored with the record.
 * @param context - Event context; `eventId` keys the record, `timestamp` bounds the retry age.
 * @param handler - Idempotent function that processes the event.
 * @returns Resolves when the event was handled, skipped as a duplicate, or is too old.
 * @throws Whatever `handler` threw, after the failure is recorded and the
 *   backoff delay has elapsed, so that the platform schedules a retry.
 */
export const backgroundFunction = async <T>(
  event: T,
  { eventId, eventType, timestamp: timeCreated }: EventContext,
  handler: (event: T) => Promise<void>
): Promise<void> => {
  const start = Date.now();
  // Returning normally for an expired event ends the platform's retry cycle.
  if (start - Date.parse(timeCreated) > maxRetryAgeMs) {
    return;
  }

  const db = firestore();
  const ref = db.collection(eventCollection).doc(eventId);

  // Atomically claim the event. Resolves to the attempt number of this run,
  // or to undefined when this invocation must not run the handler.
  const attempt = await db.runTransaction(
    async (transaction): Promise<number | undefined> => {
      const snapshot = await transaction.get(ref);
      const previousAttempt: number = snapshot.get('attempt') || 0;
      const status: EventStatus | undefined = snapshot.get('status');
      switch (status) {
        case undefined:
          // First delivery: create the record in the running state.
          transaction.create(ref, {
            // The JSON round trip leaves only plain, serializable data.
            event: JSON.parse(JSON.stringify(event)),
            eventType,
            timeCreated,
            attempt: previousAttempt,
            status: EventStatus.RUNNING,
          });
          return previousAttempt;
        case EventStatus.FAILED: {
          const nextAttempt = previousAttempt + 1;
          transaction.update(ref, {
            attempt: nextAttempt,
            status: EventStatus.RUNNING,
          });
          console.warn(`Retrying '${eventId}' eventId`);
          return nextAttempt;
        }
        case EventStatus.RUNNING:
          console.warn(`Triggered a duplicate '${eventId}' eventId`);
          return undefined;
        default:
          console.error(
            `Unrecognized status '${status}' for '${eventId}' eventId`
          );
          return undefined;
      }
    }
  );
  if (attempt === undefined) {
    return;
  }

  // Track failure with a flag: the thrown value itself may be falsy or not an Error.
  let failed = false;
  let thrown: unknown;
  try {
    await handler(event);
  } catch (e) {
    failed = true;
    thrown = e;
  }

  // Reliably record the error status in Firestore, or clean the record up on
  // success. Writes are retried with a jittered, growing pause (instead of a
  // tight loop) until they succeed or the function's time budget is spent.
  const deadline = start + functionTimeoutSec * 1000;
  for (let writeAttempt = 0; ; writeAttempt++) {
    try {
      await db.runTransaction(async transaction => {
        if (failed) {
          transaction.update(ref, {
            status: EventStatus.FAILED,
            // Always a string: Firestore rejects undefined field values by default.
            reason: describeFailure(thrown),
          });
        } else {
          transaction.delete(ref);
        }
      });
      break;
    } catch (e) {
      // ignore transient errors when writing to Firestore
      console.error(e);
    }
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) {
      console.error(
        `Giving up on recording the outcome for '${eventId}' eventId`
      );
      break;
    }
    await pause(
      Math.random() *
        Math.min(
          baseRetryDelayMs * Math.pow(retryDelayFactor, writeAttempt),
          remainingMs
        )
    );
  }

  // optionally, check if the error is not transient (e.g. a 400/404)
  // and return early **without** re-throwing it
  // (to prevent a retry)
  // ...

  // otherwise, if the error is transient, retry after a delay
  if (failed) {
    const backoffCapMs = baseRetryDelayMs * Math.pow(retryDelayFactor, attempt);
    const remainingMs = functionTimeoutSec * 1000 - (Date.now() - start);
    // Full jitter, never negative and never longer than the time left.
    const retryDelayMs =
      Math.random() * Math.max(0, Math.min(backoffCapMs, remainingMs));
    await pause(retryDelayMs);
    // Re-throw the original value so callers observe exactly what the handler threw.
    throw thrown;
  }
};
/**
 * Example use of backgroundFunction() with Firebase: a Pub/Sub-triggered
 * function on 'topic-name' whose timeout matches `functionTimeoutSec`, so the
 * retry delay computed by the wrapper fits inside the function's time budget.
 * Retries still have to be enabled manually for the deployed function.
 */
export const helloPubSub = runWith({ timeoutSeconds: functionTimeoutSec })
  .pubsub.topic('topic-name')
  .onPublish((event, context) =>
    backgroundFunction(event, context, async ({ json }) => {
      // ... handle message json
    })
  );
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment