Skip to content

Commit 0586258

Browse files
Remove warmup from distributed keynote bench (#4757)
# Description of Changes

Replaces the warmup period in the distributed version of the `keynote-2` benchmark with an explicit start barrier.

1. Removes `--warmup-seconds` from the distributed benchmark flow.
2. Adds an explicit `starting` phase where generators start their local epoch and POST `/started`.
3. Makes the coordinator wait for all participant start acknowledgements before beginning the measured window.
4. Adds `--start-ack-timeout-seconds` as the timeout for that start barrier.
5. Removes `warmupSeconds` from the distributed benchmark protocol/result types.

# API and ABI breaking changes

N/A

# Expected complexity level and risk

1.5

# Testing

N/A
1 parent 29a9d06 commit 0586258

File tree

6 files changed

+102
-31
lines changed

6 files changed

+102
-31
lines changed

templates/keynote-2/DEVELOP.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,6 @@ cd templates/keynote-2
256256
pnpm run bench-dist-coordinator -- \
257257
--test test-1 \
258258
--connector spacetimedb \
259-
--warmup-seconds 15 \
260259
--window-seconds 30 \
261260
--verify 1 \
262261
--stdb-url ws://127.0.0.1:3000 \
@@ -268,9 +267,10 @@ pnpm run bench-dist-coordinator -- \
268267

269268
Notes:
270269

271-
- `--warmup-seconds` is the unmeasured warmup period. Generators submit requests during warmup, but those transactions are excluded from TPS.
270+
- Before measurement begins, the coordinator waits for every participating generator to start its epoch and acknowledge that it is running.
272271
- `--window-seconds` is the measured interval.
273272
- `--verify 1` preserves the existing benchmark semantics by running one verification pass centrally after the epoch completes.
273+
- If a generator never acknowledges start, the coordinator fails the epoch after `--start-ack-timeout-seconds` seconds. The default is `60`.
274274
- The coordinator derives the HTTP metrics endpoint from `--stdb-url` by switching to `http://` or `https://` and appending `/v1/metrics`.
275275
- For a real multi-machine run, change `--bind 127.0.0.1` to `--bind 0.0.0.0` so remote generators can reach the coordinator.
276276
- For a real multi-machine run, set `--stdb-url` to the server machine's reachable address.
@@ -367,8 +367,8 @@ The result contains:
367367
#### Operational notes
368368

369369
- Start the coordinator before the generators.
370-
- Generators begin submitting requests when the coordinator enters `warmup`, not when the measured window begins.
371-
- Throughput is measured only from the committed transaction counter delta recorded after warmup, so warmup transactions are excluded.
370+
- Generators begin submitting requests when the coordinator enters `starting`.
371+
- Throughput is measured only from the committed transaction counter delta recorded after all participating generators have acknowledged start, so startup traffic is excluded.
372372
- For this distributed TypeScript mode, each connection runs closed-loop with one request at a time. There is no pipelining in this flow.
373373
- Late generators are allowed to register and become ready while an epoch is already running, but they only participate in the next epoch.
374374
- The coordinator does not use heartbeats. It includes generators that most recently reported `ready`.

templates/keynote-2/src/core/runner.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,11 +376,6 @@ export async function runOne({
376376
return { start, completedWithinWindow, completedTotal, committedDelta };
377377
};
378378

379-
// const warmUpSeconds = 5;
380-
// console.log(`[${connector.name}] Warming up for ${warmUpSeconds}s...`);
381-
// await run(warmUpSeconds);
382-
// console.log(`[${connector.name}] Finished warmup.`);
383-
384379
console.log(`[${connector.name}] Starting workers for ${seconds}s run...`);
385380

386381
const { start, completedWithinWindow, completedTotal, committedDelta } =

templates/keynote-2/src/core/spacetimeMetrics.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,26 @@ function formatErrorWithCause(err: unknown): string {
1212
return `${err.message}${cause}`;
1313
}
1414

15-
export async function fetchMetrics(url: string): Promise<string> {
15+
async function fetchText(url: string, label: string): Promise<string> {
1616
let res: Response;
1717
try {
1818
res = await fetch(url);
1919
} catch (err) {
20-
throw new Error(`metrics GET ${url} failed: ${formatErrorWithCause(err)}`);
20+
throw new Error(`${label} GET ${url} failed: ${formatErrorWithCause(err)}`);
2121
}
2222

2323
if (!res.ok) {
2424
throw new Error(
25-
`metrics GET ${url} failed: ${res.status} ${res.statusText}`,
25+
`${label} GET ${url} failed: ${res.status} ${res.statusText}`,
2626
);
2727
}
2828
return await res.text();
2929
}
3030

31+
export async function fetchMetrics(url: string): Promise<string> {
32+
return await fetchText(url, 'metrics');
33+
}
34+
3135
export function parseMetricCounter(
3236
body: string,
3337
metricName: string,

templates/keynote-2/src/distributed/coordinator.ts

Lines changed: 65 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import type {
2929
RegisterRequest,
3030
StartEpochRequest,
3131
StartEpochResponse,
32+
StartedRequest,
3233
StoppedRequest,
3334
} from './protocol.ts';
3435
import { isoNow, sleep, writeJsonFile } from './util.ts';
@@ -47,6 +48,7 @@ type ActiveEpoch = {
4748
label: string | null;
4849
participantIds: string[];
4950
participantConnections: number;
51+
startedAcks: Set<string>;
5052
stopAcks: Set<string>;
5153
};
5254

@@ -110,7 +112,7 @@ async function runVerification(
110112
class DistributedCoordinator {
111113
private readonly testName: string;
112114
private readonly connectorName: string;
113-
private readonly warmupMs: number;
115+
private readonly startAckTimeoutMs: number;
114116
private readonly windowMs: number;
115117
private readonly verifyAfterEpoch: boolean;
116118
private readonly stopAckTimeoutMs: number;
@@ -129,7 +131,7 @@ class DistributedCoordinator {
129131
constructor(opts: {
130132
testName: string;
131133
connectorName: string;
132-
warmupMs: number;
134+
startAckTimeoutMs: number;
133135
windowMs: number;
134136
verifyAfterEpoch: boolean;
135137
stopAckTimeoutMs: number;
@@ -140,7 +142,7 @@ class DistributedCoordinator {
140142
}) {
141143
this.testName = opts.testName;
142144
this.connectorName = opts.connectorName;
143-
this.warmupMs = opts.warmupMs;
145+
this.startAckTimeoutMs = opts.startAckTimeoutMs;
144146
this.windowMs = opts.windowMs;
145147
this.verifyAfterEpoch = opts.verifyAfterEpoch;
146148
this.stopAckTimeoutMs = opts.stopAckTimeoutMs;
@@ -194,6 +196,24 @@ class DistributedCoordinator {
194196
return this.snapshot();
195197
}
196198

199+
started(body: StartedRequest): CoordinatorState {
200+
const generator = this.requireGenerator(body.id);
201+
if (!this.currentEpoch || body.epoch !== this.currentEpoch.epoch) {
202+
throw new Error(
203+
`Generator "${body.id}" acknowledged unexpected epoch ${body.epoch}`,
204+
);
205+
}
206+
if (generator.activeEpoch !== body.epoch) {
207+
throw new Error(
208+
`Generator "${body.id}" is not assigned to epoch ${body.epoch}`,
209+
);
210+
}
211+
212+
generator.localState = 'running';
213+
this.currentEpoch.startedAcks.add(body.id);
214+
return this.snapshot();
215+
}
216+
197217
stopped(body: StoppedRequest): CoordinatorState {
198218
const generator = this.requireGenerator(body.id);
199219
generator.localState = 'ready';
@@ -252,18 +272,19 @@ class DistributedCoordinator {
252272
(sum, generator) => sum + generator.openedConnections,
253273
0,
254274
),
275+
startedAcks: new Set<string>(),
255276
stopAcks: new Set<string>(),
256277
};
257278

258279
for (const participantId of activeEpoch.participantIds) {
259280
const generator = this.generators.get(participantId);
260281
if (!generator) continue;
261-
generator.localState = 'running';
282+
generator.localState = 'starting';
262283
generator.activeEpoch = activeEpoch.epoch;
263284
}
264285

265286
this.currentEpoch = activeEpoch;
266-
this.phase = 'warmup';
287+
this.phase = 'starting';
267288
this.epochTask = this.runEpoch(activeEpoch)
268289
.catch((err) => {
269290
const msg = err instanceof Error ? err.message : String(err);
@@ -297,9 +318,14 @@ class DistributedCoordinator {
297318

298319
try {
299320
console.log(
300-
`[coordinator] epoch ${activeEpoch.epoch} warmup for ${(this.warmupMs / 1000).toFixed(1)}s`,
321+
`[coordinator] epoch ${activeEpoch.epoch} waiting for start acknowledgements from ${activeEpoch.participantIds.length} generators`,
301322
);
302-
await sleep(this.warmupMs);
323+
const pendingStarts = await this.waitForStarts(activeEpoch);
324+
if (pendingStarts.length > 0) {
325+
throw new Error(
326+
`Missing start acknowledgements from: ${pendingStarts.join(', ')}`,
327+
);
328+
}
303329

304330
const before = await getSpacetimeCommittedTransfers(this.stdbUrl);
305331
if (before == null) {
@@ -379,7 +405,6 @@ class DistributedCoordinator {
379405
label: activeEpoch.label,
380406
test: this.testName,
381407
connector: this.connectorName,
382-
warmupSeconds: this.warmupMs / 1000,
383408
windowSeconds: this.windowMs / 1000,
384409
actualWindowSeconds,
385410
participantIds: activeEpoch.participantIds,
@@ -405,6 +430,25 @@ class DistributedCoordinator {
405430
console.log(`[coordinator] wrote epoch ${result.epoch} result to ${outPath}`);
406431
}
407432

433+
private async waitForStarts(activeEpoch: ActiveEpoch): Promise<string[]> {
434+
const deadline = Date.now() + this.startAckTimeoutMs;
435+
436+
while (Date.now() < deadline) {
437+
if (activeEpoch.startedAcks.size >= activeEpoch.participantIds.length) {
438+
return [];
439+
}
440+
await sleep(250);
441+
}
442+
443+
const pending = activeEpoch.participantIds.filter(
444+
(id) => !activeEpoch.startedAcks.has(id),
445+
);
446+
console.warn(
447+
`[coordinator] start acknowledgements timed out for epoch ${activeEpoch.epoch}: ${pending.join(', ')}`,
448+
);
449+
return pending;
450+
}
451+
408452
private async waitForStops(activeEpoch: ActiveEpoch): Promise<string[]> {
409453
const deadline = Date.now() + this.stopAckTimeoutMs;
410454

@@ -450,7 +494,11 @@ async function main(): Promise<void> {
450494
new URL('../../runs/distributed/', import.meta.url),
451495
);
452496
const resultsDir = getStringFlag(flags, 'results-dir', defaultResultsDir);
453-
const warmupSeconds = getNumberFlag(flags, 'warmup-seconds', 15);
497+
const startAckTimeoutSeconds = getNumberFlag(
498+
flags,
499+
'start-ack-timeout-seconds',
500+
60,
501+
);
454502
const windowSeconds = getNumberFlag(flags, 'window-seconds', 60);
455503
const stopAckTimeoutSeconds = getNumberFlag(
456504
flags,
@@ -479,7 +527,7 @@ async function main(): Promise<void> {
479527
const coordinator = new DistributedCoordinator({
480528
testName,
481529
connectorName,
482-
warmupMs: warmupSeconds * 1000,
530+
startAckTimeoutMs: startAckTimeoutSeconds * 1000,
483531
windowMs: windowSeconds * 1000,
484532
verifyAfterEpoch,
485533
stopAckTimeoutMs: stopAckTimeoutSeconds * 1000,
@@ -517,6 +565,12 @@ async function main(): Promise<void> {
517565
return;
518566
}
519567

568+
if (method === 'POST' && path === '/started') {
569+
const body = await readJsonBody<StartedRequest>(req);
570+
json(res, 200, coordinator.started(body));
571+
return;
572+
}
573+
520574
if (method === 'POST' && path === '/stopped') {
521575
const body = await readJsonBody<StoppedRequest>(req);
522576
json(res, 200, coordinator.stopped(body));
@@ -550,7 +604,7 @@ async function main(): Promise<void> {
550604
});
551605

552606
console.log(
553-
`[coordinator] listening on http://${bind}:${port} test=${testName} connector=${connectorName} warmup=${warmupSeconds}s window=${windowSeconds}s verify=${verifyAfterEpoch ? 'on' : 'off'} stdb=${stdbUrl} compression=${stdbCompression}`,
607+
`[coordinator] listening on http://${bind}:${port} test=${testName} connector=${connectorName} start_ack_timeout=${startAckTimeoutSeconds}s window=${windowSeconds}s verify=${verifyAfterEpoch ? 'on' : 'off'} stdb=${stdbUrl} compression=${stdbCompression}`,
554608
);
555609
}
556610

templates/keynote-2/src/distributed/generator.ts

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,18 @@ async function main(): Promise<void> {
8989
let activeEpoch: number | null = null;
9090
let stopping = false;
9191

92+
const startActiveEpoch = async (epoch: number) => {
93+
console.log(`[generator ${id}] starting epoch ${epoch}`);
94+
await session.startEpoch(epoch);
95+
activeEpoch = epoch;
96+
await retryUntilSuccess('[generator] started', async () => {
97+
await postJson<CoordinatorState>(coordinatorUrl, '/started', {
98+
id,
99+
epoch,
100+
});
101+
}, pollMs, controlRetries, () => !stopping);
102+
};
103+
92104
const stopActiveEpoch = async () => {
93105
if (activeEpoch == null) return;
94106

@@ -147,17 +159,15 @@ async function main(): Promise<void> {
147159
state.participants.includes(id);
148160
const shouldKeepRunning =
149161
isParticipant &&
150-
(state.phase === 'warmup' || state.phase === 'measure');
162+
(state.phase === 'starting' || state.phase === 'measure');
151163

152164
if (!activeEpoch) {
153165
if (
154-
state.phase === 'warmup' &&
166+
state.phase === 'starting' &&
155167
state.currentEpoch != null &&
156168
state.participants.includes(id)
157169
) {
158-
console.log(`[generator ${id}] starting epoch ${state.currentEpoch}`);
159-
await session.startEpoch(state.currentEpoch);
160-
activeEpoch = state.currentEpoch;
170+
await startActiveEpoch(state.currentEpoch);
161171
}
162172
} else if (!shouldKeepRunning) {
163173
await stopActiveEpoch();

templates/keynote-2/src/distributed/protocol.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1-
export type GeneratorLocalState = 'registered' | 'ready' | 'running';
1+
export type GeneratorLocalState =
2+
| 'registered'
3+
| 'ready'
4+
| 'starting'
5+
| 'running';
26

3-
export type CoordinatorPhase = 'idle' | 'warmup' | 'measure' | 'stop';
7+
export type CoordinatorPhase = 'idle' | 'starting' | 'measure' | 'stop';
48

59
export type GeneratorSnapshot = {
610
id: string;
@@ -16,7 +20,6 @@ export type EpochResult = {
1620
label: string | null;
1721
test: string;
1822
connector: string;
19-
warmupSeconds: number;
2023
windowSeconds: number;
2124
actualWindowSeconds: number;
2225
participantIds: string[];
@@ -54,6 +57,11 @@ export type ReadyRequest = {
5457
openedConnections: number;
5558
};
5659

60+
export type StartedRequest = {
61+
id: string;
62+
epoch: number;
63+
};
64+
5765
export type StoppedRequest = {
5866
id: string;
5967
epoch: number;

0 commit comments

Comments
 (0)