fix: optimize event buffer (#278)
* fix: how we fetch profiles in the buffer * perf: optimize event buffer * remove unused file * fix * wip * wip: try groupmq 2 * try simplified event buffer with duration calculation on the fly instead
This commit is contained in:
committed by
GitHub
parent
4736f8509d
commit
4483e464d1
@@ -2,33 +2,13 @@ import { getSafeJson } from '@openpanel/json';
|
||||
import {
|
||||
type Redis,
|
||||
getRedisCache,
|
||||
getRedisPub,
|
||||
publishEvent,
|
||||
} from '@openpanel/redis';
|
||||
import { ch } from '../clickhouse/client';
|
||||
import {
|
||||
type IClickhouseEvent,
|
||||
type IServiceEvent,
|
||||
transformEvent,
|
||||
} from '../services/event.service';
|
||||
import { type IClickhouseEvent } from '../services/event.service';
|
||||
import { BaseBuffer } from './base-buffer';
|
||||
|
||||
/**
|
||||
* Simplified Event Buffer
|
||||
*
|
||||
* Rules:
|
||||
* 1. All events go into a single list buffer (event_buffer:queue)
|
||||
* 2. screen_view events are handled specially:
|
||||
* - Store current screen_view as "last" for the session
|
||||
* - When a new screen_view arrives, flush the previous one with calculated duration
|
||||
* 3. session_end events:
|
||||
* - Retrieve the last screen_view (don't modify it)
|
||||
* - Push both screen_view and session_end to buffer
|
||||
* 4. Flush: Simply process all events from the list buffer
|
||||
*/
|
||||
|
||||
export class EventBuffer extends BaseBuffer {
|
||||
// Configurable limits
|
||||
private batchSize = process.env.EVENT_BUFFER_BATCH_SIZE
|
||||
? Number.parseInt(process.env.EVENT_BUFFER_BATCH_SIZE, 10)
|
||||
: 4000;
|
||||
@@ -36,124 +16,26 @@ export class EventBuffer extends BaseBuffer {
|
||||
? Number.parseInt(process.env.EVENT_BUFFER_CHUNK_SIZE, 10)
|
||||
: 1000;
|
||||
|
||||
private microBatchIntervalMs = process.env.EVENT_BUFFER_MICRO_BATCH_MS
|
||||
? Number.parseInt(process.env.EVENT_BUFFER_MICRO_BATCH_MS, 10)
|
||||
: 10;
|
||||
private microBatchMaxSize = process.env.EVENT_BUFFER_MICRO_BATCH_SIZE
|
||||
? Number.parseInt(process.env.EVENT_BUFFER_MICRO_BATCH_SIZE, 10)
|
||||
: 100;
|
||||
|
||||
private pendingEvents: IClickhouseEvent[] = [];
|
||||
private flushTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
private isFlushing = false;
|
||||
/** Tracks consecutive flush failures for observability; reset on success. */
|
||||
private flushRetryCount = 0;
|
||||
|
||||
private activeVisitorsExpiration = 60 * 5; // 5 minutes
|
||||
|
||||
// LIST - Stores all events ready to be flushed
|
||||
/** How often (ms) we refresh the heartbeat key + zadd per visitor. */
|
||||
private heartbeatRefreshMs = 60_000; // 1 minute
|
||||
private lastHeartbeat = new Map<string, number>();
|
||||
private queueKey = 'event_buffer:queue';
|
||||
|
||||
// STRING - Tracks total buffer size incrementally
|
||||
protected bufferCounterKey = 'event_buffer:total_count';
|
||||
|
||||
// Script SHAs for loaded Lua scripts
|
||||
private scriptShas: {
|
||||
addScreenView?: string;
|
||||
addSessionEnd?: string;
|
||||
} = {};
|
||||
|
||||
// Hash key for storing last screen_view per session
|
||||
private getLastScreenViewKeyBySession(sessionId: string) {
|
||||
return `event_buffer:last_screen_view:session:${sessionId}`;
|
||||
}
|
||||
|
||||
// Hash key for storing last screen_view per profile
|
||||
private getLastScreenViewKeyByProfile(projectId: string, profileId: string) {
|
||||
return `event_buffer:last_screen_view:profile:${projectId}:${profileId}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lua script for handling screen_view addition - RACE-CONDITION SAFE without GroupMQ
|
||||
*
|
||||
* Strategy: Use Redis GETDEL (atomic get-and-delete) to ensure only ONE thread
|
||||
* can process the "last" screen_view at a time.
|
||||
*
|
||||
* KEYS[1] = last screen_view key (by session) - stores both event and timestamp as JSON
|
||||
* KEYS[2] = last screen_view key (by profile, may be empty)
|
||||
* KEYS[3] = queue key
|
||||
* KEYS[4] = buffer counter key
|
||||
* ARGV[1] = new event with timestamp as JSON: {"event": {...}, "ts": 123456}
|
||||
* ARGV[2] = TTL for last screen_view (1 hour)
|
||||
*/
|
||||
private readonly addScreenViewScript = `
|
||||
local sessionKey = KEYS[1]
|
||||
local profileKey = KEYS[2]
|
||||
local queueKey = KEYS[3]
|
||||
local counterKey = KEYS[4]
|
||||
local newEventData = ARGV[1]
|
||||
local ttl = tonumber(ARGV[2])
|
||||
|
||||
-- GETDEL is atomic: get previous and delete in one operation
|
||||
-- This ensures only ONE thread gets the previous event
|
||||
local previousEventData = redis.call("GETDEL", sessionKey)
|
||||
|
||||
-- Store new screen_view as last for session
|
||||
redis.call("SET", sessionKey, newEventData, "EX", ttl)
|
||||
|
||||
-- Store new screen_view as last for profile (if key provided)
|
||||
if profileKey and profileKey ~= "" then
|
||||
redis.call("SET", profileKey, newEventData, "EX", ttl)
|
||||
end
|
||||
|
||||
-- If there was a previous screen_view, add it to queue with calculated duration
|
||||
if previousEventData then
|
||||
local prev = cjson.decode(previousEventData)
|
||||
local curr = cjson.decode(newEventData)
|
||||
|
||||
-- Calculate duration (ensure non-negative to handle clock skew)
|
||||
if prev.ts and curr.ts then
|
||||
prev.event.duration = math.max(0, curr.ts - prev.ts)
|
||||
end
|
||||
|
||||
redis.call("RPUSH", queueKey, cjson.encode(prev.event))
|
||||
redis.call("INCR", counterKey)
|
||||
return 1
|
||||
end
|
||||
|
||||
return 0
|
||||
`;
|
||||
|
||||
/**
|
||||
* Lua script for handling session_end - RACE-CONDITION SAFE
|
||||
*
|
||||
* Uses GETDEL to atomically retrieve and delete the last screen_view
|
||||
*
|
||||
* KEYS[1] = last screen_view key (by session)
|
||||
* KEYS[2] = last screen_view key (by profile, may be empty)
|
||||
* KEYS[3] = queue key
|
||||
* KEYS[4] = buffer counter key
|
||||
* ARGV[1] = session_end event JSON
|
||||
*/
|
||||
private readonly addSessionEndScript = `
|
||||
local sessionKey = KEYS[1]
|
||||
local profileKey = KEYS[2]
|
||||
local queueKey = KEYS[3]
|
||||
local counterKey = KEYS[4]
|
||||
local sessionEndJson = ARGV[1]
|
||||
|
||||
-- GETDEL is atomic: only ONE thread gets the last screen_view
|
||||
local previousEventData = redis.call("GETDEL", sessionKey)
|
||||
local added = 0
|
||||
|
||||
-- If there was a previous screen_view, add it to queue
|
||||
if previousEventData then
|
||||
local prev = cjson.decode(previousEventData)
|
||||
redis.call("RPUSH", queueKey, cjson.encode(prev.event))
|
||||
redis.call("INCR", counterKey)
|
||||
added = added + 1
|
||||
end
|
||||
|
||||
-- Add session_end to queue
|
||||
redis.call("RPUSH", queueKey, sessionEndJson)
|
||||
redis.call("INCR", counterKey)
|
||||
added = added + 1
|
||||
|
||||
-- Delete profile key
|
||||
if profileKey and profileKey ~= "" then
|
||||
redis.call("DEL", profileKey)
|
||||
end
|
||||
|
||||
return added
|
||||
`;
|
||||
|
||||
constructor() {
|
||||
super({
|
||||
name: 'event',
|
||||
@@ -161,170 +43,97 @@ return added
|
||||
await this.processBuffer();
|
||||
},
|
||||
});
|
||||
// Load Lua scripts into Redis on startup
|
||||
this.loadScripts();
|
||||
}
|
||||
|
||||
/**
|
||||
* Load Lua scripts into Redis and cache their SHAs.
|
||||
* This avoids sending the entire script on every call.
|
||||
*/
|
||||
private async loadScripts() {
|
||||
try {
|
||||
const redis = getRedisCache();
|
||||
const [screenViewSha, sessionEndSha] = await Promise.all([
|
||||
redis.script('LOAD', this.addScreenViewScript),
|
||||
redis.script('LOAD', this.addSessionEndScript),
|
||||
]);
|
||||
|
||||
this.scriptShas.addScreenView = screenViewSha as string;
|
||||
this.scriptShas.addSessionEnd = sessionEndSha as string;
|
||||
|
||||
this.logger.info('Loaded Lua scripts into Redis', {
|
||||
addScreenView: this.scriptShas.addScreenView,
|
||||
addSessionEnd: this.scriptShas.addSessionEnd,
|
||||
});
|
||||
} catch (error) {
|
||||
this.logger.error('Failed to load Lua scripts', { error });
|
||||
}
|
||||
}
|
||||
|
||||
bulkAdd(events: IClickhouseEvent[]) {
|
||||
const redis = getRedisCache();
|
||||
const multi = redis.multi();
|
||||
for (const event of events) {
|
||||
this.add(event, multi);
|
||||
this.add(event);
|
||||
}
|
||||
return multi.exec();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an event into Redis buffer.
|
||||
*
|
||||
* Logic:
|
||||
* - screen_view: Store as "last" for session, flush previous if exists
|
||||
* - session_end: Flush last screen_view + session_end
|
||||
* - Other events: Add directly to queue
|
||||
*/
|
||||
async add(event: IClickhouseEvent, _multi?: ReturnType<Redis['multi']>) {
|
||||
add(event: IClickhouseEvent) {
|
||||
this.pendingEvents.push(event);
|
||||
|
||||
if (this.pendingEvents.length >= this.microBatchMaxSize) {
|
||||
this.flushLocalBuffer();
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this.flushTimer) {
|
||||
this.flushTimer = setTimeout(() => {
|
||||
this.flushTimer = null;
|
||||
this.flushLocalBuffer();
|
||||
}, this.microBatchIntervalMs);
|
||||
}
|
||||
}
|
||||
|
||||
public async flush() {
|
||||
if (this.flushTimer) {
|
||||
clearTimeout(this.flushTimer);
|
||||
this.flushTimer = null;
|
||||
}
|
||||
await this.flushLocalBuffer();
|
||||
}
|
||||
|
||||
private async flushLocalBuffer() {
|
||||
if (this.isFlushing || this.pendingEvents.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.isFlushing = true;
|
||||
|
||||
const eventsToFlush = this.pendingEvents;
|
||||
this.pendingEvents = [];
|
||||
|
||||
try {
|
||||
const redis = getRedisCache();
|
||||
const eventJson = JSON.stringify(event);
|
||||
const multi = _multi || redis.multi();
|
||||
const multi = redis.multi();
|
||||
|
||||
if (event.session_id && event.name === 'screen_view') {
|
||||
// Handle screen_view
|
||||
const sessionKey = this.getLastScreenViewKeyBySession(event.session_id);
|
||||
const profileKey = event.profile_id
|
||||
? this.getLastScreenViewKeyByProfile(
|
||||
event.project_id,
|
||||
event.profile_id,
|
||||
)
|
||||
: '';
|
||||
const timestamp = new Date(event.created_at || Date.now()).getTime();
|
||||
|
||||
// Combine event and timestamp into single JSON for atomic operations
|
||||
const eventWithTimestamp = JSON.stringify({
|
||||
event: event,
|
||||
ts: timestamp,
|
||||
});
|
||||
|
||||
this.evalScript(
|
||||
multi,
|
||||
'addScreenView',
|
||||
this.addScreenViewScript,
|
||||
4,
|
||||
sessionKey,
|
||||
profileKey,
|
||||
this.queueKey,
|
||||
this.bufferCounterKey,
|
||||
eventWithTimestamp,
|
||||
'3600', // 1 hour TTL
|
||||
);
|
||||
} else if (event.session_id && event.name === 'session_end') {
|
||||
// Handle session_end
|
||||
const sessionKey = this.getLastScreenViewKeyBySession(event.session_id);
|
||||
const profileKey = event.profile_id
|
||||
? this.getLastScreenViewKeyByProfile(
|
||||
event.project_id,
|
||||
event.profile_id,
|
||||
)
|
||||
: '';
|
||||
|
||||
this.evalScript(
|
||||
multi,
|
||||
'addSessionEnd',
|
||||
this.addSessionEndScript,
|
||||
4,
|
||||
sessionKey,
|
||||
profileKey,
|
||||
this.queueKey,
|
||||
this.bufferCounterKey,
|
||||
eventJson,
|
||||
);
|
||||
} else {
|
||||
// All other events go directly to queue
|
||||
multi.rpush(this.queueKey, eventJson).incr(this.bufferCounterKey);
|
||||
for (const event of eventsToFlush) {
|
||||
multi.rpush(this.queueKey, JSON.stringify(event));
|
||||
if (event.profile_id) {
|
||||
this.incrementActiveVisitorCount(
|
||||
multi,
|
||||
event.project_id,
|
||||
event.profile_id,
|
||||
);
|
||||
}
|
||||
}
|
||||
multi.incrby(this.bufferCounterKey, eventsToFlush.length);
|
||||
|
||||
if (event.profile_id) {
|
||||
this.incrementActiveVisitorCount(
|
||||
multi,
|
||||
event.project_id,
|
||||
event.profile_id,
|
||||
);
|
||||
}
|
||||
await multi.exec();
|
||||
|
||||
if (!_multi) {
|
||||
await multi.exec();
|
||||
}
|
||||
|
||||
await publishEvent('events', 'received', transformEvent(event));
|
||||
this.flushRetryCount = 0;
|
||||
this.pruneHeartbeatMap();
|
||||
} catch (error) {
|
||||
this.logger.error('Failed to add event to Redis buffer', { error });
|
||||
// Re-queue failed events at the front to preserve order and avoid data loss
|
||||
this.pendingEvents = eventsToFlush.concat(this.pendingEvents);
|
||||
|
||||
this.flushRetryCount += 1;
|
||||
this.logger.warn(
|
||||
'Failed to flush local buffer to Redis; events re-queued',
|
||||
{
|
||||
error,
|
||||
eventCount: eventsToFlush.length,
|
||||
flushRetryCount: this.flushRetryCount,
|
||||
},
|
||||
);
|
||||
} finally {
|
||||
this.isFlushing = false;
|
||||
// Events may have accumulated while we were flushing; schedule another flush if needed
|
||||
if (this.pendingEvents.length > 0 && !this.flushTimer) {
|
||||
this.flushTimer = setTimeout(() => {
|
||||
this.flushTimer = null;
|
||||
this.flushLocalBuffer();
|
||||
}, this.microBatchIntervalMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a Lua script using EVALSHA (cached) or fallback to EVAL.
|
||||
* This avoids sending the entire script on every call.
|
||||
*/
|
||||
private evalScript(
|
||||
multi: ReturnType<Redis['multi']>,
|
||||
scriptName: keyof typeof this.scriptShas,
|
||||
scriptContent: string,
|
||||
numKeys: number,
|
||||
...args: (string | number)[]
|
||||
) {
|
||||
const sha = this.scriptShas[scriptName];
|
||||
|
||||
if (sha) {
|
||||
// Use EVALSHA with cached SHA
|
||||
multi.evalsha(sha, numKeys, ...args);
|
||||
} else {
|
||||
// Fallback to EVAL and try to reload script
|
||||
multi.eval(scriptContent, numKeys, ...args);
|
||||
this.logger.warn(`Script ${scriptName} not loaded, using EVAL fallback`);
|
||||
// Attempt to reload scripts in background
|
||||
this.loadScripts();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process the Redis buffer - simplified version.
|
||||
*
|
||||
* Simply:
|
||||
* 1. Fetch events from the queue (up to batchSize)
|
||||
* 2. Parse and sort them
|
||||
* 3. Insert into ClickHouse in chunks
|
||||
* 4. Publish saved events
|
||||
* 5. Clean up processed events from queue
|
||||
*/
|
||||
async processBuffer() {
|
||||
const redis = getRedisCache();
|
||||
|
||||
try {
|
||||
// Fetch events from queue
|
||||
const queueEvents = await redis.lrange(
|
||||
this.queueKey,
|
||||
0,
|
||||
@@ -336,7 +145,6 @@ return added
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse events
|
||||
const eventsToClickhouse: IClickhouseEvent[] = [];
|
||||
for (const eventStr of queueEvents) {
|
||||
const event = getSafeJson<IClickhouseEvent>(eventStr);
|
||||
@@ -350,14 +158,12 @@ return added
|
||||
return;
|
||||
}
|
||||
|
||||
// Sort events by creation time
|
||||
eventsToClickhouse.sort(
|
||||
(a, b) =>
|
||||
new Date(a.created_at || 0).getTime() -
|
||||
new Date(b.created_at || 0).getTime(),
|
||||
);
|
||||
|
||||
// Insert events into ClickHouse in chunks
|
||||
this.logger.info('Inserting events into ClickHouse', {
|
||||
totalEvents: eventsToClickhouse.length,
|
||||
chunks: Math.ceil(eventsToClickhouse.length / this.chunkSize),
|
||||
@@ -371,14 +177,17 @@ return added
|
||||
});
|
||||
}
|
||||
|
||||
// Publish "saved" events
|
||||
const pubMulti = getRedisPub().multi();
|
||||
const countByProject = new Map<string, number>();
|
||||
for (const event of eventsToClickhouse) {
|
||||
await publishEvent('events', 'saved', transformEvent(event), pubMulti);
|
||||
countByProject.set(
|
||||
event.project_id,
|
||||
(countByProject.get(event.project_id) ?? 0) + 1,
|
||||
);
|
||||
}
|
||||
for (const [projectId, count] of countByProject) {
|
||||
publishEvent('events', 'batch', { projectId, count });
|
||||
}
|
||||
await pubMulti.exec();
|
||||
|
||||
// Clean up processed events from queue
|
||||
await redis
|
||||
.multi()
|
||||
.ltrim(this.queueKey, queueEvents.length, -1)
|
||||
@@ -394,45 +203,6 @@ return added
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the latest screen_view event for a given session or profile
|
||||
*/
|
||||
public async getLastScreenView(
|
||||
params:
|
||||
| {
|
||||
sessionId: string;
|
||||
}
|
||||
| {
|
||||
projectId: string;
|
||||
profileId: string;
|
||||
},
|
||||
): Promise<IServiceEvent | null> {
|
||||
const redis = getRedisCache();
|
||||
|
||||
let lastScreenViewKey: string;
|
||||
if ('sessionId' in params) {
|
||||
lastScreenViewKey = this.getLastScreenViewKeyBySession(params.sessionId);
|
||||
} else {
|
||||
lastScreenViewKey = this.getLastScreenViewKeyByProfile(
|
||||
params.projectId,
|
||||
params.profileId,
|
||||
);
|
||||
}
|
||||
|
||||
const eventDataStr = await redis.get(lastScreenViewKey);
|
||||
|
||||
if (eventDataStr) {
|
||||
const eventData = getSafeJson<{ event: IClickhouseEvent; ts: number }>(
|
||||
eventDataStr,
|
||||
);
|
||||
if (eventData?.event) {
|
||||
return transformEvent(eventData.event);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public async getBufferSize() {
|
||||
return this.getBufferSizeWithCounter(async () => {
|
||||
const redis = getRedisCache();
|
||||
@@ -440,16 +210,32 @@ return added
|
||||
});
|
||||
}
|
||||
|
||||
private async incrementActiveVisitorCount(
|
||||
private pruneHeartbeatMap() {
|
||||
const cutoff = Date.now() - this.activeVisitorsExpiration * 1000;
|
||||
for (const [key, ts] of this.lastHeartbeat) {
|
||||
if (ts < cutoff) {
|
||||
this.lastHeartbeat.delete(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private incrementActiveVisitorCount(
|
||||
multi: ReturnType<Redis['multi']>,
|
||||
projectId: string,
|
||||
profileId: string,
|
||||
) {
|
||||
// Track active visitors and emit expiry events when inactive for TTL
|
||||
const key = `${projectId}:${profileId}`;
|
||||
const now = Date.now();
|
||||
const last = this.lastHeartbeat.get(key) ?? 0;
|
||||
|
||||
if (now - last < this.heartbeatRefreshMs) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.lastHeartbeat.set(key, now);
|
||||
const zsetKey = `live:visitors:${projectId}`;
|
||||
const heartbeatKey = `live:visitor:${projectId}:${profileId}`;
|
||||
return multi
|
||||
multi
|
||||
.zadd(zsetKey, now, profileId)
|
||||
.set(heartbeatKey, '1', 'EX', this.activeVisitorsExpiration);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user