fix: importer..
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import {
|
import {
|
||||||
backfillSessionsToProduction,
|
backfillSessionsToProduction,
|
||||||
|
cleanupSessionStartEndEvents,
|
||||||
cleanupStagingData,
|
cleanupStagingData,
|
||||||
createSessionsStartEndEvents,
|
createSessionsStartEndEvents,
|
||||||
db,
|
db,
|
||||||
@@ -27,7 +28,7 @@ function yieldToEventLoop(): Promise<void> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];
|
const RESUMABLE_STEPS = ['creating_sessions', 'moving', 'backfilling_sessions'];
|
||||||
|
|
||||||
export async function importJob(job: Job<ImportQueuePayload>) {
|
export async function importJob(job: Job<ImportQueuePayload>) {
|
||||||
const { importId } = job.data.payload;
|
const { importId } = job.data.payload;
|
||||||
@@ -45,16 +46,16 @@ export async function importJob(job: Job<ImportQueuePayload>) {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const isRetry = record.currentStep !== null;
|
const isRetry = record.currentStep !== null;
|
||||||
const hasReachedProduction =
|
const canResume =
|
||||||
isRetry && PRODUCTION_STEPS.includes(record.currentStep as string);
|
isRetry && RESUMABLE_STEPS.includes(record.currentStep as string);
|
||||||
|
|
||||||
// -------------------------------------------------------
|
// -------------------------------------------------------
|
||||||
// STAGING PHASE: clean slate on failure, run from scratch
|
// STAGING PHASE: clean slate on failure, run from scratch
|
||||||
// -------------------------------------------------------
|
// -------------------------------------------------------
|
||||||
if (!hasReachedProduction) {
|
if (!canResume) {
|
||||||
if (isRetry) {
|
if (isRetry) {
|
||||||
jobLogger.info(
|
jobLogger.info(
|
||||||
'Retry detected before production phase — cleaning staging data'
|
'Retry detected before resumable phase — cleaning staging data'
|
||||||
);
|
);
|
||||||
await cleanupStagingData(importId);
|
await cleanupStagingData(importId);
|
||||||
}
|
}
|
||||||
@@ -183,8 +184,22 @@ export async function importJob(job: Job<ImportQueuePayload>) {
|
|||||||
await yieldToEventLoop();
|
await yieldToEventLoop();
|
||||||
jobLogger.info('Session ID generation complete');
|
jobLogger.info('Session ID generation complete');
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------
|
||||||
|
// SESSION CREATION PHASE: resumable by cleaning session_start/end
|
||||||
|
// -------------------------------------------------------
|
||||||
|
const skipSessionCreation =
|
||||||
|
canResume && record.currentStep !== 'creating_sessions';
|
||||||
|
|
||||||
|
if (!skipSessionCreation) {
|
||||||
|
if (canResume && record.currentStep === 'creating_sessions') {
|
||||||
|
jobLogger.info(
|
||||||
|
'Retry at creating_sessions — cleaning existing session_start/end events'
|
||||||
|
);
|
||||||
|
await cleanupSessionStartEndEvents(importId);
|
||||||
|
}
|
||||||
|
|
||||||
// Phase 3: Create session_start / session_end events
|
|
||||||
await updateImportStatus(jobLogger, job, importId, {
|
await updateImportStatus(jobLogger, job, importId, {
|
||||||
step: 'creating_sessions',
|
step: 'creating_sessions',
|
||||||
batch: 'all sessions',
|
batch: 'all sessions',
|
||||||
@@ -201,13 +216,15 @@ export async function importJob(job: Job<ImportQueuePayload>) {
|
|||||||
|
|
||||||
// Phase 3: Move staging events to production (per-day)
|
// Phase 3: Move staging events to production (per-day)
|
||||||
const resumeMovingFrom =
|
const resumeMovingFrom =
|
||||||
hasReachedProduction && record.currentStep === 'moving'
|
canResume && record.currentStep === 'moving'
|
||||||
? (record.currentBatch ?? undefined)
|
? (record.currentBatch ?? undefined)
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
// currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
|
// currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
|
||||||
const moveFromDate = (() => {
|
const moveFromDate = (() => {
|
||||||
if (!resumeMovingFrom) return undefined;
|
if (!resumeMovingFrom) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
|
const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
|
||||||
next.setUTCDate(next.getUTCDate() + 1);
|
next.setUTCDate(next.getUTCDate() + 1);
|
||||||
return next.toISOString().split('T')[0]!;
|
return next.toISOString().split('T')[0]!;
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ import {
|
|||||||
TABLE_NAMES,
|
TABLE_NAMES,
|
||||||
} from '../clickhouse/client';
|
} from '../clickhouse/client';
|
||||||
import { db, type Prisma } from '../prisma-client';
|
import { db, type Prisma } from '../prisma-client';
|
||||||
import type { IClickhouseProfile } from './profile.service';
|
|
||||||
import type { IClickhouseEvent } from './event.service';
|
import type { IClickhouseEvent } from './event.service';
|
||||||
|
import type { IClickhouseProfile } from './profile.service';
|
||||||
|
|
||||||
export interface ImportStageResult {
|
export interface ImportStageResult {
|
||||||
importId: string;
|
importId: string;
|
||||||
@@ -172,38 +172,6 @@ export async function insertProfilesBatch(
|
|||||||
return { inserted: normalized.length };
|
return { inserted: normalized.length };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Delete all staging data for an import. Used to get a clean slate on retry
|
* Delete all staging data for an import. Used to get a clean slate on retry
|
||||||
* when the failure happened before moving data to production.
|
* when the failure happened before moving data to production.
|
||||||
@@ -222,6 +190,22 @@ export async function cleanupStagingData(importId: string): Promise<void> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function cleanupSessionStartEndEvents(
|
||||||
|
importId: string
|
||||||
|
): Promise<void> {
|
||||||
|
const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports);
|
||||||
|
await ch.command({
|
||||||
|
query: `ALTER TABLE ${mutationTableName} DELETE WHERE import_id = {importId:String} AND name IN ('session_start', 'session_end')`,
|
||||||
|
query_params: { importId },
|
||||||
|
clickhouse_settings: {
|
||||||
|
wait_end_of_query: 1,
|
||||||
|
mutations_sync: '2',
|
||||||
|
send_progress_in_http_headers: 1,
|
||||||
|
http_headers_progress_interval_ms: '50000',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reconstruct sessions across ALL dates for the import.
|
* Reconstruct sessions across ALL dates for the import.
|
||||||
* Each session_id gets exactly one session_start and one session_end,
|
* Each session_id gets exactly one session_start and one session_end,
|
||||||
@@ -242,27 +226,16 @@ export async function createSessionsStartEndEvents(
|
|||||||
"name NOT IN ('session_start', 'session_end')",
|
"name NOT IN ('session_start', 'session_end')",
|
||||||
].join(' AND ');
|
].join(' AND ');
|
||||||
|
|
||||||
|
const sessionBatchSubquery = `
|
||||||
|
(SELECT DISTINCT session_id
|
||||||
|
FROM ${TABLE_NAMES.events_imports}
|
||||||
|
WHERE ${baseWhere}
|
||||||
|
AND session_id > {lastSessionId:String}
|
||||||
|
ORDER BY session_id
|
||||||
|
LIMIT {limit:UInt32})
|
||||||
|
`;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
const idsResult = await ch.query({
|
|
||||||
query: `
|
|
||||||
SELECT DISTINCT session_id
|
|
||||||
FROM ${TABLE_NAMES.events_imports}
|
|
||||||
WHERE ${baseWhere}
|
|
||||||
AND session_id > {lastSessionId:String}
|
|
||||||
ORDER BY session_id
|
|
||||||
LIMIT {limit:UInt32}
|
|
||||||
`,
|
|
||||||
query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
|
|
||||||
format: 'JSONEachRow',
|
|
||||||
});
|
|
||||||
|
|
||||||
const idRows = (await idsResult.json()) as Array<{ session_id: string }>;
|
|
||||||
if (idRows.length === 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const sessionIds = idRows.map((r) => r.session_id);
|
|
||||||
|
|
||||||
const sessionEventsQuery = `
|
const sessionEventsQuery = `
|
||||||
SELECT
|
SELECT
|
||||||
device_id,
|
device_id,
|
||||||
@@ -279,13 +252,13 @@ export async function createSessionsStartEndEvents(
|
|||||||
max(created_at) AS last_timestamp
|
max(created_at) AS last_timestamp
|
||||||
FROM ${TABLE_NAMES.events_imports}
|
FROM ${TABLE_NAMES.events_imports}
|
||||||
WHERE ${baseWhere}
|
WHERE ${baseWhere}
|
||||||
AND session_id IN ({sessionIds:Array(String)})
|
AND session_id IN ${sessionBatchSubquery}
|
||||||
GROUP BY session_id, device_id, project_id
|
GROUP BY session_id, device_id, project_id
|
||||||
`;
|
`;
|
||||||
|
|
||||||
const sessionEventsResult = await ch.query({
|
const sessionEventsResult = await ch.query({
|
||||||
query: sessionEventsQuery,
|
query: sessionEventsQuery,
|
||||||
query_params: { importId, sessionIds },
|
query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
|
||||||
format: 'JSONEachRow',
|
format: 'JSONEachRow',
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -438,8 +411,11 @@ export async function createSessionsStartEndEvents(
|
|||||||
await insertImportBatch(sessionEvents, importId);
|
await insertImportBatch(sessionEvents, importId);
|
||||||
}
|
}
|
||||||
|
|
||||||
lastSessionId = idRows[idRows.length - 1]!.session_id;
|
if (sessionData.length === 0) {
|
||||||
if (idRows.length < SESSION_BATCH_SIZE) {
|
break;
|
||||||
|
}
|
||||||
|
lastSessionId = sessionData.at(-1)!.session_id;
|
||||||
|
if (sessionData.length < SESSION_BATCH_SIZE) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -500,6 +476,15 @@ export async function backfillSessionsToProduction(
|
|||||||
const SESSION_BATCH_SIZE = 5000;
|
const SESSION_BATCH_SIZE = 5000;
|
||||||
let lastSessionId = '';
|
let lastSessionId = '';
|
||||||
|
|
||||||
|
const baseWhere = 'import_id = {importId:String} AND session_id > {lastSessionId:String}';
|
||||||
|
const sessionBatchSubquery = `
|
||||||
|
(SELECT DISTINCT session_id
|
||||||
|
FROM ${TABLE_NAMES.events_imports}
|
||||||
|
WHERE ${baseWhere}
|
||||||
|
ORDER BY session_id
|
||||||
|
LIMIT {limit:UInt32})
|
||||||
|
`;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
const idsResult = await ch.query({
|
const idsResult = await ch.query({
|
||||||
query: `
|
query: `
|
||||||
@@ -519,8 +504,6 @@ export async function backfillSessionsToProduction(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const sessionIds = idRows.map((r) => r.session_id);
|
|
||||||
|
|
||||||
const sessionsInsertQuery = `
|
const sessionsInsertQuery = `
|
||||||
INSERT INTO ${TABLE_NAMES.sessions} (
|
INSERT INTO ${TABLE_NAMES.sessions} (
|
||||||
id, project_id, profile_id, device_id, created_at, ended_at,
|
id, project_id, profile_id, device_id, created_at, ended_at,
|
||||||
@@ -577,13 +560,13 @@ export async function backfillSessionsToProduction(
|
|||||||
FROM ${TABLE_NAMES.events_imports} e
|
FROM ${TABLE_NAMES.events_imports} e
|
||||||
WHERE
|
WHERE
|
||||||
e.import_id = {importId:String}
|
e.import_id = {importId:String}
|
||||||
AND e.session_id IN ({sessionIds:Array(String)})
|
AND e.session_id IN ${sessionBatchSubquery}
|
||||||
GROUP BY e.session_id
|
GROUP BY e.session_id
|
||||||
`;
|
`;
|
||||||
|
|
||||||
await ch.command({
|
await ch.command({
|
||||||
query: sessionsInsertQuery,
|
query: sessionsInsertQuery,
|
||||||
query_params: { importId, sessionIds },
|
query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
|
||||||
clickhouse_settings: {
|
clickhouse_settings: {
|
||||||
wait_end_of_query: 1,
|
wait_end_of_query: 1,
|
||||||
send_progress_in_http_headers: 1,
|
send_progress_in_http_headers: 1,
|
||||||
@@ -591,7 +574,7 @@ export async function backfillSessionsToProduction(
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
lastSessionId = idRows[idRows.length - 1]!.session_id;
|
lastSessionId = idRows.at(-1)!.session_id;
|
||||||
if (idRows.length < SESSION_BATCH_SIZE) {
|
if (idRows.length < SESSION_BATCH_SIZE) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user