fix: redo how the importer works

@@ -1,17 +1,16 @@
 import {
-  type IClickhouseEvent,
-  type ImportSteps,
-  type Prisma,
   backfillSessionsToProduction,
+  cleanupStagingData,
   createSessionsStartEndEvents,
   db,
-  formatClickhouseDate,
-  generateSessionIds,
+  generateGapBasedSessionIds,
   getImportDateBounds,
-  getImportProgress,
+  type IClickhouseEvent,
+  type IClickhouseProfile,
   insertImportBatch,
-  markImportComplete,
+  insertProfilesBatch,
   moveImportsToProduction,
+  type Prisma,
   updateImportStatus,
 } from '@openpanel/db';
 import { MixpanelProvider, UmamiProvider } from '@openpanel/importer';
@@ -22,294 +21,245 @@ import { logger } from '../utils/logger';
 
 const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10);
 
-/**
- * Yields control back to the event loop to prevent stalled jobs
- */
-async function yieldToEventLoop(): Promise<void> {
+function yieldToEventLoop(): Promise<void> {
   return new Promise((resolve) => {
     setTimeout(resolve, 100);
   });
 }
 
+const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];
+
 export async function importJob(job: Job<ImportQueuePayload>) {
   const { importId } = job.data.payload;
 
   const record = await db.$primary().import.findUniqueOrThrow({
     where: { id: importId },
-    include: {
-      project: true,
-    },
+    include: { project: true },
   });
 
-  const jobLogger = logger.child({
-    importId,
-    config: record.config,
-  });
-
-  type ValidStep = Exclude<ImportSteps, 'failed' | 'completed'>;
-  const steps: Record<ValidStep, number> = {
-    loading: 0,
-    generating_session_ids: 1,
-    creating_sessions: 2,
-    moving: 3,
-    backfilling_sessions: 4,
-  };
+  const jobLogger = logger.child({ importId, config: record.config });
 
   jobLogger.info('Starting import job');
 
   const providerInstance = createProvider(record, jobLogger);
+  const shouldGenerateSessionIds = providerInstance.shouldGenerateSessionIds();
 
   try {
-    // Check if this is a resume operation
-    const isNewImport = record.currentStep === null;
-
-    if (isNewImport) {
-      await updateImportStatus(jobLogger, job, importId, {
-        step: 'loading',
-      });
-    } else {
-      jobLogger.info('Resuming import from previous state', {
-        currentStep: record.currentStep,
-        currentBatch: record.currentBatch,
-      });
-    }
-
-    // Try to get a precomputed total for better progress reporting
-    const totalEvents = await providerInstance
-      .getTotalEventsCount()
-      .catch(() => -1);
-    let processedEvents = record.processedEvents;
-
-    const resumeLoadingFrom =
-      (record.currentStep === 'loading' && record.currentBatch) || undefined;
-
-    const resumeGeneratingSessionIdsFrom =
-      (record.currentStep === 'generating_session_ids' &&
-        record.currentBatch) ||
-      undefined;
-
-    const resumeCreatingSessionsFrom =
-      (record.currentStep === 'creating_sessions' && record.currentBatch) ||
-      undefined;
-
-    const resumeMovingFrom =
-      (record.currentStep === 'moving' && record.currentBatch) || undefined;
-
-    const resumeBackfillingSessionsFrom =
-      (record.currentStep === 'backfilling_sessions' && record.currentBatch) ||
-      undefined;
-
-    // Example:
-    // shouldRunStep(0) // currStep = 2 (should not run)
-    // shouldRunStep(1) // currStep = 2 (should not run)
-    // shouldRunStep(2) // currStep = 2 (should run)
-    // shouldRunStep(3) // currStep = 2 (should run)
-    const shouldRunStep = (step: ValidStep) => {
-      if (isNewImport) {
-        return true;
-      }
-
-      const stepToRunIndex = steps[step];
-      const currentStepIndex = steps[record.currentStep as ValidStep];
-      return stepToRunIndex >= currentStepIndex;
-    };
-
-    async function whileBounds(
-      from: string | undefined,
-      callback: (from: string, to: string) => Promise<void>,
-    ) {
-      const bounds = await getImportDateBounds(importId, from);
-      if (bounds.min && bounds.max) {
-        const start = new Date(bounds.min);
-        const end = new Date(bounds.max);
-        let cursor = new Date(start);
-        while (cursor < end) {
-          const next = new Date(cursor);
-          next.setDate(next.getDate() + 1);
-          await callback(
-            formatClickhouseDate(cursor, true),
-            formatClickhouseDate(next, true),
-          );
-          cursor = next;
-
-          // Yield control back to event loop after processing each day
-          await yieldToEventLoop();
-        }
-      }
-    }
-
-    // Phase 1: Fetch & Transform - Process events in batches
-    if (shouldRunStep('loading')) {
-      const eventBatch: any = [];
-      for await (const rawEvent of providerInstance.parseSource(
-        resumeLoadingFrom,
-      )) {
-        // Validate event
+    const isRetry = record.currentStep !== null;
+    const hasReachedProduction =
+      isRetry && PRODUCTION_STEPS.includes(record.currentStep as string);
+
+    // -------------------------------------------------------
+    // STAGING PHASE: clean slate on failure, run from scratch
+    // -------------------------------------------------------
+    if (!hasReachedProduction) {
+      if (isRetry) {
+        jobLogger.info(
+          'Retry detected before production phase — cleaning staging data'
+        );
+        await cleanupStagingData(importId);
+      }
+
+      // Phase 1: Load events into staging
+      await updateImportStatus(jobLogger, job, importId, { step: 'loading' });
+
+      const totalEvents = await providerInstance
+        .getTotalEventsCount()
+        .catch(() => -1);
+      let processedEvents = 0;
+      const eventBatch: IClickhouseEvent[] = [];
+
+      for await (const rawEvent of providerInstance.parseSource()) {
         if (
           !providerInstance.validate(
-            // @ts-expect-error
-            rawEvent,
+            // @ts-expect-error -- provider-specific raw type
+            rawEvent
           )
         ) {
           jobLogger.warn('Skipping invalid event', { rawEvent });
           continue;
         }
 
-        eventBatch.push(rawEvent);
+        const transformed: IClickhouseEvent = providerInstance.transformEvent(
+          // @ts-expect-error -- provider-specific raw type
+          rawEvent
+        );
 
-        // Process batch when it reaches the batch size
+        // Session IDs for providers that need them (e.g. Mixpanel) are generated
+        // in generateGapBasedSessionIds after loading, using gap-based logic.
+        eventBatch.push(transformed);
+
         if (eventBatch.length >= BATCH_SIZE) {
-          jobLogger.info('Processing batch', { batchSize: eventBatch.length });
-
-          const transformedEvents: IClickhouseEvent[] = eventBatch.map(
-            (
-              // @ts-expect-error
-              event,
-            ) => providerInstance!.transformEvent(event),
-          );
-
-          await insertImportBatch(transformedEvents, importId);
-
+          await insertImportBatch(eventBatch, importId);
           processedEvents += eventBatch.length;
-          eventBatch.length = 0;
 
-          const createdAt = new Date(transformedEvents[0]?.created_at || '')
+          const batchDate = new Date(eventBatch[0]?.created_at || '')
             .toISOString()
             .split('T')[0];
 
           await updateImportStatus(jobLogger, job, importId, {
             step: 'loading',
-            batch: createdAt,
+            batch: batchDate,
             totalEvents,
             processedEvents,
           });
 
-          // Yield control back to event loop after processing each batch
+          eventBatch.length = 0;
           await yieldToEventLoop();
         }
       }
 
-      // Process remaining events in the last batch
       if (eventBatch.length > 0) {
-        const transformedEvents = eventBatch.map(
-          (
-            // @ts-expect-error
-            event,
-          ) => providerInstance!.transformEvent(event),
-        );
-
-        await insertImportBatch(transformedEvents, importId);
-
+        await insertImportBatch(eventBatch, importId);
         processedEvents += eventBatch.length;
-        eventBatch.length = 0;
 
-        const createdAt = new Date(transformedEvents[0]?.created_at || '')
+        const batchDate = new Date(eventBatch[0]?.created_at || '')
           .toISOString()
           .split('T')[0];
 
         await updateImportStatus(jobLogger, job, importId, {
           step: 'loading',
-          batch: createdAt,
+          batch: batchDate,
           totalEvents,
           processedEvents,
         });
 
-        // Yield control back to event loop after processing final batch
-        await yieldToEventLoop();
+        eventBatch.length = 0;
       }
-    }
 
-    // Phase 2: Generate session IDs if provider requires it
-    if (
-      shouldRunStep('generating_session_ids') &&
-      providerInstance.shouldGenerateSessionIds()
-    ) {
-      await whileBounds(resumeGeneratingSessionIdsFrom, async (from) => {
-        console.log('Generating session IDs', { from });
-        await generateSessionIds(importId, from);
-        await updateImportStatus(jobLogger, job, importId, {
-          step: 'generating_session_ids',
-          batch: from,
-        });
-
-        // Yield control back to event loop after processing each day
-        await yieldToEventLoop();
-      });
-      jobLogger.info('Session ID generation complete');
-    }
-
-    // Phase 3-5: Process in daily batches for robustness
-    if (shouldRunStep('creating_sessions')) {
-      await whileBounds(resumeCreatingSessionsFrom, async (from) => {
-        await createSessionsStartEndEvents(importId, from);
-        await updateImportStatus(jobLogger, job, importId, {
-          step: 'creating_sessions',
-          batch: from,
-        });
-
-        // Yield control back to event loop after processing each day
-        await yieldToEventLoop();
-      });
-    }
+      jobLogger.info('Loading complete', { processedEvents });
+
+      // Phase 1b: Load user profiles (Mixpanel only)
+      const profileBatchSize = 5000;
+      if (
+        'streamProfiles' in providerInstance &&
+        typeof (providerInstance as MixpanelProvider).streamProfiles ===
+          'function'
+      ) {
+        await updateImportStatus(jobLogger, job, importId, {
+          step: 'loading_profiles',
+        });
+
+        const profileBatch: IClickhouseProfile[] = [];
+        let processedProfiles = 0;
+
+        for await (const rawProfile of (
+          providerInstance as MixpanelProvider
+        ).streamProfiles()) {
+          const profile = (
+            providerInstance as MixpanelProvider
+          ).transformProfile(rawProfile);
+          profileBatch.push(profile);
+
+          if (profileBatch.length >= profileBatchSize) {
+            await insertProfilesBatch(profileBatch, record.projectId);
+            processedProfiles += profileBatch.length;
+            await updateImportStatus(jobLogger, job, importId, {
+              step: 'loading_profiles',
+              processedProfiles,
+            });
+            profileBatch.length = 0;
+            await yieldToEventLoop();
+          }
+        }
+
+        if (profileBatch.length > 0) {
+          await insertProfilesBatch(profileBatch, record.projectId);
+          processedProfiles += profileBatch.length;
+          await updateImportStatus(jobLogger, job, importId, {
+            step: 'loading_profiles',
+            processedProfiles,
+            totalProfiles: processedProfiles,
+          });
+        }
+
+        jobLogger.info('Profile loading complete', { processedProfiles });
+      }
+
+      // Phase 2: Generate gap-based session IDs (Mixpanel etc.)
+      if (shouldGenerateSessionIds) {
+        await updateImportStatus(jobLogger, job, importId, {
+          step: 'generating_sessions',
+        });
+        await generateGapBasedSessionIds(importId);
+        await yieldToEventLoop();
+        jobLogger.info('Session ID generation complete');
+      }
+
+      // Phase 3: Create session_start / session_end events
+      await updateImportStatus(jobLogger, job, importId, {
+        step: 'creating_sessions',
+        batch: 'all sessions',
+      });
+      await createSessionsStartEndEvents(importId);
+      await yieldToEventLoop();
+      jobLogger.info('Session event creation complete');
+    }
 
-    if (shouldRunStep('moving')) {
-      await whileBounds(resumeMovingFrom, async (from) => {
-        await moveImportsToProduction(importId, from);
-        await updateImportStatus(jobLogger, job, importId, {
-          step: 'moving',
-          batch: from,
-        });
-
-        // Yield control back to event loop after processing each day
-        await yieldToEventLoop();
-      });
-    }
-
-    if (shouldRunStep('backfilling_sessions')) {
-      await whileBounds(resumeBackfillingSessionsFrom, async (from) => {
-        await backfillSessionsToProduction(importId, from);
-        await updateImportStatus(jobLogger, job, importId, {
-          step: 'backfilling_sessions',
-          batch: from,
-        });
-
-        // Yield control back to event loop after processing each day
-        await yieldToEventLoop();
-      });
-    }
-
-    await markImportComplete(importId);
-    await updateImportStatus(jobLogger, job, importId, {
-      step: 'completed',
-    });
-    jobLogger.info('Import marked as complete');
-
-    // Get final progress
-    const finalProgress = await getImportProgress(importId);
-
-    jobLogger.info('Import job completed successfully', {
-      totalEvents: finalProgress.totalEvents,
-      insertedEvents: finalProgress.insertedEvents,
-      status: finalProgress.status,
-    });
-
-    return {
-      success: true,
-      totalEvents: finalProgress.totalEvents,
-      processedEvents: finalProgress.insertedEvents,
-    };
+    // -------------------------------------------------------
+    // PRODUCTION PHASE: resume-safe, track progress per batch
+    // -------------------------------------------------------
+
+    // Phase 3: Move staging events to production (per-day)
+    const resumeMovingFrom =
+      hasReachedProduction && record.currentStep === 'moving'
+        ? (record.currentBatch ?? undefined)
+        : undefined;
+
+    // currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
+    const moveFromDate = (() => {
+      if (!resumeMovingFrom) return undefined;
+      const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
+      next.setUTCDate(next.getUTCDate() + 1);
+      return next.toISOString().split('T')[0]!;
+    })();
+
+    const bounds = await getImportDateBounds(importId, moveFromDate);
+    if (bounds.min && bounds.max) {
+      const startDate = bounds.min.split(' ')[0]!;
+      const endDate = bounds.max.split(' ')[0]!;
+      const cursor = new Date(`${startDate}T12:00:00Z`);
+      const end = new Date(`${endDate}T12:00:00Z`);
+
+      while (cursor <= end) {
+        const dateStr = cursor.toISOString().split('T')[0]!;
+
+        await moveImportsToProduction(importId, dateStr);
+        await updateImportStatus(jobLogger, job, importId, {
+          step: 'moving',
+          batch: dateStr,
+        });
+
+        await yieldToEventLoop();
+        cursor.setUTCDate(cursor.getUTCDate() + 1);
+      }
+    }
+
+    jobLogger.info('Move to production complete');
+
+    // Phase 4: Backfill sessions table
+    await updateImportStatus(jobLogger, job, importId, {
+      step: 'backfilling_sessions',
+      batch: 'all sessions',
+    });
+    await backfillSessionsToProduction(importId);
+    await yieldToEventLoop();
+
+    jobLogger.info('Session backfill complete');
+
+    // Done
+    await updateImportStatus(jobLogger, job, importId, { step: 'completed' });
+    jobLogger.info('Import completed');
+
+    return { success: true };
   } catch (error) {
     jobLogger.error('Import job failed', { error });
 
-    // Mark import as failed
     try {
       const errorMsg = error instanceof Error ? error.message : 'Unknown error';
       await updateImportStatus(jobLogger, job, importId, {
         step: 'failed',
         errorMessage: errorMsg,
       });
-      jobLogger.warn('Import marked as failed', { error: errorMsg });
     } catch (markError) {
       jobLogger.error('Failed to mark import as failed', { error, markError });
     }
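Note: the hunk above replaces the step-index resume model (shouldRunStep, whileBounds) with a two-phase retry policy. Everything up to session-event creation runs against the events_imports staging table and is wiped and redone on retry; only the per-day move and the session backfill track resumable progress. A minimal sketch of the decision, assuming currentStep/currentBatch are persisted per import as the diff suggests:

```ts
// Sketch only; field names follow the diff, the function itself is hypothetical.
const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];

type ImportState = { currentStep: string | null; currentBatch: string | null };

function planRetry(record: ImportState):
  | { kind: 'fresh' }
  | { kind: 'restart_staging' }
  | { kind: 'resume_production'; fromDay: string | undefined } {
  if (record.currentStep === null) return { kind: 'fresh' };
  if (!PRODUCTION_STEPS.includes(record.currentStep)) {
    // Staging is cheap to redo and may be half-written: wipe and restart.
    return { kind: 'restart_staging' };
  }
  // Production already moved whole days; continue after the last completed one.
  return {
    kind: 'resume_production',
    fromDay:
      record.currentStep === 'moving'
        ? (record.currentBatch ?? undefined)
        : undefined,
  };
}

console.log(planRetry({ currentStep: 'moving', currentBatch: '2024-06-01' }));
```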
@@ -320,7 +270,7 @@ export async function importJob(job: Job<ImportQueuePayload>) {
 
 function createProvider(
   record: Prisma.ImportGetPayload<{ include: { project: true } }>,
-  jobLogger: ILogger,
+  jobLogger: ILogger
 ) {
   const config = record.config;
   switch (config.provider) {

@@ -1,6 +1,5 @@
 export * from './src/prisma-client';
 export * from './src/clickhouse/client';
-export * from './src/clickhouse/csv';
 export * from './src/sql-builder';
 export * from './src/services/chart.service';
 export * from './src/engine';
@@ -1,11 +1,9 @@
-import { Readable } from 'node:stream';
 import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
 import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
-import sqlstring from 'sqlstring';
-
 import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config';
 import { createLogger } from '@openpanel/logger';
 import type { IInterval } from '@openpanel/validation';
+import sqlstring from 'sqlstring';
 
 export { createClient };
 

@@ -68,8 +66,11 @@ export const TABLE_NAMES = {
  * Non-clustered mode = self-hosted environments
  */
 export function isClickhouseClustered(): boolean {
-  if (process.env.CLICKHOUSE_CLUSTER === 'true' || process.env.CLICKHOUSE_CLUSTER === '1') {
-    return true
+  if (
+    process.env.CLICKHOUSE_CLUSTER === 'true' ||
+    process.env.CLICKHOUSE_CLUSTER === '1'
+  ) {
+    return true;
   }
 
   return !(

@@ -97,21 +98,21 @@ function getClickhouseSettings(): ClickHouseSettings {
   return {
     distributed_product_mode: 'allow',
     date_time_input_format: 'best_effort',
-    ...(!process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
-      ? {
+    ...(process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
+      ? {}
+      : {
           query_plan_convert_any_join_to_semi_or_anti_join: 0,
-        }
-      : {}),
+        }),
     ...additionalSettings,
   };
 }
 
 export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
   max_open_connections: 30,
-  request_timeout: 300000,
+  request_timeout: 300_000,
   keep_alive: {
     enabled: true,
-    idle_socket_ttl: 60000,
+    idle_socket_ttl: 60_000,
   },
   compression: {
     request: true,
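Note: the settings change above is semantics-preserving. The old spread hinged on a double negative (`!process.env...` selecting the branch that applies the setting); the new one keeps the empty object in the true branch, so the override still applies only when the remove-flag is unset. A tiny self-check of the pattern, with the env name shortened to a hypothetical FLAG:

```ts
// Conditional object spread: include a key only when a flag is NOT set.
const FLAG = process.env.FLAG; // hypothetical stand-in for the real env var

const before = { ...(!FLAG ? { tune: 0 } : {}) };
const after = { ...(FLAG ? {} : { tune: 0 }) };

console.assert(JSON.stringify(before) === JSON.stringify(after));
```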
@@ -138,7 +139,7 @@ const cleanQuery = (query?: string) =>
 export async function withRetry<T>(
   operation: () => Promise<T>,
   maxRetries = 3,
-  baseDelay = 500,
+  baseDelay = 500
 ): Promise<T> {
   let lastError: Error | undefined;
 

@@ -162,7 +163,7 @@ export async function withRetry<T>(
       `Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`,
       {
         error: error.message,
-      },
+      }
     );
     await new Promise((resolve) => setTimeout(resolve, delay));
     continue;

@@ -213,7 +214,7 @@ export const ch = new Proxy(originalCh, {
 
 export async function chQueryWithMeta<T extends Record<string, any>>(
   query: string,
-  clickhouseSettings?: ClickHouseSettings,
+  clickhouseSettings?: ClickHouseSettings
 ): Promise<ResponseJSON<T>> {
   const start = Date.now();
   const res = await ch.query({

@@ -249,44 +250,16 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
   return response;
 }
 
-export async function chInsertCSV(tableName: string, rows: string[]) {
-  try {
-    const now = performance.now();
-    // Create a readable stream in binary mode for CSV (similar to EventBuffer)
-    const csvStream = Readable.from(rows.join('\n'), {
-      objectMode: false,
-    });
-
-    await ch.insert({
-      table: tableName,
-      values: csvStream,
-      format: 'CSV',
-      clickhouse_settings: {
-        format_csv_allow_double_quotes: 1,
-        format_csv_allow_single_quotes: 0,
-      },
-    });
-
-    logger.info('CSV Insert successful', {
-      elapsed: performance.now() - now,
-      rows: rows.length,
-    });
-  } catch (error) {
-    logger.error('CSV Insert failed:', error);
-    throw error;
-  }
-}
-
 export async function chQuery<T extends Record<string, any>>(
   query: string,
-  clickhouseSettings?: ClickHouseSettings,
+  clickhouseSettings?: ClickHouseSettings
 ): Promise<T[]> {
   return (await chQueryWithMeta<T>(query, clickhouseSettings)).data;
 }
 
 export function formatClickhouseDate(
   date: Date | string,
-  skipTime = false,
+  skipTime = false
 ): string {
   if (skipTime) {
     return new Date(date).toISOString().split('T')[0]!;
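Note: the deleted chInsertCSV streamed a pre-escaped CSV string body into ClickHouse; the importer now hands the client plain objects instead. `@clickhouse/client` accepts both shapes, so the swap is mostly about who does the serialization. A rough sketch of the new path (connection and table name are placeholders):

```ts
import { createClient } from '@clickhouse/client';

const ch = createClient({ url: process.env.CLICKHOUSE_URL });

// Object rows: the client emits one JSONEachRow line per object and maps
// fields to columns by name, so no manual escaping layer is needed.
await ch.insert({
  table: 'events_imports', // hypothetical table name
  values: [{ id: '1', name: 'screen_view', path: '/' }],
  format: 'JSONEachRow',
});
```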
@@ -1,53 +0,0 @@
-// ClickHouse Map(String, String) format in CSV uses single quotes, not JSON double quotes
-// Format: '{'key1':'value1','key2':'value2'}'
-// Single quotes inside values must be escaped with backslash: \'
-// We also need to escape newlines and control characters to prevent CSV parsing issues
-const escapeMapValue = (str: string) => {
-  return str
-    .replace(/\\/g, '\\\\') // Escape backslashes first
-    .replace(/'/g, "\\'") // Escape single quotes
-    .replace(/\n/g, '\\n') // Escape newlines
-    .replace(/\r/g, '\\r') // Escape carriage returns
-    .replace(/\t/g, '\\t') // Escape tabs
-    .replace(/\0/g, '\\0'); // Escape null bytes
-};
-
-export const csvEscapeJson = (
-  value: Record<string, unknown> | null | undefined,
-): string => {
-  if (value == null) return '';
-
-  // Normalize to strings if your column is Map(String,String)
-  const normalized: Record<string, string> = Object.fromEntries(
-    Object.entries(value).map(([k, v]) => [
-      String(k),
-      v == null ? '' : String(v),
-    ]),
-  );
-
-  // Empty object should return empty Map (without quotes, csvEscapeField will handle if needed)
-  if (Object.keys(normalized).length === 0) return '{}';
-
-  const pairs = Object.entries(normalized)
-    .map(([k, v]) => `'${escapeMapValue(k)}':'${escapeMapValue(v)}'`)
-    .join(',');
-
-  // Return Map format without outer quotes - csvEscapeField will handle CSV escaping
-  // This allows csvEscapeField to properly wrap/escape the entire field if it contains newlines/quotes
-  return csvEscapeField(`{${pairs}}`);
-};
-
-// Escape a CSV field - wrap in double quotes if it contains commas, quotes, or newlines
-// Double quotes inside must be doubled (""), per CSV standard
-export const csvEscapeField = (value: string | number): string => {
-  const str = String(value);
-
-  // If field contains commas, quotes, or newlines, it must be quoted
-  if (/[,"\n\r]/.test(str)) {
-    // Escape double quotes by doubling them
-    const escaped = str.replace(/"/g, '""');
-    return `"${escaped}"`;
-  }
-
-  return str;
-};
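Note: deleting this file removes two stacked escaping layers: ClickHouse's Map(String, String) literal syntax (single quotes, backslash escapes) wrapped inside standard CSV quoting (double quotes, doubled on escape). A value had to survive both transformations in the right order, and a slip in either corrupts the row. A hedged illustration of how the layers stacked, reconstructed from the deleted helpers:

```ts
// Layer 1: Map-literal escaping; Layer 2: CSV field quoting.
const mapEscape = (s: string) => s.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
const csvQuote = (s: string) =>
  /[,"\n\r]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;

const props = { note: 'it\'s 5" tall, costs $3,000' };
const mapLiteral = `{${Object.entries(props)
  .map(([k, v]) => `'${mapEscape(k)}':'${mapEscape(v)}'`)
  .join(',')}}`;

console.log(csvQuote(mapLiteral)); // quoted because of the comma and double quote
console.log(JSON.stringify(props)); // JSONEachRow needs only this single step
```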
@@ -1,15 +1,14 @@
+import { createHash } from 'node:crypto';
 import type { ILogger } from '@openpanel/logger';
-import sqlstring from 'sqlstring';
 import {
-  TABLE_NAMES,
   ch,
-  chInsertCSV,
   convertClickhouseDateToJs,
   formatClickhouseDate,
   getReplicatedTableName,
+  TABLE_NAMES,
 } from '../clickhouse/client';
-import { csvEscapeField, csvEscapeJson } from '../clickhouse/csv';
-import { type Prisma, db } from '../prisma-client';
+import { db, type Prisma } from '../prisma-client';
+import type { IClickhouseProfile } from './profile.service';
 import type { IClickhouseEvent } from './event.service';
 
 export interface ImportStageResult {

@@ -18,11 +17,89 @@ export interface ImportStageResult {
   insertedEvents: number;
 }
 
-export interface ImportProgress {
-  importId: string;
-  totalEvents: number;
-  insertedEvents: number;
-  status: 'pending' | 'processing' | 'processed' | 'failed';
+const SESSION_GAP_MS = 30 * 60 * 1000; // 30 minutes
+
+/**
+ * Generate gap-based session IDs for events that have none.
+ * Streams events from staging (sorted by device_id, created_at), assigns a new
+ * session when gap > 30 min, re-inserts with session_id, then deletes old rows.
+ */
+export async function generateGapBasedSessionIds(
+  importId: string
+): Promise<void> {
+  let currentDeviceId = '';
+  let currentSessionId = '';
+  let currentLastTime = 0;
+  let currentCounter = -1;
+  const BATCH_SIZE = 5000;
+  const batch: IClickhouseEvent[] = [];
+
+  const result = await ch.query({
+    query: `
+      SELECT id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
+             session_id, path, origin, referrer, referrer_name, referrer_type,
+             duration, properties, created_at, country, city, region,
+             longitude, latitude, os, os_version, browser, browser_version,
+             device, brand, model, imported_at
+      FROM ${TABLE_NAMES.events_imports}
+      WHERE import_id = {importId:String}
+        AND session_id = ''
+        AND device != 'server'
+      ORDER BY device_id, created_at
+    `,
+    query_params: { importId },
+    format: 'JSONEachRow',
+  });
+
+  const stream = result.stream();
+  for await (const rows of stream) {
+    for (const row of rows) {
+      const event = row.json() as IClickhouseEvent;
+      const time = new Date(event.created_at).getTime();
+
+      if (event.device_id !== currentDeviceId) {
+        currentDeviceId = event.device_id;
+        currentSessionId = '';
+        currentLastTime = 0;
+        currentCounter = -1;
+      }
+
+      if (!currentSessionId || time - currentLastTime > SESSION_GAP_MS) {
+        currentCounter++;
+        currentSessionId = createHash('md5')
+          .update(`${event.device_id}-${currentCounter}`)
+          .digest('hex')
+          .toLowerCase();
+      }
+      currentLastTime = time;
+      event.session_id = currentSessionId;
+
+      batch.push(event);
+      if (batch.length >= BATCH_SIZE) {
+        await insertImportBatch(batch, importId);
+        batch.length = 0;
+      }
+    }
+  }
+
+  if (batch.length > 0) {
+    await insertImportBatch(batch, importId);
+  }
+
+  const mutationTable = getReplicatedTableName(TABLE_NAMES.events_imports);
+  await ch.command({
+    query: `ALTER TABLE ${mutationTable} DELETE
+            WHERE import_id = {importId:String}
+              AND session_id = ''
+              AND device != 'server'`,
+    query_params: { importId },
+    clickhouse_settings: {
+      wait_end_of_query: 1,
+      mutations_sync: '2',
+      send_progress_in_http_headers: 1,
+      http_headers_progress_interval_ms: '50000',
+    },
+  });
 }
 
 /**
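Note: generateGapBasedSessionIds above is the classic 30-minute-inactivity sessionizer: events are streamed sorted by (device_id, created_at), and a new session opens whenever the gap since the previous event exceeds the threshold. Because the ID is md5 of device_id plus a per-device counter, re-running over the same sorted stream reproduces the same IDs. The splitting rule in isolation:

```ts
import { createHash } from 'node:crypto';

// Sketch of the gap rule outside ClickHouse; input must be sorted by
// (deviceId, ts), which the ORDER BY in the query above guarantees.
const GAP_MS = 30 * 60 * 1000;

type Ev = { deviceId: string; ts: number };

function assignSessions(events: Ev[]): Array<Ev & { sessionId: string }> {
  let device = '';
  let counter = -1;
  let lastTs = 0;
  let session = '';
  return events.map((e) => {
    if (e.deviceId !== device) {
      device = e.deviceId;
      counter = -1;
      lastTs = 0;
      session = '';
    }
    if (!session || e.ts - lastTs > GAP_MS) {
      counter++;
      session = createHash('md5')
        .update(`${e.deviceId}-${counter}`)
        .digest('hex');
    }
    lastTs = e.ts;
    return { ...e, sessionId: session };
  });
}

// Two events 31 minutes apart fall into different sessions:
const [a, b] = assignSessions([
  { deviceId: 'd1', ts: 0 },
  { deviceId: 'd1', ts: 31 * 60 * 1000 },
]);
console.assert(a!.sessionId !== b!.sessionId);
```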
@@ -30,55 +107,26 @@ export interface ImportProgress {
  */
 export async function insertImportBatch(
   events: IClickhouseEvent[],
-  importId: string,
+  importId: string
 ): Promise<ImportStageResult> {
   if (events.length === 0) {
     return { importId, totalEvents: 0, insertedEvents: 0 };
   }
 
-  // Important to have same order as events_imports table
-  // CSV format: properly quotes fields that need it
-  const csvRows = events.map((event) => {
-    // Properties need to be converted to JSON for Map(String, String)
-    // All fields must be CSV-escaped when joining with commas
-    const fields = [
-      csvEscapeField(event.id || ''),
-      csvEscapeField(event.name),
-      csvEscapeField(event.sdk_name || ''),
-      csvEscapeField(event.sdk_version || ''),
-      csvEscapeField(event.device_id || ''),
-      csvEscapeField(event.profile_id || ''),
-      csvEscapeField(event.project_id || ''),
-      csvEscapeField(event.session_id || ''),
-      csvEscapeField(event.path),
-      csvEscapeField(event.origin || ''),
-      csvEscapeField(event.referrer || ''),
-      csvEscapeField(event.referrer_name || ''),
-      csvEscapeField(event.referrer_type || ''),
-      csvEscapeField(event.duration ?? 0),
-      csvEscapeJson(event.properties),
-      csvEscapeField(event.created_at),
-      csvEscapeField(event.country || ''),
-      csvEscapeField(event.city || ''),
-      csvEscapeField(event.region || ''),
-      csvEscapeField(event.longitude != null ? event.longitude : '\\N'),
-      csvEscapeField(event.latitude != null ? event.latitude : '\\N'),
-      csvEscapeField(event.os || ''),
-      csvEscapeField(event.os_version || ''),
-      csvEscapeField(event.browser || ''),
-      csvEscapeField(event.browser_version || ''),
-      csvEscapeField(event.device || ''),
-      csvEscapeField(event.brand || ''),
-      csvEscapeField(event.model || ''),
-      csvEscapeField('\\N'), // imported_at (Nullable)
-      csvEscapeField(importId),
-      csvEscapeField('pending'), // import_status
-      csvEscapeField(formatClickhouseDate(new Date())), // imported_at_meta (DateTime, not DateTime64, so no milliseconds)
-    ];
-    return fields.join(',');
-  });
-
-  await chInsertCSV(TABLE_NAMES.events_imports, csvRows);
+  const now = formatClickhouseDate(new Date());
+  const rows = events.map((event) => ({
+    ...event,
+    import_id: importId,
+    import_status: 'pending',
+    imported_at: event.imported_at || now,
+    imported_at_meta: now,
+  }));
+
+  await ch.insert({
+    table: TABLE_NAMES.events_imports,
+    values: rows,
+    format: 'JSONEachRow',
+  });
 
   return {
     importId,
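Note: the rewrite above trades a positional format for a named one. CSV rows had to list every column in exact table order (hence the old "Important to have same order" comment); JSONEachRow matches keys to columns by name, so spreading the event and appending the import metadata is enough. One detail doing quiet work: keys written after a spread override it, which is what lets imported_at default without clobbering a real value:

```ts
// Later keys win over a spread; '' is falsy, so the default applies.
const event = { id: 'e1', imported_at: '' }; // hypothetical event row
const now = '2024-01-01 00:00:00';

const row = { ...event, imported_at: event.imported_at || now };
console.assert(row.imported_at === now);
```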
@@ -88,44 +136,86 @@ export async function insertImportBatch(
 }
 
 /**
- * Generate deterministic session IDs for events that don't have them
- * Uses 30-minute time windows to create consistent session IDs across imports
- * Only processes events where device != 'server' and session_id = ''
+ * Insert a batch of profiles into the production profiles table.
+ * Used by Mixpanel (and other providers) to import user profiles during an import job.
  */
-export async function generateSessionIds(
-  importId: string,
-  from: string,
-): Promise<void> {
-  const rangeWhere = [
-    'import_id = {importId:String}',
-    "import_status = 'pending'",
-    "device != 'server'",
-    "session_id = ''",
-    from ? 'toDate(created_at) = {from:String}' : '',
-  ]
-    .filter(Boolean)
-    .join(' AND ');
-
-  // Use SQL to generate deterministic session IDs based on device_id + 30-min time windows
-  // This ensures same events always get same session IDs regardless of import order
-  // In clustered mode, we must use the replicated table for mutations
+export async function insertProfilesBatch(
+  profiles: IClickhouseProfile[],
+  projectId: string
+): Promise<{ inserted: number }> {
+  if (profiles.length === 0) {
+    return { inserted: 0 };
+  }
+
+  const normalized = profiles.map((p) => ({
+    id: p.id,
+    project_id: projectId,
+    first_name: p.first_name ?? '',
+    last_name: p.last_name ?? '',
+    email: p.email ?? '',
+    avatar: p.avatar ?? '',
+    is_external: p.is_external ?? true,
+    properties: Object.fromEntries(
+      Object.entries(p.properties || {}).filter(
+        (kv): kv is [string, string] => kv[1] != null && kv[1] !== ''
+      )
+    ) as Record<string, string>,
+    created_at: p.created_at,
+  }));
+
+  await ch.insert({
+    table: TABLE_NAMES.profiles,
+    values: normalized,
+    format: 'JSONEachRow',
+  });
+
+  return { inserted: normalized.length };
+}
+
+/**
+ * Delete all staging data for an import. Used to get a clean slate on retry
+ * when the failure happened before moving data to production.
+ */
+export async function cleanupStagingData(importId: string): Promise<void> {
   const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports);
-  const updateQuery = `
-    ALTER TABLE ${mutationTableName}
-    UPDATE session_id = lower(hex(MD5(concat(
-      device_id,
-      '-',
-      toString(toInt64(toUnixTimestamp(created_at) / 1800))
-    ))))
-    WHERE ${rangeWhere}
-  `;
-
   await ch.command({
-    query: updateQuery,
-    query_params: { importId, from },
+    query: `ALTER TABLE ${mutationTableName} DELETE WHERE import_id = {importId:String}`,
+    query_params: { importId },
     clickhouse_settings: {
       wait_end_of_query: 1,
-      mutations_sync: '2', // Wait for mutation to complete on all replicas (critical!)
+      mutations_sync: '2',
       send_progress_in_http_headers: 1,
       http_headers_progress_interval_ms: '50000',
     },
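Note: cleanupStagingData and the delete at the end of generateGapBasedSessionIds share one mutation recipe: target the replicated table name (ALTER TABLE mutations must run against the underlying replicated table in clustered mode, which is what getReplicatedTableName appears to resolve) and wait for completion before returning. The settings, reduced to a self-contained sketch (connection and table name hypothetical):

```ts
import { createClient } from '@clickhouse/client';

const ch = createClient({ url: process.env.CLICKHOUSE_URL });

await ch.command({
  query: `ALTER TABLE events_imports_replicated DELETE
          WHERE import_id = {importId:String}`, // hypothetical table name
  query_params: { importId: 'imp_123' },
  clickhouse_settings: {
    wait_end_of_query: 1, // hold the HTTP response until the query finishes
    mutations_sync: '2', // wait for the mutation on all replicas
    send_progress_in_http_headers: 1, // keep the long connection alive
    http_headers_progress_interval_ms: '50000',
  },
});
```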
@@ -133,46 +223,69 @@ export async function generateSessionIds(
 }
 
 /**
- * Reconstruct sessions using SQL-based logic
- * This identifies session boundaries and creates session_start/session_end events
- * session_start inherits all properties from the first event in the session
- * session_end inherits all properties from the last event in the session and calculates duration
+ * Reconstruct sessions across ALL dates for the import.
+ * Each session_id gets exactly one session_start and one session_end,
+ * even if the session spans midnight.
+ *
+ * Batches by fetching distinct session_ids first, then running the
+ * heavy aggregation only for that batch of IDs.
  */
 export async function createSessionsStartEndEvents(
-  importId: string,
-  from: string,
+  importId: string
 ): Promise<void> {
-  // First, let's identify session boundaries and get first/last events for each session
-  const rangeWhere = [
-    'import_id = {importId:String}',
-    "import_status = 'pending'",
-    "session_id != ''", // Only process events that have session IDs
-    'toDate(created_at) = {from:String}',
-  ]
-    .filter(Boolean)
-    .join(' AND ');
+  const SESSION_BATCH_SIZE = 5000;
+  let lastSessionId = '';
+
+  const baseWhere = [
+    'import_id = {importId:String}',
+    "session_id != ''",
+    "name NOT IN ('session_start', 'session_end')",
+  ].join(' AND ');
+
+  while (true) {
+    const idsResult = await ch.query({
+      query: `
+        SELECT DISTINCT session_id
+        FROM ${TABLE_NAMES.events_imports}
+        WHERE ${baseWhere}
+          AND session_id > {lastSessionId:String}
+        ORDER BY session_id
+        LIMIT {limit:UInt32}
+      `,
+      query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
+      format: 'JSONEachRow',
+    });
+
+    const idRows = (await idsResult.json()) as Array<{ session_id: string }>;
+    if (idRows.length === 0) {
+      break;
+    }
+
+    const sessionIds = idRows.map((r) => r.session_id);
 
-  // Use window functions to efficiently get first event (all fields) and last event (only changing fields)
-  // session_end only needs: properties, path, origin, created_at - the rest can be inherited from session_start
     const sessionEventsQuery = `
       SELECT
         device_id,
         session_id,
         project_id,
-        profile_id,
+        if(
+          any(nullIf(profile_id, device_id)) IS NULL,
+          any(profile_id),
+          any(nullIf(profile_id, device_id))
+        ) AS profile_id,
         argMin((path, origin, referrer, referrer_name, referrer_type, properties, created_at, country, city, region, longitude, latitude, os, os_version, browser, browser_version, device, brand, model), created_at) AS first_event,
         argMax((path, origin, properties, created_at), created_at) AS last_event_fields,
         min(created_at) AS first_timestamp,
         max(created_at) AS last_timestamp
       FROM ${TABLE_NAMES.events_imports}
-      WHERE ${rangeWhere}
-        AND name NOT IN ('session_start', 'session_end')
-      GROUP BY session_id, device_id, project_id, profile_id
+      WHERE ${baseWhere}
+        AND session_id IN ({sessionIds:Array(String)})
+      GROUP BY session_id, device_id, project_id
     `;
 
     const sessionEventsResult = await ch.query({
       query: sessionEventsQuery,
-      query_params: { importId, from },
+      query_params: { importId, sessionIds },
       format: 'JSONEachRow',
     });
 
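Note: the while (true) loop above is keyset pagination: instead of OFFSET, each round requests session_ids strictly greater than the last one seen, ordered and limited, so every round costs the same no matter how deep it is, and the cursor doubles as loop state. The skeleton:

```ts
// Keyset pagination skeleton matching the loop above; fetchPage stands in
// for the DISTINCT session_id query (WHERE id > cursor ORDER BY id LIMIT n).
async function forEachIdBatch(
  fetchPage: (after: string, limit: number) => Promise<string[]>,
  handle: (ids: string[]) => Promise<void>,
  limit = 5000,
): Promise<void> {
  let cursor = ''; // '' sorts before every non-empty id
  while (true) {
    const ids = await fetchPage(cursor, limit);
    if (ids.length === 0) break;
    await handle(ids);
    cursor = ids[ids.length - 1]!;
    if (ids.length < limit) break; // a short page means we reached the end
  }
}
```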
@@ -182,14 +295,11 @@ export async function createSessionsStartEndEvents(
|
|||||||
project_id: string;
|
project_id: string;
|
||||||
profile_id: string;
|
profile_id: string;
|
||||||
first_event: [
|
first_event: [
|
||||||
// string, // id
|
|
||||||
// string, // name
|
|
||||||
string, // path
|
string, // path
|
||||||
string, // origin
|
string, // origin
|
||||||
string, // referrer
|
string, // referrer
|
||||||
string, // referrer_name
|
string, // referrer_name
|
||||||
string, // referrer_type
|
string, // referrer_type
|
||||||
// number, // duration
|
|
||||||
Record<string, unknown>, // properties
|
Record<string, unknown>, // properties
|
||||||
string, // created_at
|
string, // created_at
|
||||||
string, // country
|
string, // country
|
||||||
@@ -204,9 +314,6 @@ export async function createSessionsStartEndEvents(
|
|||||||
string, // device
|
string, // device
|
||||||
string, // brand
|
string, // brand
|
||||||
string, // model
|
string, // model
|
||||||
// string, // sdk_name
|
|
||||||
// string, // sdk_version
|
|
||||||
// string, // imported_at
|
|
||||||
];
|
];
|
||||||
last_event_fields: [
|
last_event_fields: [
|
||||||
string, // path
|
string, // path
|
||||||
@@ -218,22 +325,23 @@ export async function createSessionsStartEndEvents(
|
|||||||
last_timestamp: string;
|
last_timestamp: string;
|
||||||
}>;
|
}>;
|
||||||
|
|
||||||
// Create session_start and session_end events
|
|
||||||
const sessionEvents: IClickhouseEvent[] = [];
|
const sessionEvents: IClickhouseEvent[] = [];
|
||||||
|
|
||||||
|
const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
|
||||||
|
const date = convertClickhouseDateToJs(timestamp);
|
||||||
|
date.setTime(date.getTime() + offsetMs);
|
||||||
|
return formatClickhouseDate(date);
|
||||||
|
};
|
||||||
|
|
||||||
for (const session of sessionData) {
|
for (const session of sessionData) {
|
||||||
// Destructure first event tuple (all fields)
|
|
||||||
const [
|
const [
|
||||||
// firstId,
|
|
||||||
// firstName,
|
|
||||||
firstPath,
|
firstPath,
|
||||||
firstOrigin,
|
firstOrigin,
|
||||||
firstReferrer,
|
firstReferrer,
|
||||||
firstReferrerName,
|
firstReferrerName,
|
||||||
firstReferrerType,
|
firstReferrerType,
|
||||||
// firstDuration,
|
|
||||||
firstProperties,
|
firstProperties,
|
||||||
firstCreatedAt,
|
_firstCreatedAt,
|
||||||
firstCountry,
|
firstCountry,
|
||||||
firstCity,
|
firstCity,
|
||||||
firstRegion,
|
firstRegion,
|
||||||
@@ -246,31 +354,15 @@ export async function createSessionsStartEndEvents(
|
|||||||
firstDevice,
|
firstDevice,
|
||||||
firstBrand,
|
firstBrand,
|
||||||
firstModel,
|
firstModel,
|
||||||
// firstSdkName,
|
|
||||||
// firstSdkVersion,
|
|
||||||
// firstImportedAt,
|
|
||||||
] = session.first_event;
|
] = session.first_event;
|
||||||
|
|
||||||
// Destructure last event fields (only the changing ones)
|
const [lastPath, lastOrigin, lastProperties, _lastCreatedAt] =
|
||||||
const [lastPath, lastOrigin, lastProperties, lastCreatedAt] =
|
|
||||||
session.last_event_fields;
|
session.last_event_fields;
|
||||||
|
|
||||||
// Calculate duration in milliseconds
|
|
||||||
// Parse timestamps as Date objects to calculate duration
|
|
||||||
const firstTime = new Date(session.first_timestamp).getTime();
|
const firstTime = new Date(session.first_timestamp).getTime();
|
||||||
const lastTime = new Date(session.last_timestamp).getTime();
|
const lastTime = new Date(session.last_timestamp).getTime();
|
||||||
const durationMs = Math.max(0, lastTime - firstTime); // Ensure non-negative duration
|
const durationMs = Math.max(0, lastTime - firstTime);
|
||||||
|
|
||||||
// Helper function to adjust timestamp by milliseconds without timezone conversion
|
|
||||||
const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
|
|
||||||
// Parse the timestamp, adjust it, and format back to ClickHouse format
|
|
||||||
const date = convertClickhouseDateToJs(timestamp);
|
|
||||||
date.setTime(date.getTime() + offsetMs);
|
|
||||||
return formatClickhouseDate(date);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Create session_start event - inherit everything from first event but change name
|
|
||||||
// Set created_at to 1 second before the first event
|
|
||||||
sessionEvents.push({
|
sessionEvents.push({
|
||||||
id: crypto.randomUUID(),
|
id: crypto.randomUUID(),
|
||||||
name: 'session_start',
|
name: 'session_start',
|
||||||
@@ -283,12 +375,12 @@ export async function createSessionsStartEndEvents(
|
|||||||
referrer: firstReferrer,
|
referrer: firstReferrer,
|
||||||
referrer_name: firstReferrerName,
|
referrer_name: firstReferrerName,
|
||||||
referrer_type: firstReferrerType,
|
referrer_type: firstReferrerType,
|
||||||
duration: 0, // session_start always has 0 duration
|
duration: 0,
|
||||||
properties: firstProperties as Record<
|
properties: firstProperties as Record<
|
||||||
string,
|
string,
|
||||||
string | number | boolean | null | undefined
|
string | number | boolean | null | undefined
|
||||||
>,
|
>,
|
||||||
created_at: adjustTimestamp(session.first_timestamp, -1000), // 1 second before first event
|
created_at: adjustTimestamp(session.first_timestamp, -1000),
|
||||||
country: firstCountry,
|
country: firstCountry,
|
||||||
city: firstCity,
|
city: firstCity,
|
||||||
region: firstRegion,
|
region: firstRegion,
|
||||||
@@ -306,8 +398,6 @@ export async function createSessionsStartEndEvents(
|
|||||||
sdk_version: '1.0.0',
|
sdk_version: '1.0.0',
|
||||||
});
|
});
|
||||||
|
|
||||||
// Create session_end event - inherit most from session_start, but use last event's path, origin, properties
|
|
||||||
// Set created_at to 1 second after the last event
|
|
||||||
sessionEvents.push({
|
sessionEvents.push({
|
||||||
id: crypto.randomUUID(),
|
id: crypto.randomUUID(),
|
||||||
name: 'session_end',
|
name: 'session_end',
|
||||||
@@ -315,133 +405,74 @@ export async function createSessionsStartEndEvents(
|
|||||||
profile_id: session.profile_id,
|
profile_id: session.profile_id,
|
||||||
project_id: session.project_id,
|
project_id: session.project_id,
|
||||||
session_id: session.session_id,
|
session_id: session.session_id,
|
||||||
path: lastPath, // From last event
|
path: lastPath,
|
||||||
origin: lastOrigin, // From last event
|
origin: lastOrigin,
|
||||||
referrer: firstReferrer, // Same as session_start
|
referrer: firstReferrer,
|
||||||
referrer_name: firstReferrerName, // Same as session_start
|
referrer_name: firstReferrerName,
|
||||||
referrer_type: firstReferrerType, // Same as session_start
|
referrer_type: firstReferrerType,
|
||||||
duration: durationMs,
|
duration: durationMs,
|
||||||
properties: lastProperties as Record<
|
properties: lastProperties as Record<
|
||||||
string,
|
string,
|
||||||
string | number | boolean | null | undefined
|
string | number | boolean | null | undefined
|
||||||
>, // From last event
|
>,
|
||||||
created_at: adjustTimestamp(session.last_timestamp, 500), // 1 second after last event
|
created_at: adjustTimestamp(session.last_timestamp, 1000),
|
||||||
country: firstCountry, // Same as session_start
|
country: firstCountry,
|
||||||
city: firstCity, // Same as session_start
|
city: firstCity,
|
||||||
region: firstRegion, // Same as session_start
|
region: firstRegion,
|
||||||
longitude: firstLongitude, // Same as session_start
|
longitude: firstLongitude,
|
||||||
latitude: firstLatitude, // Same as session_start
|
latitude: firstLatitude,
|
||||||
os: firstOs, // Same as session_start
|
os: firstOs,
|
||||||
os_version: firstOsVersion, // Same as session_start
|
os_version: firstOsVersion,
|
||||||
browser: firstBrowser, // Same as session_start
|
browser: firstBrowser,
|
||||||
browser_version: firstBrowserVersion, // Same as session_start
|
browser_version: firstBrowserVersion,
|
||||||
device: firstDevice, // Same as session_start
|
device: firstDevice,
|
||||||
-          brand: firstBrand, // Same as session_start
-          model: firstModel, // Same as session_start
+          brand: firstBrand,
+          model: firstModel,
           imported_at: new Date().toISOString(),
           sdk_name: 'import-session-reconstruction',
           sdk_version: '1.0.0',
         });
       }

-      // Insert session events into imports table
       if (sessionEvents.length > 0) {
         await insertImportBatch(sessionEvents, importId);
       }

+    lastSessionId = idRows[idRows.length - 1]!.session_id;
+    if (idRows.length < SESSION_BATCH_SIZE) {
+      break;
+    }
+  }
 }

 /**
- * Migrate all events from imports table to production events table
- * This includes both original events and generated session events
+ * Move events from staging to production events table.
+ * Batched per-day using a simple date filter.
  */
 export async function moveImportsToProduction(
   importId: string,
-  from: string,
+  from: string
 ): Promise<void> {
-  // Build the WHERE clause for migration
-  // For session events (session_start/session_end), we don't filter by their created_at
-  // because they're created with adjusted timestamps (±1 second) that might fall outside
-  // the date range. Instead, we include them if their session_id has events in this range.
   let whereClause = 'import_id = {importId:String}';

   if (from) {
-    whereClause += ` AND (
-      (toDate(created_at) = {from:String}) OR
-      (
-        name IN ('session_start', 'session_end') AND
-        session_id IN (
-          SELECT DISTINCT session_id
-          FROM ${TABLE_NAMES.events_imports}
-          WHERE import_id = {importId:String}
-            AND toDate(created_at) = {from:String}
-            AND name NOT IN ('session_start', 'session_end')
-        )
-      )
-    )`;
+    whereClause += ' AND toDate(created_at) = {from:String}';
   }

   const migrationQuery = `
     INSERT INTO ${TABLE_NAMES.events} (
-      id,
-      name,
-      sdk_name,
-      sdk_version,
-      device_id,
-      profile_id,
-      project_id,
-      session_id,
-      path,
-      origin,
-      referrer,
-      referrer_name,
-      referrer_type,
-      duration,
-      properties,
-      created_at,
-      country,
-      city,
-      region,
-      longitude,
-      latitude,
-      os,
-      os_version,
-      browser,
-      browser_version,
-      device,
-      brand,
-      model,
-      imported_at
+      id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
+      session_id, path, origin, referrer, referrer_name, referrer_type,
+      duration, properties, created_at, country, city, region,
+      longitude, latitude, os, os_version, browser, browser_version,
+      device, brand, model, imported_at
     )
     SELECT
-      id,
-      name,
-      sdk_name,
-      sdk_version,
-      device_id,
-      profile_id,
-      project_id,
-      session_id,
-      path,
-      origin,
-      referrer,
-      referrer_name,
-      referrer_type,
-      duration,
-      properties,
-      created_at,
-      country,
-      city,
-      region,
-      longitude,
-      latitude,
-      os,
-      os_version,
-      browser,
-      browser_version,
-      device,
-      brand,
-      model,
-      imported_at
+      id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
+      session_id, path, origin, referrer, referrer_name, referrer_type,
+      duration, properties, created_at, country, city, region,
+      longitude, latitude, os, os_version, browser, browser_version,
+      device, brand, model, imported_at
     FROM ${TABLE_NAMES.events_imports}
     WHERE ${whereClause}
     ORDER BY created_at ASC
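Note: with the session-aware sub-select gone, moveImportsToProduction now copies exactly one staging day per call, and the caller is expected to walk the import's date range. A minimal sketch of such a driver, assuming a simple day loop (only getImportDateBounds and moveImportsToProduction are real exports; the loop shape is an illustration, not the job's actual code):

// Sketch: drive the per-day move (loop shape is an assumption).
const { min, max } = await getImportDateBounds(importId);
if (min && max) {
  const day = new Date(min);
  const last = new Date(max);
  while (day <= last) {
    const from = day.toISOString().slice(0, 10); // 'YYYY-MM-DD', matches toDate(created_at)
    await moveImportsToProduction(importId, from);
    day.setUTCDate(day.getUTCDate() + 1);
  }
}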
@@ -452,60 +483,54 @@ export async function moveImportsToProduction(
     query_params: { importId, from },
     clickhouse_settings: {
       wait_end_of_query: 1,
-      // Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
       send_progress_in_http_headers: 1,
-      // The interval of sending these progress headers. Here it is less than 60s,
       http_headers_progress_interval_ms: '50000',
     },
   });
 }

+/**
+ * Aggregate sessions from staging into the sessions table.
+ * Runs across all dates so cross-midnight sessions become one row.
+ * Batches by session_ids to bound ClickHouse memory.
+ */
 export async function backfillSessionsToProduction(
-  importId: string,
-  from: string,
+  importId: string
 ): Promise<void> {
-  // After migrating events, populate the sessions table based on the migrated sessions
-  // We detect all session_ids involved in this import from the imports table,
-  // then aggregate over the production events to construct session rows.
+  const SESSION_BATCH_SIZE = 5000;
+  let lastSessionId = '';
+
+  while (true) {
+    const idsResult = await ch.query({
+      query: `
+        SELECT DISTINCT session_id
+        FROM ${TABLE_NAMES.events_imports}
+        WHERE import_id = {importId:String}
+          AND session_id > {lastSessionId:String}
+        ORDER BY session_id
+        LIMIT {limit:UInt32}
+      `,
+      query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
+      format: 'JSONEachRow',
+    });
+
+    const idRows = (await idsResult.json()) as Array<{ session_id: string }>;
+    if (idRows.length === 0) {
+      break;
+    }
+
+    const sessionIds = idRows.map((r) => r.session_id);

     const sessionsInsertQuery = `
       INSERT INTO ${TABLE_NAMES.sessions} (
-        id,
-        project_id,
-        profile_id,
-        device_id,
-        created_at,
-        ended_at,
-        is_bounce,
-        entry_origin,
-        entry_path,
-        exit_origin,
-        exit_path,
-        screen_view_count,
-        revenue,
-        event_count,
-        duration,
-        country,
-        region,
-        city,
-        longitude,
-        latitude,
-        device,
-        brand,
-        model,
-        browser,
-        browser_version,
-        os,
-        os_version,
-        sign,
-        version,
-        utm_medium,
-        utm_source,
-        utm_campaign,
-        utm_content,
-        utm_term,
-        referrer,
-        referrer_name,
-        referrer_type
+        id, project_id, profile_id, device_id, created_at, ended_at,
+        is_bounce, entry_origin, entry_path, exit_origin, exit_path,
+        screen_view_count, revenue, event_count, duration,
+        country, region, city, longitude, latitude,
+        device, brand, model, browser, browser_version, os, os_version,
+        sign, version,
+        utm_medium, utm_source, utm_campaign, utm_content, utm_term,
+        referrer, referrer_name, referrer_type
       )
       SELECT
         any(e.session_id) as id,
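The backfill above replaces per-day filtering with keyset pagination over session_id: each iteration selects the next SESSION_BATCH_SIZE distinct IDs strictly greater than the last one seen, so a session that crosses midnight is aggregated exactly once and ClickHouse memory stays bounded. The generic pattern, as a standalone sketch (function and parameter names are placeholders, not code from this commit):

// Sketch of the keyset-pagination pattern used above (names are placeholders).
async function* batchedKeys(
  fetchPage: (after: string, limit: number) => Promise<string[]>,
  limit = 5000
): AsyncGenerator<string[]> {
  let cursor = '';
  while (true) {
    // fetchPage runs: WHERE key > cursor ORDER BY key LIMIT limit
    const page = await fetchPage(cursor, limit);
    if (page.length === 0) return;
    yield page;
    cursor = page[page.length - 1]!;
    if (page.length < limit) return; // short page → no more rows
  }
}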
@@ -551,128 +576,34 @@ export async function backfillSessionsToProduction(
         argMinIf(e.referrer_type, e.created_at, e.name = 'session_start') as referrer_type
       FROM ${TABLE_NAMES.events_imports} e
       WHERE
-        e.import_id = ${sqlstring.escape(importId)}
-        AND e.session_id != ''
-        AND (
-          (toDate(e.created_at) = ${sqlstring.escape(from)}) OR
-          (
-            e.name IN ('session_start', 'session_end') AND
-            e.session_id IN (
-              SELECT DISTINCT session_id
-              FROM ${TABLE_NAMES.events_imports}
-              WHERE import_id = ${sqlstring.escape(importId)}
-                AND toDate(created_at) = ${sqlstring.escape(from)}
-                AND name NOT IN ('session_start', 'session_end')
-            )
-          )
-        )
+        e.import_id = {importId:String}
+        AND e.session_id IN ({sessionIds:Array(String)})
       GROUP BY e.session_id
     `;

     await ch.command({
       query: sessionsInsertQuery,
+      query_params: { importId, sessionIds },
       clickhouse_settings: {
         wait_end_of_query: 1,
-        // Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
         send_progress_in_http_headers: 1,
-        // The interval of sending these progress headers. Here it is less than 60s,
         http_headers_progress_interval_ms: '50000',
       },
     });
+
+    lastSessionId = idRows[idRows.length - 1]!.session_id;
+    if (idRows.length < SESSION_BATCH_SIZE) {
+      break;
+    }
+  }
 }

 /**
- * Mark import as complete by updating status
- */
-export async function markImportComplete(importId: string): Promise<void> {
-  // In clustered mode, we must use the replicated table for mutations
-  const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports);
-  const updateQuery = `
-    ALTER TABLE ${mutationTableName}
-    UPDATE import_status = 'processed'
-    WHERE import_id = {importId:String}
-  `;
-
-  await ch.command({
-    query: updateQuery,
-    query_params: { importId },
-    clickhouse_settings: {
-      wait_end_of_query: 1,
-      mutations_sync: '2', // Wait for mutation to complete
-      // Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
-      send_progress_in_http_headers: 1,
-      // The interval of sending these progress headers. Here it is less than 60s,
-      http_headers_progress_interval_ms: '50000',
-    },
-  });
-}
-
-/**
- * Get import progress and status
- */
-export async function getImportProgress(
-  importId: string,
-): Promise<ImportProgress> {
-  const progressQuery = `
-    SELECT
-      import_id,
-      COUNT(*) as total_events,
-      COUNTIf(import_status = 'pending') as pending_events,
-      COUNTIf(import_status = 'processed') as processed_events,
-      any(import_status) as status
-    FROM ${TABLE_NAMES.events_imports}
-    WHERE import_id = {importId:String}
-      AND name NOT IN ('session_start', 'session_end')
-    GROUP BY import_id
-  `;
-
-  const result = await ch.query({
-    query: progressQuery,
-    query_params: { importId },
-    format: 'JSONEachRow',
-  });
-
-  const data = (await result.json()) as Array<{
-    import_id: string;
-    total_events: number;
-    pending_events: number;
-    processed_events: number;
-    status: string;
-  }>;
-
-  if (data.length === 0) {
-    return {
-      importId,
-      totalEvents: 0,
-      insertedEvents: 0,
-      status: 'pending',
-    };
-  }
-
-  const row = data[0];
-  if (!row) {
-    return {
-      importId,
-      totalEvents: 0,
-      insertedEvents: 0,
-      status: 'pending',
-    };
-  }
-
-  return {
-    importId,
-    totalEvents: row.total_events,
-    insertedEvents: row.processed_events,
-    status: row.status as 'pending' | 'processing' | 'processed' | 'failed',
-  };
-}
-
-/**
- * Utility: get min/max created_at for an import
+ * Get min/max created_at for an import's staging data.
  */
 export async function getImportDateBounds(
   importId: string,
-  fromCreatedAt?: string,
+  fromCreatedAt?: string
 ): Promise<{ min: string | null; max: string | null }> {
   const res = await ch.query({
     query: `
@@ -697,10 +628,6 @@ export async function getImportDateBounds(
       : { min: null, max: null };
 }

-/**
- * Unified method to update all import status information
- * Combines step, batch, progress, and status message updates
- */
 export type UpdateImportStatusOptions =
   | {
       step: 'loading';
@@ -709,13 +636,17 @@ export type UpdateImportStatusOptions =
       processedEvents?: number;
     }
   | {
-      step: 'generating_session_ids';
-      batch?: string;
+      step: 'loading_profiles';
+      processedProfiles?: number;
+      totalProfiles?: number;
     }
   | {
       step: 'creating_sessions';
       batch?: string;
     }
+  | {
+      step: 'generating_sessions';
+    }
   | {
       step: 'moving';
       batch?: string;
@@ -740,7 +671,7 @@ export async function updateImportStatus(
     updateProgress: (progress: Record<string, any>) => void;
   },
   importId: string,
-  options: UpdateImportStatusOptions,
+  options: UpdateImportStatusOptions
 ): Promise<void> {
   const data: Prisma.ImportUpdateInput = {};
   switch (options.step) {
@@ -754,27 +685,35 @@ export async function updateImportStatus(
       data.totalEvents = options.totalEvents;
       data.processedEvents = options.processedEvents;
       break;
-    case 'generating_session_ids':
-      data.currentStep = 'generating_session_ids';
-      data.currentBatch = options.batch;
-      data.statusMessage = options.batch
-        ? `Generating session IDs for ${options.batch}`
-        : 'Generating session IDs...';
+    case 'loading_profiles':
+      data.currentStep = 'loading_profiles';
+      data.statusMessage =
+        options.processedProfiles != null && options.totalProfiles != null
+          ? `Importing user profiles (${options.processedProfiles} / ${options.totalProfiles})`
+          : 'Importing user profiles...';
       break;
     case 'creating_sessions':
       data.currentStep = 'creating_sessions';
       data.currentBatch = options.batch;
-      data.statusMessage = `Creating sessions for ${options.batch}`;
+      data.statusMessage = options.batch
+        ? `Creating sessions (${options.batch})`
+        : 'Creating sessions...';
+      break;
+    case 'generating_sessions':
+      data.currentStep = 'generating_sessions';
+      data.statusMessage = 'Generating session IDs...';
       break;
     case 'moving':
       data.currentStep = 'moving';
       data.currentBatch = options.batch;
-      data.statusMessage = `Moving imports to production for ${options.batch}`;
+      data.statusMessage = `Moving events to production (${options.batch})`;
       break;
     case 'backfilling_sessions':
       data.currentStep = 'backfilling_sessions';
       data.currentBatch = options.batch;
-      data.statusMessage = `Aggregating sessions for ${options.batch}`;
+      data.statusMessage = options.batch
+        ? `Aggregating sessions (${options.batch})`
+        : 'Aggregating sessions...';
       break;
     case 'completed':
       data.status = 'completed';
@@ -787,6 +726,8 @@ export async function updateImportStatus(
       data.statusMessage = 'Import failed';
       data.errorMessage = options.errorMessage;
       break;
+    default:
+      break;
   }

   jobLogger.info('Import status update', data);
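For reference, the widened discriminated union makes call sites like the following valid. This is purely illustrative; the first argument stands for whatever job-like object carries updateProgress, and the numbers are made up:

// Illustrative call sites for the new steps (arguments are examples).
await updateImportStatus(job, importId, {
  step: 'loading_profiles',
  processedProfiles: 10_000,
  totalProfiles: 25_000,
}); // statusMessage → "Importing user profiles (10000 / 25000)"

await updateImportStatus(job, importId, { step: 'generating_sessions' });
// statusMessage → "Generating session IDs..."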
@@ -39,7 +39,7 @@ describe('mixpanel', () => {
     const rawEvent = {
       event: '$mp_web_page_view',
       properties: {
-        time: 1746097970,
+        time: 1_746_097_970,
         distinct_id: '$device:123',
         $browser: 'Chrome',
         $browser_version: 135,
@@ -53,7 +53,7 @@ describe('mixpanel', () => {
         $insert_id: 'source_id',
         $lib_version: '2.60.0',
         $mp_api_endpoint: 'api-js.mixpanel.com',
-        $mp_api_timestamp_ms: 1746078175363,
+        $mp_api_timestamp_ms: 1_746_078_175_363,
         $mp_autocapture: true,
         $os: 'Android',
         $referrer: 'https://google.com/',
@@ -71,7 +71,7 @@ describe('mixpanel', () => {
         gclid: 'oqneoqow',
         mp_country_code: 'IN',
         mp_lib: 'web',
-        mp_processing_time_ms: 1746078175546,
+        mp_processing_time_ms: 1_746_078_175_546,
         mp_sent_by_lib_version: '2.60.0',
         utm_medium: 'cpc',
         utm_source: 'google',
@@ -101,7 +101,7 @@ describe('mixpanel', () => {
         __title:
           'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
       },
-      created_at: '2025-05-01T11:12:50.000Z',
+      created_at: '2025-05-01 11:12:50',
       country: 'IN',
       city: 'Mumbai',
       region: 'Maharashtra',
@@ -110,7 +110,7 @@ describe('mixpanel', () => {
       os: 'Android',
       os_version: undefined,
       browser: 'Chrome',
-      browser_version: '',
+      browser_version: '135',
       device: 'mobile',
       brand: '',
       model: '',
@@ -141,7 +141,7 @@ describe('mixpanel', () => {
     const rawEvent = {
       event: 'custom_event',
       properties: {
-        time: 1746097970,
+        time: 1_746_097_970,
         distinct_id: '$device:123',
         $device_id: '123',
         $user_id: 'user123',
@@ -192,7 +192,7 @@ describe('mixpanel', () => {
     const rawEvent = {
       event: 'ec_search_error',
       properties: {
-        time: 1759947367,
+        time: 1_759_947_367,
         distinct_id: '3385916',
         $browser: 'Mobile Safari',
         $browser_version: null,
@@ -207,7 +207,7 @@ describe('mixpanel', () => {
         $insert_id: 'bclkaepeqcfuzt4v',
         $lib_version: '2.60.0',
         $mp_api_endpoint: 'api-js.mixpanel.com',
-        $mp_api_timestamp_ms: 1759927570699,
+        $mp_api_timestamp_ms: 1_759_927_570_699,
         $os: 'iOS',
         $region: 'Karnataka',
         $screen_height: 852,
@@ -225,7 +225,7 @@ describe('mixpanel', () => {
         language: 'english',
         mp_country_code: 'IN',
         mp_lib: 'web',
-        mp_processing_time_ms: 1759927592421,
+        mp_processing_time_ms: 1_759_927_592_421,
         mp_sent_by_lib_version: '2.60.0',
         os: 'web',
         osVersion:
@@ -249,15 +249,15 @@ describe('mixpanel', () => {

     expect(res.id.length).toBeGreaterThan(30);
     expect(res.imported_at).toMatch(
-      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
     );
     expect(omit(['id', 'imported_at'], res)).toEqual({
       brand: 'Apple',
       browser: 'GSA',
-      browser_version: 'null',
+      browser_version: '388.0.811331708',
       city: 'Bengaluru',
       country: 'IN',
-      created_at: '2025-10-08T18:16:07.000Z',
+      created_at: '2025-10-08 18:16:07',
       device: 'mobile',
       device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
       duration: 0,
@@ -1,8 +1,13 @@
 import { randomUUID } from 'node:crypto';
 import { isSameDomain, parsePath, toDots } from '@openpanel/common';
-import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
-import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
-import type { IClickhouseEvent } from '@openpanel/db';
+import {
+  getReferrerWithQuery,
+  parseReferrer,
+  parseUserAgent,
+  type UserAgentInfo,
+} from '@openpanel/common/server';
+import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
+import type { IClickhouseProfile } from '@openpanel/db';
 import type { ILogger } from '@openpanel/logger';
 import type { IMixpanelImportConfig } from '@openpanel/validation';
 import { z } from 'zod';
@@ -15,22 +20,88 @@ export const zMixpanelRawEvent = z.object({

 export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;

+/** Engage API profile: https://docs.mixpanel.com/docs/export-methods#exporting-profiles */
+export const zMixpanelRawProfile = z.object({
+  $distinct_id: z.union([z.string(), z.number()]),
+  $properties: z.record(z.unknown()).optional().default({}),
+});
+export type MixpanelRawProfile = z.infer<typeof zMixpanelRawProfile>;
+
+class MixpanelRateLimitError extends Error {
+  readonly retryAfterMs?: number;
+
+  constructor(message: string, retryAfterMs?: number) {
+    super(message);
+    this.name = 'MixpanelRateLimitError';
+    this.retryAfterMs = retryAfterMs;
+  }
+}
+
 export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
   provider = 'mixpanel';
   version = '1.0.0';

+  private static readonly MAX_REQUESTS_PER_HOUR = 100;
+  private static readonly MIN_REQUEST_INTERVAL_MS = 334; // 3 QPS limit
+  private requestTimestamps: number[] = [];
+  private lastRequestTime = 0;
+
   constructor(
     private readonly projectId: string,
     private readonly config: IMixpanelImportConfig,
-    private readonly logger?: ILogger,
+    private readonly logger?: ILogger
   ) {
     super();
   }

-  async getTotalEventsCount(): Promise<number> {
+  private async waitForRateLimit(): Promise<void> {
+    const now = Date.now();
+    const oneHourAgo = now - 60 * 60 * 1000;
+
+    // Prune timestamps older than 1 hour
+    this.requestTimestamps = this.requestTimestamps.filter(
+      (t) => t > oneHourAgo
+    );
+
+    // Enforce per-second limit (3 QPS → min 334ms gap)
+    const timeSinceLast = now - this.lastRequestTime;
+    if (timeSinceLast < MixpanelProvider.MIN_REQUEST_INTERVAL_MS) {
+      const delay = MixpanelProvider.MIN_REQUEST_INTERVAL_MS - timeSinceLast;
+      await new Promise((resolve) => setTimeout(resolve, delay));
+    }
+
+    // Enforce hourly limit
+    if (
+      this.requestTimestamps.length >= MixpanelProvider.MAX_REQUESTS_PER_HOUR
+    ) {
+      const oldestInWindow = this.requestTimestamps[0]!;
+      const waitUntil = oldestInWindow + 60 * 60 * 1000;
+      const waitMs = waitUntil - Date.now() + 1000; // +1s buffer
+
+      if (waitMs > 0) {
+        this.logger?.info(
+          `Rate limit: ${this.requestTimestamps.length} requests in the last hour, waiting ${Math.ceil(waitMs / 1000)}s`,
+          {
+            requestsInWindow: this.requestTimestamps.length,
+            waitMs,
+          }
+        );
+        await new Promise((resolve) => setTimeout(resolve, waitMs));
+        // Prune again after waiting
+        this.requestTimestamps = this.requestTimestamps.filter(
+          (t) => t > Date.now() - 60 * 60 * 1000
+        );
+      }
+    }
+
+    this.lastRequestTime = Date.now();
+    this.requestTimestamps.push(Date.now());
+  }
+
+  getTotalEventsCount(): Promise<number> {
     // Mixpanel sucks and dont provide a good way to extract total event count within a period
     // jql would work but not accurate and will be deprecated end of 2025
-    return -1;
+    return Promise.resolve(-1);
   }

   /**
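The limiter above keeps two pieces of state: a timestamp list for the 100-requests-per-hour budget and lastRequestTime for the 3 QPS spacing; awaiting it before every outbound call serializes requests per provider instance. The same idea, condensed into a reusable standalone class (this is a sketch restating the logic, not code from the commit):

// Condensed, reusable restatement of the limiter above (illustrative sketch).
class SlidingWindowLimiter {
  private stamps: number[] = [];
  constructor(
    private readonly maxPerWindow: number,
    private readonly windowMs: number,
    private readonly minGapMs: number
  ) {}

  async wait(): Promise<void> {
    const now = Date.now();
    this.stamps = this.stamps.filter((t) => t > now - this.windowMs);
    const last = this.stamps[this.stamps.length - 1] ?? 0;
    const gapWait = Math.max(0, this.minGapMs - (now - last));
    const windowWait =
      this.stamps.length >= this.maxPerWindow
        ? this.stamps[0]! + this.windowMs - now + 1000 // +1s buffer, as above
        : 0;
    const wait = Math.max(gapWait, windowWait);
    if (wait > 0) await new Promise((r) => setTimeout(r, wait));
    this.stamps.push(Date.now());
  }
}

// Usage matching Mixpanel's documented limits:
// const limiter = new SlidingWindowLimiter(100, 60 * 60 * 1000, 334);
// await limiter.wait(); // before each request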
@@ -42,13 +113,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
   }

   async *parseSource(
-    overrideFrom?: string,
+    overrideFrom?: string
   ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
     yield* this.fetchEventsFromMixpanel(overrideFrom);
   }

   private async *fetchEventsFromMixpanel(
-    overrideFrom?: string,
+    overrideFrom?: string
   ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
     const { serviceAccount, serviceSecret, projectId, from, to } = this.config;

@@ -58,20 +129,24 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {

     for (const [chunkFrom, chunkTo] of dateChunks) {
       let retries = 0;
-      const maxRetries = 3;
+      const maxRetries = 6;

       while (retries <= maxRetries) {
         try {
+          await this.waitForRateLimit();
           yield* this.fetchEventsForDateRange(
             serviceAccount,
             serviceSecret,
             projectId,
             chunkFrom,
-            chunkTo,
+            chunkTo
           );
           break; // Success, move to next chunk
         } catch (error) {
           retries++;
+          const isRateLimit =
+            error instanceof MixpanelRateLimitError ||
+            (error instanceof Error && error.message.includes('429'));
           const isLastRetry = retries > maxRetries;

           this.logger?.warn('Failed to fetch events for date range', {
@@ -80,22 +155,31 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
             attempt: retries,
             maxRetries,
             error: (error as Error).message,
+            isRateLimit,
             willRetry: !isLastRetry,
           });

           if (isLastRetry) {
-            // Final attempt failed, re-throw
             throw new Error(
-              `Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`,
+              `Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`
             );
           }

-          // Exponential backoff: wait before retrying
-          const delay = Math.min(1000 * 2 ** (retries - 1), 60_000); // Cap at 1 minute
+          let delay: number;
+          if (error instanceof MixpanelRateLimitError && error.retryAfterMs) {
+            delay = error.retryAfterMs;
+          } else if (isRateLimit) {
+            // 5min → 10min → 15min → 15min → 15min = 60min total
+            delay = Math.min(300_000 * 2 ** (retries - 1), 900_000);
+          } else {
+            delay = Math.min(1000 * 2 ** (retries - 1), 60_000);
+          }

           this.logger?.info('Retrying after delay', {
             delayMs: delay,
             chunkFrom,
             chunkTo,
+            isRateLimit,
           });
           await new Promise((resolve) => setTimeout(resolve, delay));
         }
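With maxRetries raised to 6, the two backoff formulas above produce distinct schedules: ordinary failures wait 1s, 2s, 4s, 8s, 16s, 32s, while rate-limit failures wait 5, 10, then 15 minutes per attempt (capped at 900_000 ms), which is why the in-code comment notes the rate-limited path spans roughly Mixpanel's hourly window. A quick check of both schedules:

// Quick check of the two backoff schedules used above.
for (let retries = 1; retries <= 6; retries++) {
  const rateLimited = Math.min(300_000 * 2 ** (retries - 1), 900_000);
  const normal = Math.min(1000 * 2 ** (retries - 1), 60_000);
  console.log(retries, { rateLimited, normal });
}
// rateLimited: 300000, 600000, 900000, 900000, 900000, 900000 (5/10/15/15/15/15 min)
// normal:      1000, 2000, 4000, 8000, 16000, 32000 (ms, capped at 60s)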
@@ -108,7 +192,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     serviceSecret: string,
     projectId: string,
     from: string,
-    to: string,
+    to: string
   ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
     const url = 'https://data.mixpanel.com/api/2.0/export';

@@ -134,9 +218,18 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
       },
     });

+    if (response.status === 429) {
+      const retryAfter = response.headers.get('Retry-After');
+      const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
+      throw new MixpanelRateLimitError(
+        'Mixpanel rate limit exceeded (429)',
+        retryAfterMs
+      );
+    }
+
     if (!response.ok) {
       throw new Error(
-        `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
+        `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`
       );
     }

@@ -153,7 +246,9 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     while (true) {
       const { done, value } = await reader.read();

-      if (done) break;
+      if (done) {
+        break;
+      }

       buffer += decoder.decode(value, { stream: true });

@@ -187,7 +282,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
           {
             line: buffer.substring(0, 100),
             error,
-          },
+          }
         );
       }
     }
@@ -196,6 +291,114 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     }
   }

+  /**
+   * Stream user profiles from Mixpanel Engage API.
+   * Paginates with page/page_size (5k per page) and yields each profile.
+   */
+  async *streamProfiles(): AsyncGenerator<MixpanelRawProfile, void, unknown> {
+    const { serviceAccount, serviceSecret, projectId } = this.config;
+    const pageSize = 5000;
+    let page = 0;
+
+    while (true) {
+      await this.waitForRateLimit();
+
+      const url = `https://mixpanel.com/api/query/engage?project_id=${encodeURIComponent(projectId)}`;
+      const body = new URLSearchParams({
+        page: String(page),
+        page_size: String(pageSize),
+      });
+
+      this.logger?.info('Fetching profiles from Mixpanel Engage', {
+        page,
+        page_size: pageSize,
+        projectId,
+      });
+
+      const response = await fetch(url, {
+        method: 'POST',
+        headers: {
+          Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
+          Accept: 'application/json',
+          'Content-Type': 'application/x-www-form-urlencoded',
+        },
+        body: body.toString(),
+      });
+
+      if (response.status === 429) {
+        const retryAfter = response.headers.get('Retry-After');
+        const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
+        throw new MixpanelRateLimitError(
+          'Mixpanel rate limit exceeded (429)',
+          retryAfterMs
+        );
+      }
+
+      if (!response.ok) {
+        const text = await response.text();
+        throw new Error(
+          `Failed to fetch profiles from Mixpanel: ${response.status} ${response.statusText} - ${text}`
+        );
+      }
+
+      const data = (await response.json()) as {
+        results?: Array<{ $distinct_id: string | number; $properties?: Record<string, unknown> }>;
+        page?: number;
+        total?: number;
+      };
+
+      const results = data.results ?? [];
+      for (const row of results) {
+        const parsed = zMixpanelRawProfile.safeParse(row);
+        if (parsed.success) {
+          yield parsed.data;
+        } else {
+          this.logger?.warn('Skipping invalid Mixpanel profile', {
+            row: JSON.stringify(row).slice(0, 200),
+          });
+        }
+      }
+
+      if (results.length < pageSize) {
+        break;
+      }
+      page++;
+    }
+  }
+
+  /**
+   * Map Mixpanel Engage profile to OpenPanel IClickhouseProfile.
+   */
+  transformProfile(raw: MixpanelRawProfile): IClickhouseProfile {
+    const parsed = zMixpanelRawProfile.parse(raw);
+    const props = (parsed.$properties || {}) as Record<string, unknown>;
+
+    const id = String(parsed.$distinct_id).replace(/^\$device:/, '');
+    const createdAt = props.$created
+      ? formatClickhouseDate(new Date(String(props.$created)))
+      : formatClickhouseDate(new Date());
+
+    const properties: Record<string, string> = {};
+    const stripPrefix = /^\$/;
+    for (const [key, value] of Object.entries(props)) {
+      if (stripPrefix.test(key)) continue;
+      if (value == null) continue;
+      properties[key] = typeof value === 'object' ? JSON.stringify(value) : String(value);
+    }
+
+    return {
+      id,
+      project_id: this.projectId,
+      first_name: String(props.$first_name ?? ''),
+      last_name: String(props.$last_name ?? ''),
+      email: String(props.$email ?? ''),
+      avatar: String(props.$avatar ?? props.$image ?? ''),
+      properties,
+      created_at: createdAt,
+      is_external: true,
+    };
+  }
+
   validate(rawEvent: MixpanelRawEvent): boolean {
     const res = zMixpanelRawEvent.safeParse(rawEvent);
     return res.success;
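A consumer of the two new profile methods would look roughly like the sketch below. streamProfiles and transformProfile are real methods from this commit; the batching loop, the batch size of 500, and the exact insertProfilesBatch signature (assumed to mirror insertImportBatch) are illustrative assumptions:

// Sketch: batch Engage profiles into ClickHouse (loop shape is illustrative).
const batch: IClickhouseProfile[] = [];
for await (const raw of provider.streamProfiles()) {
  batch.push(provider.transformProfile(raw));
  if (batch.length >= 500) {
    await insertProfilesBatch(batch, importId); // signature assumed
    batch.length = 0;
  }
}
if (batch.length > 0) {
  await insertProfilesBatch(batch, importId); // flush the remainder
}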
@@ -208,7 +411,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     const deviceId = props.$device_id;
     const profileId = String(props.$user_id || props.distinct_id).replace(
       /^\$device:/,
-      '',
+      ''
     );

     // Build full URL from current_url and current_url_search (web only)
@@ -309,7 +512,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
       project_id: projectId,
       session_id: '', // Will be generated in SQL after import
       properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
-      created_at: new Date(props.time * 1000).toISOString(),
+      created_at: formatClickhouseDate(new Date(props.time * 1000)),
       country,
       city,
       region,
@@ -318,10 +521,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
       os: uaInfo.os || props.$os,
       os_version: uaInfo.osVersion || props.$osVersion,
       browser: uaInfo.browser || props.$browser,
-      browser_version:
-        uaInfo.browserVersion || props.$browserVersion
-          ? String(props.$browser_version)
-          : '',
+      browser_version: uaInfo.browserVersion || String(props.$browser_version ?? ''),
       device: this.getDeviceType(props.mp_lib, uaInfo, props),
       brand: uaInfo.brand || '',
       model: uaInfo.model || '',
@@ -338,14 +538,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
       sdk_version: this.version,
     };

-    // TODO: Remove this
-    // Temporary fix for a client
-    const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
-    if (isMightBeScreenView && event.name === 'Loaded a Screen') {
-      event.name = 'screen_view';
-      event.path = isMightBeScreenView;
-    }
-
     // TODO: Remove this
     // This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
     if (props.utm_source && !properties.__query?.utm_source) {
@@ -371,13 +563,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
   private getDeviceType(
     mp_lib: string,
     uaInfo: UserAgentInfo,
-    props: Record<string, any>,
+    props: Record<string, any>
   ) {
     // Normalize lib/os/browser data
     const lib = (mp_lib || '').toLowerCase();
     const os = String(props.$os || uaInfo.os || '').toLowerCase();
     const browser = String(
-      props.$browser || uaInfo.browser || '',
+      props.$browser || uaInfo.browser || ''
     ).toLowerCase();

     const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
@@ -431,11 +623,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     return !this.isWebEvent(mp_lib);
   }

-  private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
-    const props = rawEvent.properties as Record<string, any>;
-    return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
-  }
-
   private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
     // For mobile events, extract device information from Mixpanel properties
     const os = props.$os || props.os || '';
@@ -446,19 +633,19 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {

     return {
       isServer: true,
-      os: os,
-      osVersion: osVersion,
+      os,
+      osVersion,
       browser: '',
       browserVersion: '',
-      device: device,
-      brand: brand,
-      model: model,
+      device,
+      brand,
+      model,
     };
   }

   private stripMixpanelProperties(
     properties: Record<string, any>,
-    searchParams: Record<string, string>,
+    searchParams: Record<string, string>
   ): Record<string, any> {
     const strip = [
       'time',
@@ -472,8 +659,8 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
     ];
     const filtered = Object.fromEntries(
       Object.entries(properties).filter(
-        ([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key),
-      ),
+        ([key]) => !(key.match(/^(\$|mp_|utm_)/) || strip.includes(key))
+      )
     );

     // Parse JSON strings back to objects/arrays so toDots() can flatten them
@@ -2,10 +2,13 @@ import { randomUUID } from 'node:crypto';
 import { Readable } from 'node:stream';
 import { pipeline } from 'node:stream/promises';
 import { createBrotliDecompress, createGunzip } from 'node:zlib';
-import { isSameDomain, parsePath } from '@openpanel/common';
-import { generateDeviceId } from '@openpanel/common/server';
-import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
-import type { IClickhouseEvent } from '@openpanel/db';
+import { isSameDomain, parsePath, toDots } from '@openpanel/common';
+import {
+  generateDeviceId,
+  getReferrerWithQuery,
+  parseReferrer,
+} from '@openpanel/common/server';
+import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
 import type { ILogger } from '@openpanel/logger';
 import type { IUmamiImportConfig } from '@openpanel/validation';
 import { parse } from 'csv-parse';
@@ -63,7 +66,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
   constructor(
     private readonly projectId: string,
     private readonly config: IUmamiImportConfig,
-    private readonly logger?: ILogger,
+    private readonly logger?: ILogger
   ) {
     super();
   }
@@ -82,7 +85,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
       signal?: AbortSignal;
       maxBytes?: number;
       maxRows?: number;
-    } = {},
+    } = {}
   ): AsyncGenerator<UmamiRawEvent, void, unknown> {
     const { signal, maxBytes, maxRows } = opts;
     const controller = new AbortController();
@@ -95,9 +98,9 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
     }

     const res = await fetch(url, { signal: controller.signal });
-    if (!res.ok || !res.body) {
+    if (!(res.ok && res.body)) {
       throw new Error(
-        `Failed to fetch remote file: ${res.status} ${res.statusText}`,
+        `Failed to fetch remote file: ${res.status} ${res.statusText}`
       );
     }

@@ -108,15 +111,15 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
     if (
       contentType &&
       !/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
-        contentType,
+        contentType
       )
     ) {
-      console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
+      this.logger?.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
     }

     if (maxBytes && contentLen && contentLen > maxBytes) {
       throw new Error(
-        `Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
+        `Remote file exceeds size limit (${contentLen} > ${maxBytes})`
       );
     }

@@ -137,9 +140,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
       if (seenBytes > maxBytes) {
         controller.abort();
         body.destroy(
-          new Error(
-            `Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
-          ),
+          new Error(`Stream exceeded size limit (${seenBytes} > ${maxBytes})`)
         );
       }
     });
@@ -190,7 +191,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
       throw new Error(
         `Failed to parse remote file from ${url}: ${
           err instanceof Error ? err.message : String(err)
-        }`,
+        }`
       );
     } finally {
       controller.abort(); // ensure fetch stream is torn down
@@ -205,7 +206,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
   transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
     const projectId =
       this.config.projectMapper.find(
-        (mapper) => mapper.from === _rawEvent.website_id,
+        (mapper) => mapper.from === _rawEvent.website_id
       )?.to || this.projectId;

     const rawEvent = zUmamiRawEvent.parse(_rawEvent);
@@ -261,39 +262,50 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
     }

     // Add useful properties from Umami data
-    if (rawEvent.page_title) properties.__title = rawEvent.page_title;
-    if (rawEvent.screen) properties.__screen = rawEvent.screen;
-    if (rawEvent.language) properties.__language = rawEvent.language;
-    if (rawEvent.utm_source)
+    if (rawEvent.page_title) {
+      properties.__title = rawEvent.page_title;
+    }
+    if (rawEvent.screen) {
+      properties.__screen = rawEvent.screen;
+    }
+    if (rawEvent.language) {
+      properties.__language = rawEvent.language;
+    }
+    if (rawEvent.utm_source) {
       properties = assocPath(
         ['__query', 'utm_source'],
         rawEvent.utm_source,
-        properties,
+        properties
       );
-    if (rawEvent.utm_medium)
+    }
+    if (rawEvent.utm_medium) {
       properties = assocPath(
         ['__query', 'utm_medium'],
         rawEvent.utm_medium,
-        properties,
+        properties
      );
-    if (rawEvent.utm_campaign)
+    }
+    if (rawEvent.utm_campaign) {
       properties = assocPath(
         ['__query', 'utm_campaign'],
         rawEvent.utm_campaign,
-        properties,
+        properties
       );
-    if (rawEvent.utm_content)
+    }
+    if (rawEvent.utm_content) {
       properties = assocPath(
         ['__query', 'utm_content'],
         rawEvent.utm_content,
-        properties,
+        properties
       );
-    if (rawEvent.utm_term)
+    }
+    if (rawEvent.utm_term) {
       properties = assocPath(
         ['__query', 'utm_term'],
         rawEvent.utm_term,
-        properties,
+        properties
       );
+    }

     return {
       id: rawEvent.event_id || randomUUID(),
@@ -302,8 +314,8 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
       profile_id: profileId,
       project_id: projectId,
       session_id: rawEvent.session_id || '',
-      properties,
-      created_at: rawEvent.created_at.toISOString(),
+      properties: toDots(properties),
+      created_at: formatClickhouseDate(rawEvent.created_at),
       country,
       city,
       region: this.mapRegion(region),
@@ -329,7 +341,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
   }

   mapRegion(region: string): string {
-    return region.replace(/^[A-Z]{2}\-/, '');
+    return region.replace(/^[A-Z]{2}-/, '');
   }

   mapDevice(device: string): string {
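The two Umami transform changes mirror the Mixpanel ones: properties are flattened with toDots before insert, and timestamps go through formatClickhouseDate instead of toISOString. Roughly, under the behavior implied by this diff and its tests (the exact output of the real helpers may differ in edge cases):

// Rough illustration of the two transform changes (helper output inferred from the diff).
// toDots flattens nested objects so values fit ClickHouse's Map(String, String):
//   toDots({ __query: { utm_source: 'google' } })
//   → { '__query.utm_source': 'google' }
// formatClickhouseDate emits 'YYYY-MM-DD HH:MM:SS' instead of an ISO string,
// matching the updated test expectations:
//   formatClickhouseDate(new Date('2025-05-01T11:12:50.000Z'))
//   → '2025-05-01 11:12:50'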