fix: redo how the importer works

This commit is contained in:
Carl-Gerhard Lindesvärd
2026-03-01 21:48:46 +01:00
parent 6251d143d1
commit 647ac2a4af
8 changed files with 993 additions and 984 deletions

View File

@@ -1,17 +1,16 @@
import { import {
type IClickhouseEvent,
type ImportSteps,
type Prisma,
backfillSessionsToProduction, backfillSessionsToProduction,
cleanupStagingData,
createSessionsStartEndEvents, createSessionsStartEndEvents,
db, db,
formatClickhouseDate, generateGapBasedSessionIds,
generateSessionIds,
getImportDateBounds, getImportDateBounds,
getImportProgress, type IClickhouseEvent,
type IClickhouseProfile,
insertImportBatch, insertImportBatch,
markImportComplete, insertProfilesBatch,
moveImportsToProduction, moveImportsToProduction,
type Prisma,
updateImportStatus, updateImportStatus,
} from '@openpanel/db'; } from '@openpanel/db';
import { MixpanelProvider, UmamiProvider } from '@openpanel/importer'; import { MixpanelProvider, UmamiProvider } from '@openpanel/importer';
@@ -22,294 +21,245 @@ import { logger } from '../utils/logger';
const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10); const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10);
/** function yieldToEventLoop(): Promise<void> {
* Yields control back to the event loop to prevent stalled jobs
*/
async function yieldToEventLoop(): Promise<void> {
return new Promise((resolve) => { return new Promise((resolve) => {
setTimeout(resolve, 100); setTimeout(resolve, 100);
}); });
} }
const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];
export async function importJob(job: Job<ImportQueuePayload>) { export async function importJob(job: Job<ImportQueuePayload>) {
const { importId } = job.data.payload; const { importId } = job.data.payload;
const record = await db.$primary().import.findUniqueOrThrow({ const record = await db.$primary().import.findUniqueOrThrow({
where: { id: importId }, where: { id: importId },
include: { include: { project: true },
project: true,
},
}); });
const jobLogger = logger.child({ const jobLogger = logger.child({ importId, config: record.config });
importId,
config: record.config,
});
type ValidStep = Exclude<ImportSteps, 'failed' | 'completed'>;
const steps: Record<ValidStep, number> = {
loading: 0,
generating_session_ids: 1,
creating_sessions: 2,
moving: 3,
backfilling_sessions: 4,
};
jobLogger.info('Starting import job'); jobLogger.info('Starting import job');
const providerInstance = createProvider(record, jobLogger); const providerInstance = createProvider(record, jobLogger);
const shouldGenerateSessionIds = providerInstance.shouldGenerateSessionIds();
try { try {
// Check if this is a resume operation const isRetry = record.currentStep !== null;
const isNewImport = record.currentStep === null; const hasReachedProduction =
isRetry && PRODUCTION_STEPS.includes(record.currentStep as string);
if (isNewImport) { // -------------------------------------------------------
await updateImportStatus(jobLogger, job, importId, { // STAGING PHASE: clean slate on failure, run from scratch
step: 'loading', // -------------------------------------------------------
}); if (!hasReachedProduction) {
} else { if (isRetry) {
jobLogger.info('Resuming import from previous state', { jobLogger.info(
currentStep: record.currentStep, 'Retry detected before production phase — cleaning staging data'
currentBatch: record.currentBatch, );
}); await cleanupStagingData(importId);
} }
// Try to get a precomputed total for better progress reporting // Phase 1: Load events into staging
await updateImportStatus(jobLogger, job, importId, { step: 'loading' });
const totalEvents = await providerInstance const totalEvents = await providerInstance
.getTotalEventsCount() .getTotalEventsCount()
.catch(() => -1); .catch(() => -1);
let processedEvents = record.processedEvents; let processedEvents = 0;
const eventBatch: IClickhouseEvent[] = [];
const resumeLoadingFrom = for await (const rawEvent of providerInstance.parseSource()) {
(record.currentStep === 'loading' && record.currentBatch) || undefined;
const resumeGeneratingSessionIdsFrom =
(record.currentStep === 'generating_session_ids' &&
record.currentBatch) ||
undefined;
const resumeCreatingSessionsFrom =
(record.currentStep === 'creating_sessions' && record.currentBatch) ||
undefined;
const resumeMovingFrom =
(record.currentStep === 'moving' && record.currentBatch) || undefined;
const resumeBackfillingSessionsFrom =
(record.currentStep === 'backfilling_sessions' && record.currentBatch) ||
undefined;
// Example:
// shouldRunStep(0) // currStep = 2 (should not run)
// shouldRunStep(1) // currStep = 2 (should not run)
// shouldRunStep(2) // currStep = 2 (should run)
// shouldRunStep(3) // currStep = 2 (should run)
// Decide whether a pipeline step must execute on this run.
// Fresh imports (currentStep === null) run every step; resumed imports
// re-run the step they were interrupted in and everything after it,
// using the numeric ordering defined in the `steps` record.
const shouldRunStep = (step: ValidStep) => {
if (isNewImport) {
return true;
}
const stepToRunIndex = steps[step];
// record.currentStep is non-null here (isNewImport already handled the null case).
const currentStepIndex = steps[record.currentStep as ValidStep];
// `>=` so the interrupted step itself is re-executed rather than skipped.
return stepToRunIndex >= currentStepIndex;
};
/**
 * Iterate the import's staged events one calendar day at a time.
 *
 * Resolves the min/max created_at bounds (optionally resuming from `from`)
 * and invokes `callback(fromDate, toDate)` once per day in the range, with
 * both arguments formatted as ClickHouse date strings.
 *
 * Fix: the previous implementation compared raw timestamps
 * (`while (cursor < end)`), which ran zero iterations when min === max
 * (single-event import) and skipped the final day whenever max's
 * time-of-day was earlier than the cursor's. We now normalize both ends
 * to UTC midnight and compare by calendar day, inclusively.
 * NOTE(review): assumes the bounds and formatClickhouseDate agree on UTC
 * dates — confirm against getImportDateBounds.
 */
async function whileBounds(
  from: string | undefined,
  callback: (from: string, to: string) => Promise<void>,
) {
  const bounds = await getImportDateBounds(importId, from);
  if (bounds.min && bounds.max) {
    const cursor = new Date(bounds.min);
    cursor.setUTCHours(0, 0, 0, 0);
    const lastDay = new Date(bounds.max);
    lastDay.setUTCHours(0, 0, 0, 0);
    while (cursor <= lastDay) {
      const next = new Date(cursor);
      next.setUTCDate(next.getUTCDate() + 1);
      await callback(
        formatClickhouseDate(cursor, true),
        formatClickhouseDate(next, true),
      );
      cursor.setTime(next.getTime());
      // Yield between days so long imports don't starve the event loop.
      await yieldToEventLoop();
    }
  }
}
// Phase 1: Fetch & Transform - Process events in batches
if (shouldRunStep('loading')) {
const eventBatch: any = [];
for await (const rawEvent of providerInstance.parseSource(
resumeLoadingFrom,
)) {
// Validate event
if ( if (
!providerInstance.validate( !providerInstance.validate(
// @ts-expect-error // @ts-expect-error -- provider-specific raw type
rawEvent, rawEvent
) )
) { ) {
jobLogger.warn('Skipping invalid event', { rawEvent }); jobLogger.warn('Skipping invalid event', { rawEvent });
continue; continue;
} }
eventBatch.push(rawEvent); const transformed: IClickhouseEvent = providerInstance.transformEvent(
// @ts-expect-error -- provider-specific raw type
rawEvent
);
// Session IDs for providers that need them (e.g. Mixpanel) are generated
// in generateGapBasedSessionIds after loading, using gap-based logic.
eventBatch.push(transformed);
// Process batch when it reaches the batch size
if (eventBatch.length >= BATCH_SIZE) { if (eventBatch.length >= BATCH_SIZE) {
jobLogger.info('Processing batch', { batchSize: eventBatch.length }); await insertImportBatch(eventBatch, importId);
const transformedEvents: IClickhouseEvent[] = eventBatch.map(
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
processedEvents += eventBatch.length; processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '') const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString() .toISOString()
.split('T')[0]; .split('T')[0];
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'loading', step: 'loading',
batch: createdAt, batch: batchDate,
totalEvents, totalEvents,
processedEvents, processedEvents,
}); });
// Yield control back to event loop after processing each batch eventBatch.length = 0;
await yieldToEventLoop(); await yieldToEventLoop();
} }
} }
// Process remaining events in the last batch
if (eventBatch.length > 0) { if (eventBatch.length > 0) {
const transformedEvents = eventBatch.map( await insertImportBatch(eventBatch, importId);
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
processedEvents += eventBatch.length; processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '') const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString() .toISOString()
.split('T')[0]; .split('T')[0];
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'loading', step: 'loading',
batch: createdAt, batch: batchDate,
totalEvents, totalEvents,
processedEvents, processedEvents,
}); });
eventBatch.length = 0;
// Yield control back to event loop after processing final batch
await yieldToEventLoop();
}
} }
// Phase 2: Generate session IDs if provider requires it jobLogger.info('Loading complete', { processedEvents });
// Phase 1b: Load user profiles (Mixpanel only)
const profileBatchSize = 5000;
if ( if (
shouldRunStep('generating_session_ids') && 'streamProfiles' in providerInstance &&
providerInstance.shouldGenerateSessionIds() typeof (providerInstance as MixpanelProvider).streamProfiles ===
'function'
) { ) {
await whileBounds(resumeGeneratingSessionIdsFrom, async (from) => {
console.log('Generating session IDs', { from });
await generateSessionIds(importId, from);
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'generating_session_ids', step: 'loading_profiles',
batch: from,
}); });
// Yield control back to event loop after processing each day const profileBatch: IClickhouseProfile[] = [];
let processedProfiles = 0;
for await (const rawProfile of (
providerInstance as MixpanelProvider
).streamProfiles()) {
const profile = (
providerInstance as MixpanelProvider
).transformProfile(rawProfile);
profileBatch.push(profile);
if (profileBatch.length >= profileBatchSize) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
});
profileBatch.length = 0;
await yieldToEventLoop(); await yieldToEventLoop();
}); }
}
if (profileBatch.length > 0) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
totalProfiles: processedProfiles,
});
}
jobLogger.info('Profile loading complete', { processedProfiles });
}
// Phase 2: Generate gap-based session IDs (Mixpanel etc.)
if (shouldGenerateSessionIds) {
await updateImportStatus(jobLogger, job, importId, {
step: 'generating_sessions',
});
await generateGapBasedSessionIds(importId);
await yieldToEventLoop();
jobLogger.info('Session ID generation complete'); jobLogger.info('Session ID generation complete');
} }
// Phase 3-5: Process in daily batches for robustness // Phase 3: Create session_start / session_end events
if (shouldRunStep('creating_sessions')) {
await whileBounds(resumeCreatingSessionsFrom, async (from) => {
await createSessionsStartEndEvents(importId, from);
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'creating_sessions', step: 'creating_sessions',
batch: from, batch: 'all sessions',
}); });
await createSessionsStartEndEvents(importId);
// Yield control back to event loop after processing each day
await yieldToEventLoop(); await yieldToEventLoop();
});
jobLogger.info('Session event creation complete');
} }
if (shouldRunStep('moving')) { // -------------------------------------------------------
await whileBounds(resumeMovingFrom, async (from) => { // PRODUCTION PHASE: resume-safe, track progress per batch
await moveImportsToProduction(importId, from); // -------------------------------------------------------
// Phase 3: Move staging events to production (per-day)
const resumeMovingFrom =
hasReachedProduction && record.currentStep === 'moving'
? (record.currentBatch ?? undefined)
: undefined;
// currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
const moveFromDate = (() => {
if (!resumeMovingFrom) return undefined;
const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
next.setUTCDate(next.getUTCDate() + 1);
return next.toISOString().split('T')[0]!;
})();
const bounds = await getImportDateBounds(importId, moveFromDate);
if (bounds.min && bounds.max) {
const startDate = bounds.min.split(' ')[0]!;
const endDate = bounds.max.split(' ')[0]!;
const cursor = new Date(`${startDate}T12:00:00Z`);
const end = new Date(`${endDate}T12:00:00Z`);
while (cursor <= end) {
const dateStr = cursor.toISOString().split('T')[0]!;
await moveImportsToProduction(importId, dateStr);
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'moving', step: 'moving',
batch: from, batch: dateStr,
}); });
// Yield control back to event loop after processing each day
await yieldToEventLoop(); await yieldToEventLoop();
}); cursor.setUTCDate(cursor.getUTCDate() + 1);
}
} }
if (shouldRunStep('backfilling_sessions')) { jobLogger.info('Move to production complete');
await whileBounds(resumeBackfillingSessionsFrom, async (from) => {
await backfillSessionsToProduction(importId, from); // Phase 4: Backfill sessions table
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'backfilling_sessions', step: 'backfilling_sessions',
batch: from, batch: 'all sessions',
}); });
await backfillSessionsToProduction(importId);
// Yield control back to event loop after processing each day
await yieldToEventLoop(); await yieldToEventLoop();
});
}
await markImportComplete(importId); jobLogger.info('Session backfill complete');
await updateImportStatus(jobLogger, job, importId, {
step: 'completed',
});
jobLogger.info('Import marked as complete');
// Get final progress // Done
const finalProgress = await getImportProgress(importId); await updateImportStatus(jobLogger, job, importId, { step: 'completed' });
jobLogger.info('Import completed');
jobLogger.info('Import job completed successfully', { return { success: true };
totalEvents: finalProgress.totalEvents,
insertedEvents: finalProgress.insertedEvents,
status: finalProgress.status,
});
return {
success: true,
totalEvents: finalProgress.totalEvents,
processedEvents: finalProgress.insertedEvents,
};
} catch (error) { } catch (error) {
jobLogger.error('Import job failed', { error }); jobLogger.error('Import job failed', { error });
// Mark import as failed
try { try {
const errorMsg = error instanceof Error ? error.message : 'Unknown error'; const errorMsg = error instanceof Error ? error.message : 'Unknown error';
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'failed', step: 'failed',
errorMessage: errorMsg, errorMessage: errorMsg,
}); });
jobLogger.warn('Import marked as failed', { error: errorMsg });
} catch (markError) { } catch (markError) {
jobLogger.error('Failed to mark import as failed', { error, markError }); jobLogger.error('Failed to mark import as failed', { error, markError });
} }
@@ -320,7 +270,7 @@ export async function importJob(job: Job<ImportQueuePayload>) {
function createProvider( function createProvider(
record: Prisma.ImportGetPayload<{ include: { project: true } }>, record: Prisma.ImportGetPayload<{ include: { project: true } }>,
jobLogger: ILogger, jobLogger: ILogger
) { ) {
const config = record.config; const config = record.config;
switch (config.provider) { switch (config.provider) {

View File

@@ -1,6 +1,5 @@
export * from './src/prisma-client'; export * from './src/prisma-client';
export * from './src/clickhouse/client'; export * from './src/clickhouse/client';
export * from './src/clickhouse/csv';
export * from './src/sql-builder'; export * from './src/sql-builder';
export * from './src/services/chart.service'; export * from './src/services/chart.service';
export * from './src/engine'; export * from './src/engine';

View File

@@ -1,11 +1,9 @@
import { Readable } from 'node:stream';
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client'; import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
import { ClickHouseLogLevel, createClient } from '@clickhouse/client'; import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
import sqlstring from 'sqlstring';
import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config'; import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config';
import { createLogger } from '@openpanel/logger'; import { createLogger } from '@openpanel/logger';
import type { IInterval } from '@openpanel/validation'; import type { IInterval } from '@openpanel/validation';
import sqlstring from 'sqlstring';
export { createClient }; export { createClient };
@@ -68,8 +66,11 @@ export const TABLE_NAMES = {
* Non-clustered mode = self-hosted environments * Non-clustered mode = self-hosted environments
*/ */
export function isClickhouseClustered(): boolean { export function isClickhouseClustered(): boolean {
if (process.env.CLICKHOUSE_CLUSTER === 'true' || process.env.CLICKHOUSE_CLUSTER === '1') { if (
return true process.env.CLICKHOUSE_CLUSTER === 'true' ||
process.env.CLICKHOUSE_CLUSTER === '1'
) {
return true;
} }
return !( return !(
@@ -97,21 +98,21 @@ function getClickhouseSettings(): ClickHouseSettings {
return { return {
distributed_product_mode: 'allow', distributed_product_mode: 'allow',
date_time_input_format: 'best_effort', date_time_input_format: 'best_effort',
...(!process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN ...(process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
? { ? {}
: {
query_plan_convert_any_join_to_semi_or_anti_join: 0, query_plan_convert_any_join_to_semi_or_anti_join: 0,
} }),
: {}),
...additionalSettings, ...additionalSettings,
}; };
} }
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = { export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
max_open_connections: 30, max_open_connections: 30,
request_timeout: 300000, request_timeout: 300_000,
keep_alive: { keep_alive: {
enabled: true, enabled: true,
idle_socket_ttl: 60000, idle_socket_ttl: 60_000,
}, },
compression: { compression: {
request: true, request: true,
@@ -138,7 +139,7 @@ const cleanQuery = (query?: string) =>
export async function withRetry<T>( export async function withRetry<T>(
operation: () => Promise<T>, operation: () => Promise<T>,
maxRetries = 3, maxRetries = 3,
baseDelay = 500, baseDelay = 500
): Promise<T> { ): Promise<T> {
let lastError: Error | undefined; let lastError: Error | undefined;
@@ -162,7 +163,7 @@ export async function withRetry<T>(
`Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`, `Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`,
{ {
error: error.message, error: error.message,
}, }
); );
await new Promise((resolve) => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
continue; continue;
@@ -213,7 +214,7 @@ export const ch = new Proxy(originalCh, {
export async function chQueryWithMeta<T extends Record<string, any>>( export async function chQueryWithMeta<T extends Record<string, any>>(
query: string, query: string,
clickhouseSettings?: ClickHouseSettings, clickhouseSettings?: ClickHouseSettings
): Promise<ResponseJSON<T>> { ): Promise<ResponseJSON<T>> {
const start = Date.now(); const start = Date.now();
const res = await ch.query({ const res = await ch.query({
@@ -249,44 +250,16 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
return response; return response;
} }
/**
 * Bulk-insert pre-formatted CSV rows into a ClickHouse table.
 *
 * The rows are newline-joined and fed to the client as a binary-mode
 * Readable stream (same approach as EventBuffer) instead of one large
 * string, and the insert duration plus row count are logged.
 * Failures are logged and re-thrown to the caller.
 */
export async function chInsertCSV(tableName: string, rows: string[]) {
  try {
    const startedAt = performance.now();
    // objectMode: false — the client expects a byte stream for CSV input.
    const payload = Readable.from(rows.join('\n'), {
      objectMode: false,
    });
    await ch.insert({
      table: tableName,
      values: payload,
      format: 'CSV',
      clickhouse_settings: {
        // Double quotes delimit fields; single quotes are plain data.
        format_csv_allow_double_quotes: 1,
        format_csv_allow_single_quotes: 0,
      },
    });
    logger.info('CSV Insert successful', {
      elapsed: performance.now() - startedAt,
      rows: rows.length,
    });
  } catch (error) {
    logger.error('CSV Insert failed:', error);
    throw error;
  }
}
export async function chQuery<T extends Record<string, any>>( export async function chQuery<T extends Record<string, any>>(
query: string, query: string,
clickhouseSettings?: ClickHouseSettings, clickhouseSettings?: ClickHouseSettings
): Promise<T[]> { ): Promise<T[]> {
return (await chQueryWithMeta<T>(query, clickhouseSettings)).data; return (await chQueryWithMeta<T>(query, clickhouseSettings)).data;
} }
export function formatClickhouseDate( export function formatClickhouseDate(
date: Date | string, date: Date | string,
skipTime = false, skipTime = false
): string { ): string {
if (skipTime) { if (skipTime) {
return new Date(date).toISOString().split('T')[0]!; return new Date(date).toISOString().split('T')[0]!;

View File

@@ -1,53 +0,0 @@
// ClickHouse Map(String, String) cells in CSV use single-quoted pairs:
//   {'key1':'value1','key2':'value2'}
// so embedded single quotes — plus backslashes, newlines, carriage
// returns, tabs, and NUL bytes — must be backslash-escaped before a
// pair is assembled.
const escapeMapValue = (str: string) => {
  const ESCAPES: Record<string, string> = {
    '\\': '\\\\',
    "'": "\\'",
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    '\0': '\\0',
  };
  // Single pass: the character class covers every key in ESCAPES, and
  // replacements are never rescanned, which matches the original
  // escape-backslash-first sequential ordering exactly.
  return str.replace(/[\\'\n\r\t\0]/g, (ch) => ESCAPES[ch] ?? ch);
};
// Serialize a properties object into ClickHouse Map(String, String)
// literal form ({'k':'v',...}) and CSV-escape the resulting cell.
// null/undefined input yields an empty cell; an empty object yields '{}'.
export const csvEscapeJson = (
  value: Record<string, unknown> | null | undefined,
): string => {
  if (value == null) return '';

  // Coerce every value to a string (nullish values become '') since the
  // target column is Map(String, String).
  const entries: Array<[string, string]> = Object.entries(value).map(
    ([key, raw]) => [String(key), raw == null ? '' : String(raw)],
  );

  // Empty map literal needs no quoting, so return it directly.
  if (entries.length === 0) return '{}';

  const mapBody = entries
    .map(([key, val]) => `'${escapeMapValue(key)}':'${escapeMapValue(val)}'`)
    .join(',');

  // csvEscapeField wraps/doubles quotes if the literal itself contains
  // CSV-significant characters (commas, quotes, line breaks).
  return csvEscapeField(`{${mapBody}}`);
};
// CSV field quoting per the CSV convention (RFC 4180): a field containing
// a comma, double quote, or line break is wrapped in double quotes, and
// every embedded double quote is doubled ("").
export const csvEscapeField = (value: string | number): string => {
  const text = `${value}`;
  if (!/[,"\n\r]/.test(text)) {
    // Nothing CSV-significant — emit the field verbatim.
    return text;
  }
  const doubledQuotes = text.replace(/"/g, '""');
  return `"${doubledQuotes}"`;
};

View File

@@ -1,15 +1,14 @@
import { createHash } from 'node:crypto';
import type { ILogger } from '@openpanel/logger'; import type { ILogger } from '@openpanel/logger';
import sqlstring from 'sqlstring';
import { import {
TABLE_NAMES,
ch, ch,
chInsertCSV,
convertClickhouseDateToJs, convertClickhouseDateToJs,
formatClickhouseDate, formatClickhouseDate,
getReplicatedTableName, getReplicatedTableName,
TABLE_NAMES,
} from '../clickhouse/client'; } from '../clickhouse/client';
import { csvEscapeField, csvEscapeJson } from '../clickhouse/csv'; import { db, type Prisma } from '../prisma-client';
import { type Prisma, db } from '../prisma-client'; import type { IClickhouseProfile } from './profile.service';
import type { IClickhouseEvent } from './event.service'; import type { IClickhouseEvent } from './event.service';
export interface ImportStageResult { export interface ImportStageResult {
@@ -18,11 +17,89 @@ export interface ImportStageResult {
insertedEvents: number; insertedEvents: number;
} }
export interface ImportProgress { const SESSION_GAP_MS = 30 * 60 * 1000; // 30 minutes
importId: string;
totalEvents: number; /**
insertedEvents: number; * Generate gap-based session IDs for events that have none.
status: 'pending' | 'processing' | 'processed' | 'failed'; * Streams events from staging (sorted by device_id, created_at), assigns a new
* session when gap > 30 min, re-inserts with session_id, then deletes old rows.
*/
/**
 * Generate gap-based session IDs for staged import events that have none.
 *
 * Streams staging rows (ordered by device_id, created_at), opens a new
 * session whenever the gap between consecutive events of the same device
 * exceeds SESSION_GAP_MS, re-inserts the rows with the computed
 * session_id, and finally deletes the original session-less rows with an
 * ALTER TABLE ... DELETE mutation.
 *
 * Server-side events (device = 'server') are excluded and keep no session.
 */
export async function generateGapBasedSessionIds(
importId: string
): Promise<void> {
// Per-device rolling state; reset whenever device_id changes in the stream.
let currentDeviceId = '';
let currentSessionId = '';
let currentLastTime = 0;
let currentCounter = -1;
const BATCH_SIZE = 5000;
const batch: IClickhouseEvent[] = [];
// Stream rather than buffer: an import can hold millions of rows.
const result = await ch.query({
query: `
SELECT id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
session_id, path, origin, referrer, referrer_name, referrer_type,
duration, properties, created_at, country, city, region,
longitude, latitude, os, os_version, browser, browser_version,
device, brand, model, imported_at
FROM ${TABLE_NAMES.events_imports}
WHERE import_id = {importId:String}
AND session_id = ''
AND device != 'server'
ORDER BY device_id, created_at
`,
query_params: { importId },
format: 'JSONEachRow',
});
const stream = result.stream();
// The ClickHouse client stream yields chunks, each an array of Row objects.
for await (const rows of stream) {
for (const row of rows) {
const event = row.json() as IClickhouseEvent;
const time = new Date(event.created_at).getTime();
if (event.device_id !== currentDeviceId) {
// New device: forget the previous device's session state.
currentDeviceId = event.device_id;
currentSessionId = '';
currentLastTime = 0;
currentCounter = -1;
}
if (!currentSessionId || time - currentLastTime > SESSION_GAP_MS) {
// Gap exceeded (or first event for this device): open a new session.
// Deterministic id — md5(device_id + per-device session ordinal) — so a
// clean re-run over the same ordered data yields the same session ids.
currentCounter++;
currentSessionId = createHash('md5')
.update(`${event.device_id}-${currentCounter}`)
.digest('hex')
.toLowerCase();
}
currentLastTime = time;
event.session_id = currentSessionId;
batch.push(event);
if (batch.length >= BATCH_SIZE) {
await insertImportBatch(batch, importId);
batch.length = 0;
}
}
}
// Flush the final partial batch.
if (batch.length > 0) {
await insertImportBatch(batch, importId);
}
// Delete the original session-less rows. Mutations must target the
// replicated (local) table in clustered mode.
// NOTE(review): if the job dies between the inserts above and this DELETE,
// staging briefly holds both copies — presumably cleanupStagingData on
// retry resets this; confirm against the importJob retry path.
const mutationTable = getReplicatedTableName(TABLE_NAMES.events_imports);
await ch.command({
query: `ALTER TABLE ${mutationTable} DELETE
WHERE import_id = {importId:String}
AND session_id = ''
AND device != 'server'`,
query_params: { importId },
clickhouse_settings: {
wait_end_of_query: 1,
// Wait for the mutation to complete on all replicas before continuing.
mutations_sync: '2',
send_progress_in_http_headers: 1,
http_headers_progress_interval_ms: '50000',
},
});
} }
/** /**
@@ -30,55 +107,26 @@ export interface ImportProgress {
*/ */
export async function insertImportBatch( export async function insertImportBatch(
events: IClickhouseEvent[], events: IClickhouseEvent[],
importId: string, importId: string
): Promise<ImportStageResult> { ): Promise<ImportStageResult> {
if (events.length === 0) { if (events.length === 0) {
return { importId, totalEvents: 0, insertedEvents: 0 }; return { importId, totalEvents: 0, insertedEvents: 0 };
} }
// Important to have same order as events_imports table const now = formatClickhouseDate(new Date());
// CSV format: properly quotes fields that need it const rows = events.map((event) => ({
const csvRows = events.map((event) => { ...event,
// Properties need to be converted to JSON for Map(String, String) import_id: importId,
// All fields must be CSV-escaped when joining with commas import_status: 'pending',
const fields = [ imported_at: event.imported_at || now,
csvEscapeField(event.id || ''), imported_at_meta: now,
csvEscapeField(event.name), }));
csvEscapeField(event.sdk_name || ''),
csvEscapeField(event.sdk_version || ''),
csvEscapeField(event.device_id || ''),
csvEscapeField(event.profile_id || ''),
csvEscapeField(event.project_id || ''),
csvEscapeField(event.session_id || ''),
csvEscapeField(event.path),
csvEscapeField(event.origin || ''),
csvEscapeField(event.referrer || ''),
csvEscapeField(event.referrer_name || ''),
csvEscapeField(event.referrer_type || ''),
csvEscapeField(event.duration ?? 0),
csvEscapeJson(event.properties),
csvEscapeField(event.created_at),
csvEscapeField(event.country || ''),
csvEscapeField(event.city || ''),
csvEscapeField(event.region || ''),
csvEscapeField(event.longitude != null ? event.longitude : '\\N'),
csvEscapeField(event.latitude != null ? event.latitude : '\\N'),
csvEscapeField(event.os || ''),
csvEscapeField(event.os_version || ''),
csvEscapeField(event.browser || ''),
csvEscapeField(event.browser_version || ''),
csvEscapeField(event.device || ''),
csvEscapeField(event.brand || ''),
csvEscapeField(event.model || ''),
csvEscapeField('\\N'), // imported_at (Nullable)
csvEscapeField(importId),
csvEscapeField('pending'), // import_status
csvEscapeField(formatClickhouseDate(new Date())), // imported_at_meta (DateTime, not DateTime64, so no milliseconds)
];
return fields.join(',');
});
await chInsertCSV(TABLE_NAMES.events_imports, csvRows); await ch.insert({
table: TABLE_NAMES.events_imports,
values: rows,
format: 'JSONEachRow',
});
return { return {
importId, importId,
@@ -88,44 +136,86 @@ export async function insertImportBatch(
} }
/** /**
* Generate deterministic session IDs for events that don't have them * Insert a batch of profiles into the production profiles table.
* Uses 30-minute time windows to create consistent session IDs across imports * Used by Mixpanel (and other providers) to import user profiles during an import job.
* Only processes events where device != 'server' and session_id = ''
*/ */
export async function generateSessionIds( export async function insertProfilesBatch(
importId: string, profiles: IClickhouseProfile[],
from: string, projectId: string
): Promise<void> { ): Promise<{ inserted: number }> {
const rangeWhere = [ if (profiles.length === 0) {
'import_id = {importId:String}', return { inserted: 0 };
"import_status = 'pending'", }
"device != 'server'",
"session_id = ''",
from ? 'toDate(created_at) = {from:String}' : '',
]
.filter(Boolean)
.join(' AND ');
// Use SQL to generate deterministic session IDs based on device_id + 30-min time windows const normalized = profiles.map((p) => ({
// This ensures same events always get same session IDs regardless of import order id: p.id,
// In clustered mode, we must use the replicated table for mutations project_id: projectId,
first_name: p.first_name ?? '',
last_name: p.last_name ?? '',
email: p.email ?? '',
avatar: p.avatar ?? '',
is_external: p.is_external ?? true,
properties: Object.fromEntries(
Object.entries(p.properties || {}).filter(
(kv): kv is [string, string] => kv[1] != null && kv[1] !== ''
)
) as Record<string, string>,
created_at: p.created_at,
}));
await ch.insert({
table: TABLE_NAMES.profiles,
values: normalized,
format: 'JSONEachRow',
});
return { inserted: normalized.length };
}
/**
* Delete all staging data for an import. Used to get a clean slate on retry
* when the failure happened before moving data to production.
*/
export async function cleanupStagingData(importId: string): Promise<void> {
const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports); const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports);
const updateQuery = `
ALTER TABLE ${mutationTableName}
UPDATE session_id = lower(hex(MD5(concat(
device_id,
'-',
toString(toInt64(toUnixTimestamp(created_at) / 1800))
))))
WHERE ${rangeWhere}
`;
await ch.command({ await ch.command({
query: updateQuery, query: `ALTER TABLE ${mutationTableName} DELETE WHERE import_id = {importId:String}`,
query_params: { importId, from }, query_params: { importId },
clickhouse_settings: { clickhouse_settings: {
wait_end_of_query: 1, wait_end_of_query: 1,
mutations_sync: '2', // Wait for mutation to complete on all replicas (critical!) mutations_sync: '2',
send_progress_in_http_headers: 1, send_progress_in_http_headers: 1,
http_headers_progress_interval_ms: '50000', http_headers_progress_interval_ms: '50000',
}, },
@@ -133,46 +223,69 @@ export async function generateSessionIds(
} }
/** /**
* Reconstruct sessions using SQL-based logic * Reconstruct sessions across ALL dates for the import.
* This identifies session boundaries and creates session_start/session_end events * Each session_id gets exactly one session_start and one session_end,
* session_start inherits all properties from the first event in the session * even if the session spans midnight.
* session_end inherits all properties from the last event in the session and calculates duration *
* Batches by fetching distinct session_ids first, then running the
* heavy aggregation only for that batch of IDs.
*/ */
export async function createSessionsStartEndEvents( export async function createSessionsStartEndEvents(
importId: string, importId: string
from: string,
): Promise<void> { ): Promise<void> {
// First, let's identify session boundaries and get first/last events for each session const SESSION_BATCH_SIZE = 5000;
const rangeWhere = [ let lastSessionId = '';
'import_id = {importId:String}',
"import_status = 'pending'", const baseWhere = [
"session_id != ''", // Only process events that have session IDs 'import_id = {importId:String}',
'toDate(created_at) = {from:String}', "session_id != ''",
] "name NOT IN ('session_start', 'session_end')",
.filter(Boolean) ].join(' AND ');
.join(' AND ');
while (true) {
const idsResult = await ch.query({
query: `
SELECT DISTINCT session_id
FROM ${TABLE_NAMES.events_imports}
WHERE ${baseWhere}
AND session_id > {lastSessionId:String}
ORDER BY session_id
LIMIT {limit:UInt32}
`,
query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
format: 'JSONEachRow',
});
const idRows = (await idsResult.json()) as Array<{ session_id: string }>;
if (idRows.length === 0) {
break;
}
const sessionIds = idRows.map((r) => r.session_id);
// Use window functions to efficiently get first event (all fields) and last event (only changing fields)
// session_end only needs: properties, path, origin, created_at - the rest can be inherited from session_start
const sessionEventsQuery = ` const sessionEventsQuery = `
SELECT SELECT
device_id, device_id,
session_id, session_id,
project_id, project_id,
profile_id, if(
any(nullIf(profile_id, device_id)) IS NULL,
any(profile_id),
any(nullIf(profile_id, device_id))
) AS profile_id,
argMin((path, origin, referrer, referrer_name, referrer_type, properties, created_at, country, city, region, longitude, latitude, os, os_version, browser, browser_version, device, brand, model), created_at) AS first_event, argMin((path, origin, referrer, referrer_name, referrer_type, properties, created_at, country, city, region, longitude, latitude, os, os_version, browser, browser_version, device, brand, model), created_at) AS first_event,
argMax((path, origin, properties, created_at), created_at) AS last_event_fields, argMax((path, origin, properties, created_at), created_at) AS last_event_fields,
min(created_at) AS first_timestamp, min(created_at) AS first_timestamp,
max(created_at) AS last_timestamp max(created_at) AS last_timestamp
FROM ${TABLE_NAMES.events_imports} FROM ${TABLE_NAMES.events_imports}
WHERE ${rangeWhere} WHERE ${baseWhere}
AND name NOT IN ('session_start', 'session_end') AND session_id IN ({sessionIds:Array(String)})
GROUP BY session_id, device_id, project_id, profile_id GROUP BY session_id, device_id, project_id
`; `;
const sessionEventsResult = await ch.query({ const sessionEventsResult = await ch.query({
query: sessionEventsQuery, query: sessionEventsQuery,
query_params: { importId, from }, query_params: { importId, sessionIds },
format: 'JSONEachRow', format: 'JSONEachRow',
}); });
@@ -182,14 +295,11 @@ export async function createSessionsStartEndEvents(
project_id: string; project_id: string;
profile_id: string; profile_id: string;
first_event: [ first_event: [
// string, // id
// string, // name
string, // path string, // path
string, // origin string, // origin
string, // referrer string, // referrer
string, // referrer_name string, // referrer_name
string, // referrer_type string, // referrer_type
// number, // duration
Record<string, unknown>, // properties Record<string, unknown>, // properties
string, // created_at string, // created_at
string, // country string, // country
@@ -204,9 +314,6 @@ export async function createSessionsStartEndEvents(
string, // device string, // device
string, // brand string, // brand
string, // model string, // model
// string, // sdk_name
// string, // sdk_version
// string, // imported_at
]; ];
last_event_fields: [ last_event_fields: [
string, // path string, // path
@@ -218,22 +325,23 @@ export async function createSessionsStartEndEvents(
last_timestamp: string; last_timestamp: string;
}>; }>;
// Create session_start and session_end events
const sessionEvents: IClickhouseEvent[] = []; const sessionEvents: IClickhouseEvent[] = [];
const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
const date = convertClickhouseDateToJs(timestamp);
date.setTime(date.getTime() + offsetMs);
return formatClickhouseDate(date);
};
for (const session of sessionData) { for (const session of sessionData) {
// Destructure first event tuple (all fields)
const [ const [
// firstId,
// firstName,
firstPath, firstPath,
firstOrigin, firstOrigin,
firstReferrer, firstReferrer,
firstReferrerName, firstReferrerName,
firstReferrerType, firstReferrerType,
// firstDuration,
firstProperties, firstProperties,
firstCreatedAt, _firstCreatedAt,
firstCountry, firstCountry,
firstCity, firstCity,
firstRegion, firstRegion,
@@ -246,31 +354,15 @@ export async function createSessionsStartEndEvents(
firstDevice, firstDevice,
firstBrand, firstBrand,
firstModel, firstModel,
// firstSdkName,
// firstSdkVersion,
// firstImportedAt,
] = session.first_event; ] = session.first_event;
// Destructure last event fields (only the changing ones) const [lastPath, lastOrigin, lastProperties, _lastCreatedAt] =
const [lastPath, lastOrigin, lastProperties, lastCreatedAt] =
session.last_event_fields; session.last_event_fields;
// Calculate duration in milliseconds
// Parse timestamps as Date objects to calculate duration
const firstTime = new Date(session.first_timestamp).getTime(); const firstTime = new Date(session.first_timestamp).getTime();
const lastTime = new Date(session.last_timestamp).getTime(); const lastTime = new Date(session.last_timestamp).getTime();
const durationMs = Math.max(0, lastTime - firstTime); // Ensure non-negative duration const durationMs = Math.max(0, lastTime - firstTime);
// Helper function to adjust timestamp by milliseconds without timezone conversion
const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
// Parse the timestamp, adjust it, and format back to ClickHouse format
const date = convertClickhouseDateToJs(timestamp);
date.setTime(date.getTime() + offsetMs);
return formatClickhouseDate(date);
};
// Create session_start event - inherit everything from first event but change name
// Set created_at to 1 second before the first event
sessionEvents.push({ sessionEvents.push({
id: crypto.randomUUID(), id: crypto.randomUUID(),
name: 'session_start', name: 'session_start',
@@ -283,12 +375,12 @@ export async function createSessionsStartEndEvents(
referrer: firstReferrer, referrer: firstReferrer,
referrer_name: firstReferrerName, referrer_name: firstReferrerName,
referrer_type: firstReferrerType, referrer_type: firstReferrerType,
duration: 0, // session_start always has 0 duration duration: 0,
properties: firstProperties as Record< properties: firstProperties as Record<
string, string,
string | number | boolean | null | undefined string | number | boolean | null | undefined
>, >,
created_at: adjustTimestamp(session.first_timestamp, -1000), // 1 second before first event created_at: adjustTimestamp(session.first_timestamp, -1000),
country: firstCountry, country: firstCountry,
city: firstCity, city: firstCity,
region: firstRegion, region: firstRegion,
@@ -306,8 +398,6 @@ export async function createSessionsStartEndEvents(
sdk_version: '1.0.0', sdk_version: '1.0.0',
}); });
// Create session_end event - inherit most from session_start, but use last event's path, origin, properties
// Set created_at to 1 second after the last event
sessionEvents.push({ sessionEvents.push({
id: crypto.randomUUID(), id: crypto.randomUUID(),
name: 'session_end', name: 'session_end',
@@ -315,133 +405,74 @@ export async function createSessionsStartEndEvents(
profile_id: session.profile_id, profile_id: session.profile_id,
project_id: session.project_id, project_id: session.project_id,
session_id: session.session_id, session_id: session.session_id,
path: lastPath, // From last event path: lastPath,
origin: lastOrigin, // From last event origin: lastOrigin,
referrer: firstReferrer, // Same as session_start referrer: firstReferrer,
referrer_name: firstReferrerName, // Same as session_start referrer_name: firstReferrerName,
referrer_type: firstReferrerType, // Same as session_start referrer_type: firstReferrerType,
duration: durationMs, duration: durationMs,
properties: lastProperties as Record< properties: lastProperties as Record<
string, string,
string | number | boolean | null | undefined string | number | boolean | null | undefined
>, // From last event >,
created_at: adjustTimestamp(session.last_timestamp, 500), // 1 second after last event created_at: adjustTimestamp(session.last_timestamp, 1000),
country: firstCountry, // Same as session_start country: firstCountry,
city: firstCity, // Same as session_start city: firstCity,
region: firstRegion, // Same as session_start region: firstRegion,
longitude: firstLongitude, // Same as session_start longitude: firstLongitude,
latitude: firstLatitude, // Same as session_start latitude: firstLatitude,
os: firstOs, // Same as session_start os: firstOs,
os_version: firstOsVersion, // Same as session_start os_version: firstOsVersion,
browser: firstBrowser, // Same as session_start browser: firstBrowser,
browser_version: firstBrowserVersion, // Same as session_start browser_version: firstBrowserVersion,
device: firstDevice, // Same as session_start device: firstDevice,
brand: firstBrand, // Same as session_start brand: firstBrand,
model: firstModel, // Same as session_start model: firstModel,
imported_at: new Date().toISOString(), imported_at: new Date().toISOString(),
sdk_name: 'import-session-reconstruction', sdk_name: 'import-session-reconstruction',
sdk_version: '1.0.0', sdk_version: '1.0.0',
}); });
} }
// Insert session events into imports table
if (sessionEvents.length > 0) { if (sessionEvents.length > 0) {
await insertImportBatch(sessionEvents, importId); await insertImportBatch(sessionEvents, importId);
} }
lastSessionId = idRows[idRows.length - 1]!.session_id;
if (idRows.length < SESSION_BATCH_SIZE) {
break;
}
}
} }
/** /**
* Migrate all events from imports table to production events table * Move events from staging to production events table.
* This includes both original events and generated session events * Batched per-day using a simple date filter.
*/ */
export async function moveImportsToProduction( export async function moveImportsToProduction(
importId: string, importId: string,
from: string, from: string
): Promise<void> { ): Promise<void> {
// Build the WHERE clause for migration
// For session events (session_start/session_end), we don't filter by their created_at
// because they're created with adjusted timestamps (±1 second) that might fall outside
// the date range. Instead, we include them if their session_id has events in this range.
let whereClause = 'import_id = {importId:String}'; let whereClause = 'import_id = {importId:String}';
if (from) { if (from) {
whereClause += ` AND ( whereClause += ' AND toDate(created_at) = {from:String}';
(toDate(created_at) = {from:String}) OR
(
name IN ('session_start', 'session_end') AND
session_id IN (
SELECT DISTINCT session_id
FROM ${TABLE_NAMES.events_imports}
WHERE import_id = {importId:String}
AND toDate(created_at) = {from:String}
AND name NOT IN ('session_start', 'session_end')
)
)
)`;
} }
const migrationQuery = ` const migrationQuery = `
INSERT INTO ${TABLE_NAMES.events} ( INSERT INTO ${TABLE_NAMES.events} (
id, id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
name, session_id, path, origin, referrer, referrer_name, referrer_type,
sdk_name, duration, properties, created_at, country, city, region,
sdk_version, longitude, latitude, os, os_version, browser, browser_version,
device_id, device, brand, model, imported_at
profile_id,
project_id,
session_id,
path,
origin,
referrer,
referrer_name,
referrer_type,
duration,
properties,
created_at,
country,
city,
region,
longitude,
latitude,
os,
os_version,
browser,
browser_version,
device,
brand,
model,
imported_at
) )
SELECT SELECT
id, id, name, sdk_name, sdk_version, device_id, profile_id, project_id,
name, session_id, path, origin, referrer, referrer_name, referrer_type,
sdk_name, duration, properties, created_at, country, city, region,
sdk_version, longitude, latitude, os, os_version, browser, browser_version,
device_id, device, brand, model, imported_at
profile_id,
project_id,
session_id,
path,
origin,
referrer,
referrer_name,
referrer_type,
duration,
properties,
created_at,
country,
city,
region,
longitude,
latitude,
os,
os_version,
browser,
browser_version,
device,
brand,
model,
imported_at
FROM ${TABLE_NAMES.events_imports} FROM ${TABLE_NAMES.events_imports}
WHERE ${whereClause} WHERE ${whereClause}
ORDER BY created_at ASC ORDER BY created_at ASC
@@ -452,60 +483,54 @@ export async function moveImportsToProduction(
query_params: { importId, from }, query_params: { importId, from },
clickhouse_settings: { clickhouse_settings: {
wait_end_of_query: 1, wait_end_of_query: 1,
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
send_progress_in_http_headers: 1, send_progress_in_http_headers: 1,
// The interval of sending these progress headers. Here it is less than 60s,
http_headers_progress_interval_ms: '50000', http_headers_progress_interval_ms: '50000',
}, },
}); });
} }
/**
* Aggregate sessions from staging into the sessions table.
* Runs across all dates so cross-midnight sessions become one row.
* Batches by session_ids to bound ClickHouse memory.
*/
export async function backfillSessionsToProduction( export async function backfillSessionsToProduction(
importId: string, importId: string
from: string,
): Promise<void> { ): Promise<void> {
// After migrating events, populate the sessions table based on the migrated sessions const SESSION_BATCH_SIZE = 5000;
// We detect all session_ids involved in this import from the imports table, let lastSessionId = '';
// then aggregate over the production events to construct session rows.
while (true) {
const idsResult = await ch.query({
query: `
SELECT DISTINCT session_id
FROM ${TABLE_NAMES.events_imports}
WHERE import_id = {importId:String}
AND session_id > {lastSessionId:String}
ORDER BY session_id
LIMIT {limit:UInt32}
`,
query_params: { importId, lastSessionId, limit: SESSION_BATCH_SIZE },
format: 'JSONEachRow',
});
const idRows = (await idsResult.json()) as Array<{ session_id: string }>;
if (idRows.length === 0) {
break;
}
const sessionIds = idRows.map((r) => r.session_id);
const sessionsInsertQuery = ` const sessionsInsertQuery = `
INSERT INTO ${TABLE_NAMES.sessions} ( INSERT INTO ${TABLE_NAMES.sessions} (
id, id, project_id, profile_id, device_id, created_at, ended_at,
project_id, is_bounce, entry_origin, entry_path, exit_origin, exit_path,
profile_id, screen_view_count, revenue, event_count, duration,
device_id, country, region, city, longitude, latitude,
created_at, device, brand, model, browser, browser_version, os, os_version,
ended_at, sign, version,
is_bounce, utm_medium, utm_source, utm_campaign, utm_content, utm_term,
entry_origin, referrer, referrer_name, referrer_type
entry_path,
exit_origin,
exit_path,
screen_view_count,
revenue,
event_count,
duration,
country,
region,
city,
longitude,
latitude,
device,
brand,
model,
browser,
browser_version,
os,
os_version,
sign,
version,
utm_medium,
utm_source,
utm_campaign,
utm_content,
utm_term,
referrer,
referrer_name,
referrer_type
) )
SELECT SELECT
any(e.session_id) as id, any(e.session_id) as id,
@@ -551,128 +576,34 @@ export async function backfillSessionsToProduction(
argMinIf(e.referrer_type, e.created_at, e.name = 'session_start') as referrer_type argMinIf(e.referrer_type, e.created_at, e.name = 'session_start') as referrer_type
FROM ${TABLE_NAMES.events_imports} e FROM ${TABLE_NAMES.events_imports} e
WHERE WHERE
e.import_id = ${sqlstring.escape(importId)} e.import_id = {importId:String}
AND e.session_id != '' AND e.session_id IN ({sessionIds:Array(String)})
AND (
(toDate(e.created_at) = ${sqlstring.escape(from)}) OR
(
e.name IN ('session_start', 'session_end') AND
e.session_id IN (
SELECT DISTINCT session_id
FROM ${TABLE_NAMES.events_imports}
WHERE import_id = ${sqlstring.escape(importId)}
AND toDate(created_at) = ${sqlstring.escape(from)}
AND name NOT IN ('session_start', 'session_end')
)
)
)
GROUP BY e.session_id GROUP BY e.session_id
`; `;
await ch.command({ await ch.command({
query: sessionsInsertQuery, query: sessionsInsertQuery,
query_params: { importId, sessionIds },
clickhouse_settings: { clickhouse_settings: {
wait_end_of_query: 1, wait_end_of_query: 1,
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
send_progress_in_http_headers: 1, send_progress_in_http_headers: 1,
// The interval of sending these progress headers. Here it is less than 60s,
http_headers_progress_interval_ms: '50000', http_headers_progress_interval_ms: '50000',
}, },
}); });
lastSessionId = idRows[idRows.length - 1]!.session_id;
if (idRows.length < SESSION_BATCH_SIZE) {
break;
}
}
} }
/** /**
* Mark import as complete by updating status * Get min/max created_at for an import's staging data.
*/
export async function markImportComplete(importId: string): Promise<void> {
// In clustered mode, we must use the replicated table for mutations
const mutationTableName = getReplicatedTableName(TABLE_NAMES.events_imports);
const updateQuery = `
ALTER TABLE ${mutationTableName}
UPDATE import_status = 'processed'
WHERE import_id = {importId:String}
`;
await ch.command({
query: updateQuery,
query_params: { importId },
clickhouse_settings: {
wait_end_of_query: 1,
mutations_sync: '2', // Wait for mutation to complete
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
send_progress_in_http_headers: 1,
// The interval of sending these progress headers. Here it is less than 60s,
http_headers_progress_interval_ms: '50000',
},
});
}
/**
* Get import progress and status
*/
export async function getImportProgress(
importId: string,
): Promise<ImportProgress> {
const progressQuery = `
SELECT
import_id,
COUNT(*) as total_events,
COUNTIf(import_status = 'pending') as pending_events,
COUNTIf(import_status = 'processed') as processed_events,
any(import_status) as status
FROM ${TABLE_NAMES.events_imports}
WHERE import_id = {importId:String}
AND name NOT IN ('session_start', 'session_end')
GROUP BY import_id
`;
const result = await ch.query({
query: progressQuery,
query_params: { importId },
format: 'JSONEachRow',
});
const data = (await result.json()) as Array<{
import_id: string;
total_events: number;
pending_events: number;
processed_events: number;
status: string;
}>;
if (data.length === 0) {
return {
importId,
totalEvents: 0,
insertedEvents: 0,
status: 'pending',
};
}
const row = data[0];
if (!row) {
return {
importId,
totalEvents: 0,
insertedEvents: 0,
status: 'pending',
};
}
return {
importId,
totalEvents: row.total_events,
insertedEvents: row.processed_events,
status: row.status as 'pending' | 'processing' | 'processed' | 'failed',
};
}
/**
* Utility: get min/max created_at for an import
*/ */
export async function getImportDateBounds( export async function getImportDateBounds(
importId: string, importId: string,
fromCreatedAt?: string, fromCreatedAt?: string
): Promise<{ min: string | null; max: string | null }> { ): Promise<{ min: string | null; max: string | null }> {
const res = await ch.query({ const res = await ch.query({
query: ` query: `
@@ -697,10 +628,6 @@ export async function getImportDateBounds(
: { min: null, max: null }; : { min: null, max: null };
} }
/**
* Unified method to update all import status information
* Combines step, batch, progress, and status message updates
*/
export type UpdateImportStatusOptions = export type UpdateImportStatusOptions =
| { | {
step: 'loading'; step: 'loading';
@@ -709,13 +636,17 @@ export type UpdateImportStatusOptions =
processedEvents?: number; processedEvents?: number;
} }
| { | {
step: 'generating_session_ids'; step: 'loading_profiles';
batch?: string; processedProfiles?: number;
totalProfiles?: number;
} }
| { | {
step: 'creating_sessions'; step: 'creating_sessions';
batch?: string; batch?: string;
} }
| {
step: 'generating_sessions';
}
| { | {
step: 'moving'; step: 'moving';
batch?: string; batch?: string;
@@ -740,7 +671,7 @@ export async function updateImportStatus(
updateProgress: (progress: Record<string, any>) => void; updateProgress: (progress: Record<string, any>) => void;
}, },
importId: string, importId: string,
options: UpdateImportStatusOptions, options: UpdateImportStatusOptions
): Promise<void> { ): Promise<void> {
const data: Prisma.ImportUpdateInput = {}; const data: Prisma.ImportUpdateInput = {};
switch (options.step) { switch (options.step) {
@@ -754,27 +685,35 @@ export async function updateImportStatus(
data.totalEvents = options.totalEvents; data.totalEvents = options.totalEvents;
data.processedEvents = options.processedEvents; data.processedEvents = options.processedEvents;
break; break;
case 'generating_session_ids': case 'loading_profiles':
data.currentStep = 'generating_session_ids'; data.currentStep = 'loading_profiles';
data.currentBatch = options.batch; data.statusMessage =
data.statusMessage = options.batch options.processedProfiles != null && options.totalProfiles != null
? `Generating session IDs for ${options.batch}` ? `Importing user profiles (${options.processedProfiles} / ${options.totalProfiles})`
: 'Generating session IDs...'; : 'Importing user profiles...';
break; break;
case 'creating_sessions': case 'creating_sessions':
data.currentStep = 'creating_sessions'; data.currentStep = 'creating_sessions';
data.currentBatch = options.batch; data.currentBatch = options.batch;
data.statusMessage = `Creating sessions for ${options.batch}`; data.statusMessage = options.batch
? `Creating sessions (${options.batch})`
: 'Creating sessions...';
break;
case 'generating_sessions':
data.currentStep = 'generating_sessions';
data.statusMessage = 'Generating session IDs...';
break; break;
case 'moving': case 'moving':
data.currentStep = 'moving'; data.currentStep = 'moving';
data.currentBatch = options.batch; data.currentBatch = options.batch;
data.statusMessage = `Moving imports to production for ${options.batch}`; data.statusMessage = `Moving events to production (${options.batch})`;
break; break;
case 'backfilling_sessions': case 'backfilling_sessions':
data.currentStep = 'backfilling_sessions'; data.currentStep = 'backfilling_sessions';
data.currentBatch = options.batch; data.currentBatch = options.batch;
data.statusMessage = `Aggregating sessions for ${options.batch}`; data.statusMessage = options.batch
? `Aggregating sessions (${options.batch})`
: 'Aggregating sessions...';
break; break;
case 'completed': case 'completed':
data.status = 'completed'; data.status = 'completed';
@@ -787,6 +726,8 @@ export async function updateImportStatus(
data.statusMessage = 'Import failed'; data.statusMessage = 'Import failed';
data.errorMessage = options.errorMessage; data.errorMessage = options.errorMessage;
break; break;
default:
break;
} }
jobLogger.info('Import status update', data); jobLogger.info('Import status update', data);

View File

@@ -39,7 +39,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: '$mp_web_page_view', event: '$mp_web_page_view',
properties: { properties: {
time: 1746097970, time: 1_746_097_970,
distinct_id: '$device:123', distinct_id: '$device:123',
$browser: 'Chrome', $browser: 'Chrome',
$browser_version: 135, $browser_version: 135,
@@ -53,7 +53,7 @@ describe('mixpanel', () => {
$insert_id: 'source_id', $insert_id: 'source_id',
$lib_version: '2.60.0', $lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com', $mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1746078175363, $mp_api_timestamp_ms: 1_746_078_175_363,
$mp_autocapture: true, $mp_autocapture: true,
$os: 'Android', $os: 'Android',
$referrer: 'https://google.com/', $referrer: 'https://google.com/',
@@ -71,7 +71,7 @@ describe('mixpanel', () => {
gclid: 'oqneoqow', gclid: 'oqneoqow',
mp_country_code: 'IN', mp_country_code: 'IN',
mp_lib: 'web', mp_lib: 'web',
mp_processing_time_ms: 1746078175546, mp_processing_time_ms: 1_746_078_175_546,
mp_sent_by_lib_version: '2.60.0', mp_sent_by_lib_version: '2.60.0',
utm_medium: 'cpc', utm_medium: 'cpc',
utm_source: 'google', utm_source: 'google',
@@ -101,7 +101,7 @@ describe('mixpanel', () => {
__title: __title:
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2', 'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
}, },
created_at: '2025-05-01T11:12:50.000Z', created_at: '2025-05-01 11:12:50',
country: 'IN', country: 'IN',
city: 'Mumbai', city: 'Mumbai',
region: 'Maharashtra', region: 'Maharashtra',
@@ -110,7 +110,7 @@ describe('mixpanel', () => {
os: 'Android', os: 'Android',
os_version: undefined, os_version: undefined,
browser: 'Chrome', browser: 'Chrome',
browser_version: '', browser_version: '135',
device: 'mobile', device: 'mobile',
brand: '', brand: '',
model: '', model: '',
@@ -141,7 +141,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: 'custom_event', event: 'custom_event',
properties: { properties: {
time: 1746097970, time: 1_746_097_970,
distinct_id: '$device:123', distinct_id: '$device:123',
$device_id: '123', $device_id: '123',
$user_id: 'user123', $user_id: 'user123',
@@ -192,7 +192,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: 'ec_search_error', event: 'ec_search_error',
properties: { properties: {
time: 1759947367, time: 1_759_947_367,
distinct_id: '3385916', distinct_id: '3385916',
$browser: 'Mobile Safari', $browser: 'Mobile Safari',
$browser_version: null, $browser_version: null,
@@ -207,7 +207,7 @@ describe('mixpanel', () => {
$insert_id: 'bclkaepeqcfuzt4v', $insert_id: 'bclkaepeqcfuzt4v',
$lib_version: '2.60.0', $lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com', $mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1759927570699, $mp_api_timestamp_ms: 1_759_927_570_699,
$os: 'iOS', $os: 'iOS',
$region: 'Karnataka', $region: 'Karnataka',
$screen_height: 852, $screen_height: 852,
@@ -225,7 +225,7 @@ describe('mixpanel', () => {
language: 'english', language: 'english',
mp_country_code: 'IN', mp_country_code: 'IN',
mp_lib: 'web', mp_lib: 'web',
mp_processing_time_ms: 1759927592421, mp_processing_time_ms: 1_759_927_592_421,
mp_sent_by_lib_version: '2.60.0', mp_sent_by_lib_version: '2.60.0',
os: 'web', os: 'web',
osVersion: osVersion:
@@ -249,15 +249,15 @@ describe('mixpanel', () => {
expect(res.id.length).toBeGreaterThan(30); expect(res.id.length).toBeGreaterThan(30);
expect(res.imported_at).toMatch( expect(res.imported_at).toMatch(
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
); );
expect(omit(['id', 'imported_at'], res)).toEqual({ expect(omit(['id', 'imported_at'], res)).toEqual({
brand: 'Apple', brand: 'Apple',
browser: 'GSA', browser: 'GSA',
browser_version: 'null', browser_version: '388.0.811331708',
city: 'Bengaluru', city: 'Bengaluru',
country: 'IN', country: 'IN',
created_at: '2025-10-08T18:16:07.000Z', created_at: '2025-10-08 18:16:07',
device: 'mobile', device: 'mobile',
device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c', device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
duration: 0, duration: 0,

View File

@@ -1,8 +1,13 @@
import { randomUUID } from 'node:crypto'; import { randomUUID } from 'node:crypto';
import { isSameDomain, parsePath, toDots } from '@openpanel/common'; import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server'; import {
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server'; getReferrerWithQuery,
import type { IClickhouseEvent } from '@openpanel/db'; parseReferrer,
parseUserAgent,
type UserAgentInfo,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { IClickhouseProfile } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger'; import type { ILogger } from '@openpanel/logger';
import type { IMixpanelImportConfig } from '@openpanel/validation'; import type { IMixpanelImportConfig } from '@openpanel/validation';
import { z } from 'zod'; import { z } from 'zod';
@@ -15,22 +20,88 @@ export const zMixpanelRawEvent = z.object({
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>; export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
/** Engage API profile: https://docs.mixpanel.com/docs/export-methods#exporting-profiles */
export const zMixpanelRawProfile = z.object({
$distinct_id: z.union([z.string(), z.number()]),
$properties: z.record(z.unknown()).optional().default({}),
});
export type MixpanelRawProfile = z.infer<typeof zMixpanelRawProfile>;
class MixpanelRateLimitError extends Error {
readonly retryAfterMs?: number;
constructor(message: string, retryAfterMs?: number) {
super(message);
this.name = 'MixpanelRateLimitError';
this.retryAfterMs = retryAfterMs;
}
}
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> { export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
provider = 'mixpanel'; provider = 'mixpanel';
version = '1.0.0'; version = '1.0.0';
private static readonly MAX_REQUESTS_PER_HOUR = 100;
private static readonly MIN_REQUEST_INTERVAL_MS = 334; // 3 QPS limit
private requestTimestamps: number[] = [];
private lastRequestTime = 0;
constructor( constructor(
private readonly projectId: string, private readonly projectId: string,
private readonly config: IMixpanelImportConfig, private readonly config: IMixpanelImportConfig,
private readonly logger?: ILogger, private readonly logger?: ILogger
) { ) {
super(); super();
} }
async getTotalEventsCount(): Promise<number> { private async waitForRateLimit(): Promise<void> {
const now = Date.now();
const oneHourAgo = now - 60 * 60 * 1000;
// Prune timestamps older than 1 hour
this.requestTimestamps = this.requestTimestamps.filter(
(t) => t > oneHourAgo
);
// Enforce per-second limit (3 QPS → min 334ms gap)
const timeSinceLast = now - this.lastRequestTime;
if (timeSinceLast < MixpanelProvider.MIN_REQUEST_INTERVAL_MS) {
const delay = MixpanelProvider.MIN_REQUEST_INTERVAL_MS - timeSinceLast;
await new Promise((resolve) => setTimeout(resolve, delay));
}
// Enforce hourly limit
if (
this.requestTimestamps.length >= MixpanelProvider.MAX_REQUESTS_PER_HOUR
) {
const oldestInWindow = this.requestTimestamps[0]!;
const waitUntil = oldestInWindow + 60 * 60 * 1000;
const waitMs = waitUntil - Date.now() + 1000; // +1s buffer
if (waitMs > 0) {
this.logger?.info(
`Rate limit: ${this.requestTimestamps.length} requests in the last hour, waiting ${Math.ceil(waitMs / 1000)}s`,
{
requestsInWindow: this.requestTimestamps.length,
waitMs,
}
);
await new Promise((resolve) => setTimeout(resolve, waitMs));
// Prune again after waiting
this.requestTimestamps = this.requestTimestamps.filter(
(t) => t > Date.now() - 60 * 60 * 1000
);
}
}
this.lastRequestTime = Date.now();
this.requestTimestamps.push(Date.now());
}
getTotalEventsCount(): Promise<number> {
// Mixpanel sucks and dont provide a good way to extract total event count within a period // Mixpanel sucks and dont provide a good way to extract total event count within a period
// jql would work but not accurate and will be deprecated end of 2025 // jql would work but not accurate and will be deprecated end of 2025
return -1; return Promise.resolve(-1);
} }
/** /**
@@ -42,13 +113,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
} }
async *parseSource( async *parseSource(
overrideFrom?: string, overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
yield* this.fetchEventsFromMixpanel(overrideFrom); yield* this.fetchEventsFromMixpanel(overrideFrom);
} }
private async *fetchEventsFromMixpanel( private async *fetchEventsFromMixpanel(
overrideFrom?: string, overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const { serviceAccount, serviceSecret, projectId, from, to } = this.config; const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
@@ -58,20 +129,24 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
for (const [chunkFrom, chunkTo] of dateChunks) { for (const [chunkFrom, chunkTo] of dateChunks) {
let retries = 0; let retries = 0;
const maxRetries = 3; const maxRetries = 6;
while (retries <= maxRetries) { while (retries <= maxRetries) {
try { try {
await this.waitForRateLimit();
yield* this.fetchEventsForDateRange( yield* this.fetchEventsForDateRange(
serviceAccount, serviceAccount,
serviceSecret, serviceSecret,
projectId, projectId,
chunkFrom, chunkFrom,
chunkTo, chunkTo
); );
break; // Success, move to next chunk break; // Success, move to next chunk
} catch (error) { } catch (error) {
retries++; retries++;
const isRateLimit =
error instanceof MixpanelRateLimitError ||
(error instanceof Error && error.message.includes('429'));
const isLastRetry = retries > maxRetries; const isLastRetry = retries > maxRetries;
this.logger?.warn('Failed to fetch events for date range', { this.logger?.warn('Failed to fetch events for date range', {
@@ -80,22 +155,31 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
attempt: retries, attempt: retries,
maxRetries, maxRetries,
error: (error as Error).message, error: (error as Error).message,
isRateLimit,
willRetry: !isLastRetry, willRetry: !isLastRetry,
}); });
if (isLastRetry) { if (isLastRetry) {
// Final attempt failed, re-throw
throw new Error( throw new Error(
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`, `Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`
); );
} }
// Exponential backoff: wait before retrying let delay: number;
const delay = Math.min(1000 * 2 ** (retries - 1), 60_000); // Cap at 1 minute if (error instanceof MixpanelRateLimitError && error.retryAfterMs) {
delay = error.retryAfterMs;
} else if (isRateLimit) {
// 5min → 10min → 15min → 15min → 15min = 60min total
delay = Math.min(300_000 * 2 ** (retries - 1), 900_000);
} else {
delay = Math.min(1000 * 2 ** (retries - 1), 60_000);
}
this.logger?.info('Retrying after delay', { this.logger?.info('Retrying after delay', {
delayMs: delay, delayMs: delay,
chunkFrom, chunkFrom,
chunkTo, chunkTo,
isRateLimit,
}); });
await new Promise((resolve) => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
@@ -108,7 +192,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
serviceSecret: string, serviceSecret: string,
projectId: string, projectId: string,
from: string, from: string,
to: string, to: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const url = 'https://data.mixpanel.com/api/2.0/export'; const url = 'https://data.mixpanel.com/api/2.0/export';
@@ -134,9 +218,18 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
}, },
}); });
if (response.status === 429) {
const retryAfter = response.headers.get('Retry-After');
const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
throw new MixpanelRateLimitError(
'Mixpanel rate limit exceeded (429)',
retryAfterMs
);
}
if (!response.ok) { if (!response.ok) {
throw new Error( throw new Error(
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`, `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`
); );
} }
@@ -153,7 +246,9 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
while (true) { while (true) {
const { done, value } = await reader.read(); const { done, value } = await reader.read();
if (done) break; if (done) {
break;
}
buffer += decoder.decode(value, { stream: true }); buffer += decoder.decode(value, { stream: true });
@@ -187,7 +282,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
{ {
line: buffer.substring(0, 100), line: buffer.substring(0, 100),
error, error,
}, }
); );
} }
} }
@@ -196,6 +291,114 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
} }
} }
/**
* Stream user profiles from Mixpanel Engage API.
* Paginates with page/page_size (5k per page) and yields each profile.
*/
async *streamProfiles(): AsyncGenerator<MixpanelRawProfile, void, unknown> {
const { serviceAccount, serviceSecret, projectId } = this.config;
const pageSize = 5000;
let page = 0;
while (true) {
await this.waitForRateLimit();
const url = `https://mixpanel.com/api/query/engage?project_id=${encodeURIComponent(projectId)}`;
const body = new URLSearchParams({
page: String(page),
page_size: String(pageSize),
});
this.logger?.info('Fetching profiles from Mixpanel Engage', {
page,
page_size: pageSize,
projectId,
});
const response = await fetch(url, {
method: 'POST',
headers: {
Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
Accept: 'application/json',
'Content-Type': 'application/x-www-form-urlencoded',
},
body: body.toString(),
});
if (response.status === 429) {
const retryAfter = response.headers.get('Retry-After');
const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
throw new MixpanelRateLimitError(
'Mixpanel rate limit exceeded (429)',
retryAfterMs
);
}
if (!response.ok) {
const text = await response.text();
throw new Error(
`Failed to fetch profiles from Mixpanel: ${response.status} ${response.statusText} - ${text}`
);
}
const data = (await response.json()) as {
results?: Array<{ $distinct_id: string | number; $properties?: Record<string, unknown> }>;
page?: number;
total?: number;
};
const results = data.results ?? [];
for (const row of results) {
const parsed = zMixpanelRawProfile.safeParse(row);
if (parsed.success) {
yield parsed.data;
} else {
this.logger?.warn('Skipping invalid Mixpanel profile', {
row: JSON.stringify(row).slice(0, 200),
});
}
}
if (results.length < pageSize) {
break;
}
page++;
}
}
/**
* Map Mixpanel Engage profile to OpenPanel IClickhouseProfile.
*/
transformProfile(raw: MixpanelRawProfile): IClickhouseProfile {
const parsed = zMixpanelRawProfile.parse(raw);
const props = (parsed.$properties || {}) as Record<string, unknown>;
const id = String(parsed.$distinct_id).replace(/^\$device:/, '');
const createdAt = props.$created
? formatClickhouseDate(new Date(String(props.$created)))
: formatClickhouseDate(new Date());
const properties: Record<string, string> = {};
const stripPrefix = /^\$/;
for (const [key, value] of Object.entries(props)) {
if (stripPrefix.test(key)) continue;
if (value == null) continue;
properties[key] = typeof value === 'object' ? JSON.stringify(value) : String(value);
}
return {
id,
project_id: this.projectId,
first_name: String(props.$first_name ?? ''),
last_name: String(props.$last_name ?? ''),
email: String(props.$email ?? ''),
avatar: String(props.$avatar ?? props.$image ?? ''),
properties,
created_at: createdAt,
is_external: true,
};
}
validate(rawEvent: MixpanelRawEvent): boolean { validate(rawEvent: MixpanelRawEvent): boolean {
const res = zMixpanelRawEvent.safeParse(rawEvent); const res = zMixpanelRawEvent.safeParse(rawEvent);
return res.success; return res.success;
@@ -208,7 +411,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
const deviceId = props.$device_id; const deviceId = props.$device_id;
const profileId = String(props.$user_id || props.distinct_id).replace( const profileId = String(props.$user_id || props.distinct_id).replace(
/^\$device:/, /^\$device:/,
'', ''
); );
// Build full URL from current_url and current_url_search (web only) // Build full URL from current_url and current_url_search (web only)
@@ -309,7 +512,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
project_id: projectId, project_id: projectId,
session_id: '', // Will be generated in SQL after import session_id: '', // Will be generated in SQL after import
properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String) properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
created_at: new Date(props.time * 1000).toISOString(), created_at: formatClickhouseDate(new Date(props.time * 1000)),
country, country,
city, city,
region, region,
@@ -318,10 +521,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
os: uaInfo.os || props.$os, os: uaInfo.os || props.$os,
os_version: uaInfo.osVersion || props.$osVersion, os_version: uaInfo.osVersion || props.$osVersion,
browser: uaInfo.browser || props.$browser, browser: uaInfo.browser || props.$browser,
browser_version: browser_version: uaInfo.browserVersion || String(props.$browser_version ?? ''),
uaInfo.browserVersion || props.$browserVersion
? String(props.$browser_version)
: '',
device: this.getDeviceType(props.mp_lib, uaInfo, props), device: this.getDeviceType(props.mp_lib, uaInfo, props),
brand: uaInfo.brand || '', brand: uaInfo.brand || '',
model: uaInfo.model || '', model: uaInfo.model || '',
@@ -338,14 +538,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
sdk_version: this.version, sdk_version: this.version,
}; };
// TODO: Remove this
// Temporary fix for a client
const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
if (isMightBeScreenView && event.name === 'Loaded a Screen') {
event.name = 'screen_view';
event.path = isMightBeScreenView;
}
// TODO: Remove this // TODO: Remove this
// This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects) // This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
if (props.utm_source && !properties.__query?.utm_source) { if (props.utm_source && !properties.__query?.utm_source) {
@@ -371,13 +563,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
private getDeviceType( private getDeviceType(
mp_lib: string, mp_lib: string,
uaInfo: UserAgentInfo, uaInfo: UserAgentInfo,
props: Record<string, any>, props: Record<string, any>
) { ) {
// Normalize lib/os/browser data // Normalize lib/os/browser data
const lib = (mp_lib || '').toLowerCase(); const lib = (mp_lib || '').toLowerCase();
const os = String(props.$os || uaInfo.os || '').toLowerCase(); const os = String(props.$os || uaInfo.os || '').toLowerCase();
const browser = String( const browser = String(
props.$browser || uaInfo.browser || '', props.$browser || uaInfo.browser || ''
).toLowerCase(); ).toLowerCase();
const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad'; const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
@@ -431,11 +623,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return !this.isWebEvent(mp_lib); return !this.isWebEvent(mp_lib);
} }
private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
const props = rawEvent.properties as Record<string, any>;
return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
}
private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo { private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
// For mobile events, extract device information from Mixpanel properties // For mobile events, extract device information from Mixpanel properties
const os = props.$os || props.os || ''; const os = props.$os || props.os || '';
@@ -446,19 +633,19 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return { return {
isServer: true, isServer: true,
os: os, os,
osVersion: osVersion, osVersion,
browser: '', browser: '',
browserVersion: '', browserVersion: '',
device: device, device,
brand: brand, brand,
model: model, model,
}; };
} }
private stripMixpanelProperties( private stripMixpanelProperties(
properties: Record<string, any>, properties: Record<string, any>,
searchParams: Record<string, string>, searchParams: Record<string, string>
): Record<string, any> { ): Record<string, any> {
const strip = [ const strip = [
'time', 'time',
@@ -472,8 +659,8 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
]; ];
const filtered = Object.fromEntries( const filtered = Object.fromEntries(
Object.entries(properties).filter( Object.entries(properties).filter(
([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key), ([key]) => !(key.match(/^(\$|mp_|utm_)/) || strip.includes(key))
), )
); );
// Parse JSON strings back to objects/arrays so toDots() can flatten them // Parse JSON strings back to objects/arrays so toDots() can flatten them

View File

@@ -2,10 +2,13 @@ import { randomUUID } from 'node:crypto';
import { Readable } from 'node:stream'; import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises'; import { pipeline } from 'node:stream/promises';
import { createBrotliDecompress, createGunzip } from 'node:zlib'; import { createBrotliDecompress, createGunzip } from 'node:zlib';
import { isSameDomain, parsePath } from '@openpanel/common'; import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { generateDeviceId } from '@openpanel/common/server'; import {
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server'; generateDeviceId,
import type { IClickhouseEvent } from '@openpanel/db'; getReferrerWithQuery,
parseReferrer,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger'; import type { ILogger } from '@openpanel/logger';
import type { IUmamiImportConfig } from '@openpanel/validation'; import type { IUmamiImportConfig } from '@openpanel/validation';
import { parse } from 'csv-parse'; import { parse } from 'csv-parse';
@@ -63,7 +66,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
constructor( constructor(
private readonly projectId: string, private readonly projectId: string,
private readonly config: IUmamiImportConfig, private readonly config: IUmamiImportConfig,
private readonly logger?: ILogger, private readonly logger?: ILogger
) { ) {
super(); super();
} }
@@ -82,7 +85,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
signal?: AbortSignal; signal?: AbortSignal;
maxBytes?: number; maxBytes?: number;
maxRows?: number; maxRows?: number;
} = {}, } = {}
): AsyncGenerator<UmamiRawEvent, void, unknown> { ): AsyncGenerator<UmamiRawEvent, void, unknown> {
const { signal, maxBytes, maxRows } = opts; const { signal, maxBytes, maxRows } = opts;
const controller = new AbortController(); const controller = new AbortController();
@@ -95,9 +98,9 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
const res = await fetch(url, { signal: controller.signal }); const res = await fetch(url, { signal: controller.signal });
if (!res.ok || !res.body) { if (!(res.ok && res.body)) {
throw new Error( throw new Error(
`Failed to fetch remote file: ${res.status} ${res.statusText}`, `Failed to fetch remote file: ${res.status} ${res.statusText}`
); );
} }
@@ -108,15 +111,15 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if ( if (
contentType && contentType &&
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test( !/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
contentType, contentType
) )
) { ) {
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`); this.logger?.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
} }
if (maxBytes && contentLen && contentLen > maxBytes) { if (maxBytes && contentLen && contentLen > maxBytes) {
throw new Error( throw new Error(
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`, `Remote file exceeds size limit (${contentLen} > ${maxBytes})`
); );
} }
@@ -137,9 +140,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if (seenBytes > maxBytes) { if (seenBytes > maxBytes) {
controller.abort(); controller.abort();
body.destroy( body.destroy(
new Error( new Error(`Stream exceeded size limit (${seenBytes} > ${maxBytes})`)
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
),
); );
} }
}); });
@@ -190,7 +191,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
throw new Error( throw new Error(
`Failed to parse remote file from ${url}: ${ `Failed to parse remote file from ${url}: ${
err instanceof Error ? err.message : String(err) err instanceof Error ? err.message : String(err)
}`, }`
); );
} finally { } finally {
controller.abort(); // ensure fetch stream is torn down controller.abort(); // ensure fetch stream is torn down
@@ -205,7 +206,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent { transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
const projectId = const projectId =
this.config.projectMapper.find( this.config.projectMapper.find(
(mapper) => mapper.from === _rawEvent.website_id, (mapper) => mapper.from === _rawEvent.website_id
)?.to || this.projectId; )?.to || this.projectId;
const rawEvent = zUmamiRawEvent.parse(_rawEvent); const rawEvent = zUmamiRawEvent.parse(_rawEvent);
@@ -261,39 +262,50 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
// Add useful properties from Umami data // Add useful properties from Umami data
if (rawEvent.page_title) properties.__title = rawEvent.page_title; if (rawEvent.page_title) {
if (rawEvent.screen) properties.__screen = rawEvent.screen; properties.__title = rawEvent.page_title;
if (rawEvent.language) properties.__language = rawEvent.language; }
if (rawEvent.utm_source) if (rawEvent.screen) {
properties.__screen = rawEvent.screen;
}
if (rawEvent.language) {
properties.__language = rawEvent.language;
}
if (rawEvent.utm_source) {
properties = assocPath( properties = assocPath(
['__query', 'utm_source'], ['__query', 'utm_source'],
rawEvent.utm_source, rawEvent.utm_source,
properties, properties
); );
if (rawEvent.utm_medium) }
if (rawEvent.utm_medium) {
properties = assocPath( properties = assocPath(
['__query', 'utm_medium'], ['__query', 'utm_medium'],
rawEvent.utm_medium, rawEvent.utm_medium,
properties, properties
); );
if (rawEvent.utm_campaign) }
if (rawEvent.utm_campaign) {
properties = assocPath( properties = assocPath(
['__query', 'utm_campaign'], ['__query', 'utm_campaign'],
rawEvent.utm_campaign, rawEvent.utm_campaign,
properties, properties
); );
if (rawEvent.utm_content) }
if (rawEvent.utm_content) {
properties = assocPath( properties = assocPath(
['__query', 'utm_content'], ['__query', 'utm_content'],
rawEvent.utm_content, rawEvent.utm_content,
properties, properties
); );
if (rawEvent.utm_term) }
if (rawEvent.utm_term) {
properties = assocPath( properties = assocPath(
['__query', 'utm_term'], ['__query', 'utm_term'],
rawEvent.utm_term, rawEvent.utm_term,
properties, properties
); );
}
return { return {
id: rawEvent.event_id || randomUUID(), id: rawEvent.event_id || randomUUID(),
@@ -302,8 +314,8 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
profile_id: profileId, profile_id: profileId,
project_id: projectId, project_id: projectId,
session_id: rawEvent.session_id || '', session_id: rawEvent.session_id || '',
properties, properties: toDots(properties),
created_at: rawEvent.created_at.toISOString(), created_at: formatClickhouseDate(rawEvent.created_at),
country, country,
city, city,
region: this.mapRegion(region), region: this.mapRegion(region),
@@ -329,7 +341,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
mapRegion(region: string): string { mapRegion(region: string): string {
return region.replace(/^[A-Z]{2}\-/, ''); return region.replace(/^[A-Z]{2}-/, '');
} }
mapDevice(device: string): string { mapDevice(device: string): string {