fix: redo how the importer works

This commit is contained in:
Carl-Gerhard Lindesvärd
2026-03-01 21:48:46 +01:00
parent 6251d143d1
commit 647ac2a4af
8 changed files with 993 additions and 984 deletions

View File

@@ -1,17 +1,16 @@
import {
type IClickhouseEvent,
type ImportSteps,
type Prisma,
backfillSessionsToProduction,
cleanupStagingData,
createSessionsStartEndEvents,
db,
formatClickhouseDate,
generateSessionIds,
generateGapBasedSessionIds,
getImportDateBounds,
getImportProgress,
type IClickhouseEvent,
type IClickhouseProfile,
insertImportBatch,
markImportComplete,
insertProfilesBatch,
moveImportsToProduction,
type Prisma,
updateImportStatus,
} from '@openpanel/db';
import { MixpanelProvider, UmamiProvider } from '@openpanel/importer';
@@ -22,294 +21,245 @@ import { logger } from '../utils/logger';
/**
 * Parses a batch-size setting, falling back when the value is missing,
 * not a number, or not a positive integer.
 *
 * @param raw - Raw setting value (typically an environment variable).
 * @param fallback - Value used when `raw` is unusable.
 * @returns A positive integer batch size.
 */
function parseBatchSize(raw: string | undefined, fallback = 5000): number {
  const parsed = Number.parseInt(raw || String(fallback), 10);
  // NaN would make every `length >= BATCH_SIZE` comparison false, so the
  // loader would never flush; zero/negative values are equally meaningless.
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/** Number of transformed events buffered before each staging insert. */
const BATCH_SIZE = parseBatchSize(process.env.IMPORT_BATCH_SIZE);
/**
 * Yields control back to the event loop to prevent stalled jobs.
 *
 * @returns A promise that resolves once a 100 ms timer has fired.
 */
function yieldToEventLoop(): Promise<void> {
  return new Promise((resolve) => {
    setTimeout(resolve, 100);
  });
}
// Import steps that write to production. importJob checks a retried import's
// currentStep against this list: if it matches, staging is kept and the job
// resumes; otherwise staging data is cleaned up and the import restarts.
const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];
export async function importJob(job: Job<ImportQueuePayload>) {
const { importId } = job.data.payload;
const record = await db.$primary().import.findUniqueOrThrow({
where: { id: importId },
include: {
project: true,
},
include: { project: true },
});
const jobLogger = logger.child({
importId,
config: record.config,
});
type ValidStep = Exclude<ImportSteps, 'failed' | 'completed'>;
const steps: Record<ValidStep, number> = {
loading: 0,
generating_session_ids: 1,
creating_sessions: 2,
moving: 3,
backfilling_sessions: 4,
};
const jobLogger = logger.child({ importId, config: record.config });
jobLogger.info('Starting import job');
const providerInstance = createProvider(record, jobLogger);
const shouldGenerateSessionIds = providerInstance.shouldGenerateSessionIds();
try {
// Check if this is a resume operation
const isNewImport = record.currentStep === null;
const isRetry = record.currentStep !== null;
const hasReachedProduction =
isRetry && PRODUCTION_STEPS.includes(record.currentStep as string);
if (isNewImport) {
await updateImportStatus(jobLogger, job, importId, {
step: 'loading',
});
} else {
jobLogger.info('Resuming import from previous state', {
currentStep: record.currentStep,
currentBatch: record.currentBatch,
});
}
// Try to get a precomputed total for better progress reporting
const totalEvents = await providerInstance
.getTotalEventsCount()
.catch(() => -1);
let processedEvents = record.processedEvents;
const resumeLoadingFrom =
(record.currentStep === 'loading' && record.currentBatch) || undefined;
const resumeGeneratingSessionIdsFrom =
(record.currentStep === 'generating_session_ids' &&
record.currentBatch) ||
undefined;
const resumeCreatingSessionsFrom =
(record.currentStep === 'creating_sessions' && record.currentBatch) ||
undefined;
const resumeMovingFrom =
(record.currentStep === 'moving' && record.currentBatch) || undefined;
const resumeBackfillingSessionsFrom =
(record.currentStep === 'backfilling_sessions' && record.currentBatch) ||
undefined;
// Example:
// shouldRunStep(0) // currStep = 2 (should not run)
// shouldRunStep(1) // currStep = 2 (should not run)
// shouldRunStep(2) // currStep = 2 (should run)
// shouldRunStep(3) // currStep = 2 (should run)
const shouldRunStep = (step: ValidStep) => {
if (isNewImport) {
return true;
// -------------------------------------------------------
// STAGING PHASE: clean slate on failure, run from scratch
// -------------------------------------------------------
if (!hasReachedProduction) {
if (isRetry) {
jobLogger.info(
'Retry detected before production phase — cleaning staging data'
);
await cleanupStagingData(importId);
}
const stepToRunIndex = steps[step];
const currentStepIndex = steps[record.currentStep as ValidStep];
return stepToRunIndex >= currentStepIndex;
};
// Phase 1: Load events into staging
await updateImportStatus(jobLogger, job, importId, { step: 'loading' });
async function whileBounds(
from: string | undefined,
callback: (from: string, to: string) => Promise<void>,
) {
const bounds = await getImportDateBounds(importId, from);
if (bounds.min && bounds.max) {
const start = new Date(bounds.min);
const end = new Date(bounds.max);
let cursor = new Date(start);
while (cursor < end) {
const next = new Date(cursor);
next.setDate(next.getDate() + 1);
await callback(
formatClickhouseDate(cursor, true),
formatClickhouseDate(next, true),
);
cursor = next;
const totalEvents = await providerInstance
.getTotalEventsCount()
.catch(() => -1);
let processedEvents = 0;
const eventBatch: IClickhouseEvent[] = [];
// Yield control back to event loop after processing each day
await yieldToEventLoop();
}
}
}
// Phase 1: Fetch & Transform - Process events in batches
if (shouldRunStep('loading')) {
const eventBatch: any = [];
for await (const rawEvent of providerInstance.parseSource(
resumeLoadingFrom,
)) {
// Validate event
for await (const rawEvent of providerInstance.parseSource()) {
if (
!providerInstance.validate(
// @ts-expect-error
rawEvent,
// @ts-expect-error -- provider-specific raw type
rawEvent
)
) {
jobLogger.warn('Skipping invalid event', { rawEvent });
continue;
}
eventBatch.push(rawEvent);
const transformed: IClickhouseEvent = providerInstance.transformEvent(
// @ts-expect-error -- provider-specific raw type
rawEvent
);
// Session IDs for providers that need them (e.g. Mixpanel) are generated
// in generateGapBasedSessionIds after loading, using gap-based logic.
eventBatch.push(transformed);
// Process batch when it reaches the batch size
if (eventBatch.length >= BATCH_SIZE) {
jobLogger.info('Processing batch', { batchSize: eventBatch.length });
const transformedEvents: IClickhouseEvent[] = eventBatch.map(
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
await insertImportBatch(eventBatch, importId);
processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '')
const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString()
.split('T')[0];
await updateImportStatus(jobLogger, job, importId, {
step: 'loading',
batch: createdAt,
batch: batchDate,
totalEvents,
processedEvents,
});
// Yield control back to event loop after processing each batch
eventBatch.length = 0;
await yieldToEventLoop();
}
}
// Process remaining events in the last batch
if (eventBatch.length > 0) {
const transformedEvents = eventBatch.map(
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
await insertImportBatch(eventBatch, importId);
processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '')
const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString()
.split('T')[0];
await updateImportStatus(jobLogger, job, importId, {
step: 'loading',
batch: createdAt,
batch: batchDate,
totalEvents,
processedEvents,
});
eventBatch.length = 0;
}
// Yield control back to event loop after processing final batch
jobLogger.info('Loading complete', { processedEvents });
// Phase 1b: Load user profiles (Mixpanel only)
const profileBatchSize = 5000;
if (
'streamProfiles' in providerInstance &&
typeof (providerInstance as MixpanelProvider).streamProfiles ===
'function'
) {
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
});
const profileBatch: IClickhouseProfile[] = [];
let processedProfiles = 0;
for await (const rawProfile of (
providerInstance as MixpanelProvider
).streamProfiles()) {
const profile = (
providerInstance as MixpanelProvider
).transformProfile(rawProfile);
profileBatch.push(profile);
if (profileBatch.length >= profileBatchSize) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
});
profileBatch.length = 0;
await yieldToEventLoop();
}
}
if (profileBatch.length > 0) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
totalProfiles: processedProfiles,
});
}
jobLogger.info('Profile loading complete', { processedProfiles });
}
// Phase 2: Generate gap-based session IDs (Mixpanel etc.)
if (shouldGenerateSessionIds) {
await updateImportStatus(jobLogger, job, importId, {
step: 'generating_sessions',
});
await generateGapBasedSessionIds(importId);
await yieldToEventLoop();
jobLogger.info('Session ID generation complete');
}
// Phase 3: Create session_start / session_end events
await updateImportStatus(jobLogger, job, importId, {
step: 'creating_sessions',
batch: 'all sessions',
});
await createSessionsStartEndEvents(importId);
await yieldToEventLoop();
jobLogger.info('Session event creation complete');
}
// -------------------------------------------------------
// PRODUCTION PHASE: resume-safe, track progress per batch
// -------------------------------------------------------
// Phase 3: Move staging events to production (per-day)
const resumeMovingFrom =
hasReachedProduction && record.currentStep === 'moving'
? (record.currentBatch ?? undefined)
: undefined;
// currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
const moveFromDate = (() => {
if (!resumeMovingFrom) return undefined;
const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
next.setUTCDate(next.getUTCDate() + 1);
return next.toISOString().split('T')[0]!;
})();
const bounds = await getImportDateBounds(importId, moveFromDate);
if (bounds.min && bounds.max) {
const startDate = bounds.min.split(' ')[0]!;
const endDate = bounds.max.split(' ')[0]!;
const cursor = new Date(`${startDate}T12:00:00Z`);
const end = new Date(`${endDate}T12:00:00Z`);
while (cursor <= end) {
const dateStr = cursor.toISOString().split('T')[0]!;
await moveImportsToProduction(importId, dateStr);
await updateImportStatus(jobLogger, job, importId, {
step: 'moving',
batch: dateStr,
});
await yieldToEventLoop();
cursor.setUTCDate(cursor.getUTCDate() + 1);
}
}
// Phase 2: Generate session IDs if provider requires it
if (
shouldRunStep('generating_session_ids') &&
providerInstance.shouldGenerateSessionIds()
) {
await whileBounds(resumeGeneratingSessionIdsFrom, async (from) => {
console.log('Generating session IDs', { from });
await generateSessionIds(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'generating_session_ids',
batch: from,
});
jobLogger.info('Move to production complete');
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
jobLogger.info('Session ID generation complete');
}
// Phase 3-5: Process in daily batches for robustness
if (shouldRunStep('creating_sessions')) {
await whileBounds(resumeCreatingSessionsFrom, async (from) => {
await createSessionsStartEndEvents(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'creating_sessions',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
if (shouldRunStep('moving')) {
await whileBounds(resumeMovingFrom, async (from) => {
await moveImportsToProduction(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'moving',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
if (shouldRunStep('backfilling_sessions')) {
await whileBounds(resumeBackfillingSessionsFrom, async (from) => {
await backfillSessionsToProduction(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'backfilling_sessions',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
await markImportComplete(importId);
// Phase 4: Backfill sessions table
await updateImportStatus(jobLogger, job, importId, {
step: 'completed',
step: 'backfilling_sessions',
batch: 'all sessions',
});
jobLogger.info('Import marked as complete');
await backfillSessionsToProduction(importId);
await yieldToEventLoop();
// Get final progress
const finalProgress = await getImportProgress(importId);
jobLogger.info('Session backfill complete');
jobLogger.info('Import job completed successfully', {
totalEvents: finalProgress.totalEvents,
insertedEvents: finalProgress.insertedEvents,
status: finalProgress.status,
});
// Done
await updateImportStatus(jobLogger, job, importId, { step: 'completed' });
jobLogger.info('Import completed');
return {
success: true,
totalEvents: finalProgress.totalEvents,
processedEvents: finalProgress.insertedEvents,
};
return { success: true };
} catch (error) {
jobLogger.error('Import job failed', { error });
// Mark import as failed
try {
const errorMsg = error instanceof Error ? error.message : 'Unknown error';
await updateImportStatus(jobLogger, job, importId, {
step: 'failed',
errorMessage: errorMsg,
});
jobLogger.warn('Import marked as failed', { error: errorMsg });
} catch (markError) {
jobLogger.error('Failed to mark import as failed', { error, markError });
}
@@ -320,7 +270,7 @@ export async function importJob(job: Job<ImportQueuePayload>) {
function createProvider(
record: Prisma.ImportGetPayload<{ include: { project: true } }>,
jobLogger: ILogger,
jobLogger: ILogger
) {
const config = record.config;
switch (config.provider) {

View File

@@ -1,6 +1,5 @@
export * from './src/prisma-client';
export * from './src/clickhouse/client';
export * from './src/clickhouse/csv';
export * from './src/sql-builder';
export * from './src/services/chart.service';
export * from './src/engine';

View File

@@ -1,11 +1,9 @@
import { Readable } from 'node:stream';
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
import sqlstring from 'sqlstring';
import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config';
import { createLogger } from '@openpanel/logger';
import type { IInterval } from '@openpanel/validation';
import sqlstring from 'sqlstring';
export { createClient };
@@ -68,8 +66,11 @@ export const TABLE_NAMES = {
* Non-clustered mode = self-hosted environments
*/
export function isClickhouseClustered(): boolean {
if (process.env.CLICKHOUSE_CLUSTER === 'true' || process.env.CLICKHOUSE_CLUSTER === '1') {
return true
if (
process.env.CLICKHOUSE_CLUSTER === 'true' ||
process.env.CLICKHOUSE_CLUSTER === '1'
) {
return true;
}
return !(
@@ -97,21 +98,21 @@ function getClickhouseSettings(): ClickHouseSettings {
return {
distributed_product_mode: 'allow',
date_time_input_format: 'best_effort',
...(!process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
? {
...(process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
? {}
: {
query_plan_convert_any_join_to_semi_or_anti_join: 0,
}
: {}),
}),
...additionalSettings,
};
}
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
max_open_connections: 30,
request_timeout: 300000,
request_timeout: 300_000,
keep_alive: {
enabled: true,
idle_socket_ttl: 60000,
idle_socket_ttl: 60_000,
},
compression: {
request: true,
@@ -138,7 +139,7 @@ const cleanQuery = (query?: string) =>
export async function withRetry<T>(
operation: () => Promise<T>,
maxRetries = 3,
baseDelay = 500,
baseDelay = 500
): Promise<T> {
let lastError: Error | undefined;
@@ -162,7 +163,7 @@ export async function withRetry<T>(
`Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`,
{
error: error.message,
},
}
);
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
@@ -213,7 +214,7 @@ export const ch = new Proxy(originalCh, {
export async function chQueryWithMeta<T extends Record<string, any>>(
query: string,
clickhouseSettings?: ClickHouseSettings,
clickhouseSettings?: ClickHouseSettings
): Promise<ResponseJSON<T>> {
const start = Date.now();
const res = await ch.query({
@@ -249,44 +250,16 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
return response;
}
/**
 * Inserts pre-formatted CSV lines into a ClickHouse table.
 *
 * The lines are joined with `\n` and sent as a single non-object-mode
 * stream. Double-quoted CSV fields are enabled and single-quoted ones
 * disabled for the insert.
 *
 * @param tableName - Target table.
 * @param rows - CSV lines, one per row, already escaped.
 * @throws Re-throws any insert failure after logging it.
 */
export async function chInsertCSV(tableName: string, rows: string[]) {
  try {
    const startedAt = performance.now();
    const payload = rows.join('\n');
    // Raw (non-object) mode: the payload is streamed as text, not as JS values.
    const values = Readable.from(payload, { objectMode: false });

    await ch.insert({
      format: 'CSV',
      table: tableName,
      values,
      clickhouse_settings: {
        format_csv_allow_double_quotes: 1,
        format_csv_allow_single_quotes: 0,
      },
    });

    const elapsed = performance.now() - startedAt;
    logger.info('CSV Insert successful', { elapsed, rows: rows.length });
  } catch (error) {
    logger.error('CSV Insert failed:', error);
    throw error;
  }
}
/**
 * Runs a ClickHouse query through `chQueryWithMeta` and returns only the
 * result rows, dropping the response metadata.
 *
 * @param query - SQL text to execute.
 * @param clickhouseSettings - Optional per-query ClickHouse settings.
 * @returns The `data` array of the response.
 */
export async function chQuery<T extends Record<string, any>>(
  query: string,
  clickhouseSettings?: ClickHouseSettings
): Promise<T[]> {
  const response = await chQueryWithMeta<T>(query, clickhouseSettings);
  return response.data;
}
export function formatClickhouseDate(
date: Date | string,
skipTime = false,
skipTime = false
): string {
if (skipTime) {
return new Date(date).toISOString().split('T')[0]!;

View File

@@ -1,53 +0,0 @@
// ClickHouse Map(String, String) literals in CSV use single quotes rather than
// JSON double quotes, e.g. {'key1':'value1','key2':'value2'}.
// Inside a key or value, backslashes and single quotes are backslash-escaped,
// and newlines, carriage returns, tabs and null bytes are written as their
// two-character escape sequences so they cannot break CSV parsing.
const MAP_VALUE_ESCAPES: Record<string, string> = {
  '\\': '\\\\',
  "'": "\\'",
  '\n': '\\n',
  '\r': '\\r',
  '\t': '\\t',
  '\0': '\\0',
};

// Single pass over the input: each special character is replaced exactly once,
// so backslashes produced by one substitution are never escaped again.
const escapeMapValue = (str: string) =>
  str.replace(/[\\'\n\r\t\0]/g, (char) => MAP_VALUE_ESCAPES[char] ?? char);
/**
 * Serialises an object as a ClickHouse Map(String, String) literal suitable
 * for a CSV field.
 *
 * - `null` / `undefined` input yields an empty string.
 * - An object with no own enumerable keys yields `{}`.
 * - Otherwise every key and value is stringified (nullish values become the
 *   empty string), escaped with `escapeMapValue`, single-quoted and joined
 *   with commas; the resulting `{...}` text is passed through
 *   `csvEscapeField` so it is CSV-quoted when required.
 */
export const csvEscapeJson = (
  value: Record<string, unknown> | null | undefined,
): string => {
  if (value === null || value === undefined) {
    return '';
  }

  const entries = Object.entries(value);
  if (entries.length === 0) {
    return '{}';
  }

  // Stringify every value first, then escape, mirroring a two-step
  // normalise-then-format pipeline.
  const stringified = entries.map(
    ([key, raw]): [string, string] => [
      String(key),
      raw == null ? '' : String(raw),
    ],
  );

  const parts: string[] = [];
  for (const [key, text] of stringified) {
    parts.push(`'${escapeMapValue(key)}':'${escapeMapValue(text)}'`);
  }

  return csvEscapeField(`{${parts.join(',')}}`);
};
/**
 * Escapes a single CSV field.
 *
 * A field containing a comma, a double quote, a line feed or a carriage
 * return is wrapped in double quotes, with embedded double quotes doubled
 * (`"` -> `""`) per the CSV convention. Any other value is returned as its
 * plain string form.
 */
export const csvEscapeField = (value: string | number): string => {
  const text = String(value);
  const needsQuoting = [',', '"', '\n', '\r'].some((special) =>
    text.includes(special),
  );

  if (!needsQuoting) {
    return text;
  }

  return `"${text.split('"').join('""')}"`;
};

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ describe('mixpanel', () => {
const rawEvent = {
event: '$mp_web_page_view',
properties: {
time: 1746097970,
time: 1_746_097_970,
distinct_id: '$device:123',
$browser: 'Chrome',
$browser_version: 135,
@@ -53,7 +53,7 @@ describe('mixpanel', () => {
$insert_id: 'source_id',
$lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1746078175363,
$mp_api_timestamp_ms: 1_746_078_175_363,
$mp_autocapture: true,
$os: 'Android',
$referrer: 'https://google.com/',
@@ -71,7 +71,7 @@ describe('mixpanel', () => {
gclid: 'oqneoqow',
mp_country_code: 'IN',
mp_lib: 'web',
mp_processing_time_ms: 1746078175546,
mp_processing_time_ms: 1_746_078_175_546,
mp_sent_by_lib_version: '2.60.0',
utm_medium: 'cpc',
utm_source: 'google',
@@ -101,7 +101,7 @@ describe('mixpanel', () => {
__title:
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
},
created_at: '2025-05-01T11:12:50.000Z',
created_at: '2025-05-01 11:12:50',
country: 'IN',
city: 'Mumbai',
region: 'Maharashtra',
@@ -110,7 +110,7 @@ describe('mixpanel', () => {
os: 'Android',
os_version: undefined,
browser: 'Chrome',
browser_version: '',
browser_version: '135',
device: 'mobile',
brand: '',
model: '',
@@ -141,7 +141,7 @@ describe('mixpanel', () => {
const rawEvent = {
event: 'custom_event',
properties: {
time: 1746097970,
time: 1_746_097_970,
distinct_id: '$device:123',
$device_id: '123',
$user_id: 'user123',
@@ -192,7 +192,7 @@ describe('mixpanel', () => {
const rawEvent = {
event: 'ec_search_error',
properties: {
time: 1759947367,
time: 1_759_947_367,
distinct_id: '3385916',
$browser: 'Mobile Safari',
$browser_version: null,
@@ -207,7 +207,7 @@ describe('mixpanel', () => {
$insert_id: 'bclkaepeqcfuzt4v',
$lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1759927570699,
$mp_api_timestamp_ms: 1_759_927_570_699,
$os: 'iOS',
$region: 'Karnataka',
$screen_height: 852,
@@ -225,7 +225,7 @@ describe('mixpanel', () => {
language: 'english',
mp_country_code: 'IN',
mp_lib: 'web',
mp_processing_time_ms: 1759927592421,
mp_processing_time_ms: 1_759_927_592_421,
mp_sent_by_lib_version: '2.60.0',
os: 'web',
osVersion:
@@ -249,15 +249,15 @@ describe('mixpanel', () => {
expect(res.id.length).toBeGreaterThan(30);
expect(res.imported_at).toMatch(
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
);
expect(omit(['id', 'imported_at'], res)).toEqual({
brand: 'Apple',
browser: 'GSA',
browser_version: 'null',
browser_version: '388.0.811331708',
city: 'Bengaluru',
country: 'IN',
created_at: '2025-10-08T18:16:07.000Z',
created_at: '2025-10-08 18:16:07',
device: 'mobile',
device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
duration: 0,

View File

@@ -1,8 +1,13 @@
import { randomUUID } from 'node:crypto';
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import {
getReferrerWithQuery,
parseReferrer,
parseUserAgent,
type UserAgentInfo,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { IClickhouseProfile } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IMixpanelImportConfig } from '@openpanel/validation';
import { z } from 'zod';
@@ -15,22 +20,88 @@ export const zMixpanelRawEvent = z.object({
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
/**
 * Raw profile row from the Mixpanel Engage API:
 * https://docs.mixpanel.com/docs/export-methods#exporting-profiles
 *
 * `$distinct_id` may be a string or a number. `$properties` is optional and
 * defaults to an empty object when absent.
 */
export const zMixpanelRawProfile = z.object({
$distinct_id: z.union([z.string(), z.number()]),
$properties: z.record(z.unknown()).optional().default({}),
});
/** Parsed shape produced by `zMixpanelRawProfile`. */
export type MixpanelRawProfile = z.infer<typeof zMixpanelRawProfile>;
/**
 * Error thrown when Mixpanel responds with HTTP 429.
 *
 * `retryAfterMs` carries the wait derived from the `Retry-After` response
 * header (in milliseconds) when one was supplied; otherwise it is undefined.
 */
class MixpanelRateLimitError extends Error {
  constructor(
    message: string,
    readonly retryAfterMs?: number
  ) {
    super(message);
    this.name = 'MixpanelRateLimitError';
  }
}
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
provider = 'mixpanel';
version = '1.0.0';
// Maximum number of requests waitForRateLimit allows inside a rolling one-hour window.
private static readonly MAX_REQUESTS_PER_HOUR = 100;
// Minimum gap between two consecutive requests, enforced by waitForRateLimit.
private static readonly MIN_REQUEST_INTERVAL_MS = 334; // 3 QPS limit
// Timestamps (Date.now()) of requests made within the last hour; pruned on every waitForRateLimit call.
private requestTimestamps: number[] = [];
// Timestamp (Date.now()) of the most recent request; 0 until the first request.
private lastRequestTime = 0;
/**
 * @param projectId - OpenPanel project id; written to `project_id` on
 *   transformed profiles.
 * @param config - Mixpanel import configuration (service account
 *   credentials, Mixpanel project id and the date range to import).
 * @param logger - Optional logger; every log call is skipped when omitted.
 */
constructor(
  private readonly projectId: string,
  private readonly config: IMixpanelImportConfig,
  private readonly logger?: ILogger
) {
  super();
}
async getTotalEventsCount(): Promise<number> {
/**
 * Delays the caller until another Mixpanel request may be sent.
 *
 * Two limits are applied in order:
 * 1. a minimum gap of `MIN_REQUEST_INTERVAL_MS` since the previous request;
 * 2. at most `MAX_REQUESTS_PER_HOUR` requests in the trailing hour. When the
 *    window is full, waits until the oldest recorded request leaves it
 *    (plus a one-second buffer).
 *
 * On return the current time is recorded as the latest request.
 */
private async waitForRateLimit(): Promise<void> {
  const hourMs = 60 * 60 * 1000;
  const sleep = (ms: number) =>
    new Promise((resolve) => setTimeout(resolve, ms));

  const startedAt = Date.now();
  const windowStart = startedAt - hourMs;

  // Drop entries that have already left the one-hour window.
  this.requestTimestamps = this.requestTimestamps.filter(
    (stamp) => stamp > windowStart
  );

  // Per-second limit (3 QPS -> at least 334 ms between requests).
  const minGap = MixpanelProvider.MIN_REQUEST_INTERVAL_MS;
  const sinceLast = startedAt - this.lastRequestTime;
  if (sinceLast < minGap) {
    await sleep(minGap - sinceLast);
  }

  // Hourly limit.
  const requestsInWindow = this.requestTimestamps.length;
  if (requestsInWindow >= MixpanelProvider.MAX_REQUESTS_PER_HOUR) {
    const oldest = this.requestTimestamps[0]!;
    const waitMs = oldest + hourMs - Date.now() + 1000; // +1s buffer
    if (waitMs > 0) {
      this.logger?.info(
        `Rate limit: ${requestsInWindow} requests in the last hour, waiting ${Math.ceil(waitMs / 1000)}s`,
        {
          requestsInWindow,
          waitMs,
        }
      );
      await sleep(waitMs);
      // The window has moved while sleeping, so prune once more.
      this.requestTimestamps = this.requestTimestamps.filter(
        (stamp) => stamp > Date.now() - hourMs
      );
    }
  }

  this.lastRequestTime = Date.now();
  this.requestTimestamps.push(Date.now());
}
/**
 * Reports the total number of events to import.
 *
 * Mixpanel offers no reliable way to count events within a period: JQL could
 * approximate it, but it is inaccurate and scheduled for deprecation at the
 * end of 2025. The total is therefore always reported as -1.
 *
 * @returns A promise resolving to -1.
 */
getTotalEventsCount(): Promise<number> {
  return Promise.resolve(-1);
}
/**
@@ -42,13 +113,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
}
/**
 * Streams raw events from Mixpanel by delegating to
 * `fetchEventsFromMixpanel`.
 *
 * @param overrideFrom - Optional value forwarded unchanged to
 *   `fetchEventsFromMixpanel`.
 */
async *parseSource(
  overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
  yield* this.fetchEventsFromMixpanel(overrideFrom);
}
private async *fetchEventsFromMixpanel(
overrideFrom?: string,
overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
@@ -58,20 +129,24 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
for (const [chunkFrom, chunkTo] of dateChunks) {
let retries = 0;
const maxRetries = 3;
const maxRetries = 6;
while (retries <= maxRetries) {
try {
await this.waitForRateLimit();
yield* this.fetchEventsForDateRange(
serviceAccount,
serviceSecret,
projectId,
chunkFrom,
chunkTo,
chunkTo
);
break; // Success, move to next chunk
} catch (error) {
retries++;
const isRateLimit =
error instanceof MixpanelRateLimitError ||
(error instanceof Error && error.message.includes('429'));
const isLastRetry = retries > maxRetries;
this.logger?.warn('Failed to fetch events for date range', {
@@ -80,22 +155,31 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
attempt: retries,
maxRetries,
error: (error as Error).message,
isRateLimit,
willRetry: !isLastRetry,
});
if (isLastRetry) {
// Final attempt failed, re-throw
throw new Error(
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`,
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`
);
}
// Exponential backoff: wait before retrying
const delay = Math.min(1000 * 2 ** (retries - 1), 60_000); // Cap at 1 minute
let delay: number;
if (error instanceof MixpanelRateLimitError && error.retryAfterMs) {
delay = error.retryAfterMs;
} else if (isRateLimit) {
// 5min → 10min → 15min → 15min → 15min = 60min total
delay = Math.min(300_000 * 2 ** (retries - 1), 900_000);
} else {
delay = Math.min(1000 * 2 ** (retries - 1), 60_000);
}
this.logger?.info('Retrying after delay', {
delayMs: delay,
chunkFrom,
chunkTo,
isRateLimit,
});
await new Promise((resolve) => setTimeout(resolve, delay));
}
@@ -108,7 +192,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
serviceSecret: string,
projectId: string,
from: string,
to: string,
to: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const url = 'https://data.mixpanel.com/api/2.0/export';
@@ -134,9 +218,18 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
},
});
if (response.status === 429) {
const retryAfter = response.headers.get('Retry-After');
const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
throw new MixpanelRateLimitError(
'Mixpanel rate limit exceeded (429)',
retryAfterMs
);
}
if (!response.ok) {
throw new Error(
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`
);
}
@@ -153,7 +246,9 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
while (true) {
const { done, value } = await reader.read();
if (done) break;
if (done) {
break;
}
buffer += decoder.decode(value, { stream: true });
@@ -187,7 +282,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
{
line: buffer.substring(0, 100),
error,
},
}
);
}
}
@@ -196,6 +291,114 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
}
}
/**
 * Stream user profiles from the Mixpanel Engage API.
 *
 * Requests pages of up to 5000 profiles and yields every row that passes
 * `zMixpanelRawProfile`; rows that fail validation are logged and skipped.
 *
 * Paging: the first request carries no `page`; the `session_id` returned by
 * Mixpanel is then sent together with `page` on every following request,
 * which the Engage API requires for paging. Iteration stops when a page
 * holds fewer rows than the page size (the size reported by the server when
 * present, otherwise the requested one).
 *
 * @throws MixpanelRateLimitError on HTTP 429, carrying the `Retry-After`
 *   delay in ms when the header is a number of seconds.
 * @throws Error on any other non-OK response.
 */
async *streamProfiles(): AsyncGenerator<MixpanelRawProfile, void, unknown> {
  const { serviceAccount, serviceSecret, projectId } = this.config;
  const pageSize = 5000;
  const url = `https://mixpanel.com/api/query/engage?project_id=${encodeURIComponent(projectId)}`;
  let page = 0;
  let sessionId: string | undefined;

  while (true) {
    await this.waitForRateLimit();

    const body = new URLSearchParams({ page_size: String(pageSize) });
    if (page > 0) {
      body.set('page', String(page));
    }
    if (sessionId) {
      body.set('session_id', sessionId);
    }

    this.logger?.info('Fetching profiles from Mixpanel Engage', {
      page,
      page_size: pageSize,
      projectId,
    });

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
        Accept: 'application/json',
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      body: body.toString(),
    });

    if (response.status === 429) {
      const retryAfter = response.headers.get('Retry-After');
      // Retry-After may also be an HTTP date; only a numeric value is usable here.
      const retryAfterSeconds = retryAfter ? Number(retryAfter) : Number.NaN;
      const retryAfterMs = Number.isFinite(retryAfterSeconds)
        ? retryAfterSeconds * 1000
        : undefined;
      throw new MixpanelRateLimitError(
        'Mixpanel rate limit exceeded (429)',
        retryAfterMs
      );
    }

    if (!response.ok) {
      const text = await response.text();
      throw new Error(
        `Failed to fetch profiles from Mixpanel: ${response.status} ${response.statusText} - ${text}`
      );
    }

    const data = (await response.json()) as {
      results?: Array<{
        $distinct_id: string | number;
        $properties?: Record<string, unknown>;
      }>;
      page?: number;
      page_size?: number;
      session_id?: string;
      total?: number;
    };

    // Keep the session id from the first response that provides one.
    if (!sessionId && typeof data.session_id === 'string') {
      sessionId = data.session_id;
    }

    const results = data.results ?? [];
    for (const row of results) {
      const parsed = zMixpanelRawProfile.safeParse(row);
      if (parsed.success) {
        yield parsed.data;
      } else {
        this.logger?.warn('Skipping invalid Mixpanel profile', {
          row: JSON.stringify(row).slice(0, 200),
        });
      }
    }

    // Trust the page size the server says it used, in case it caps the request.
    const effectivePageSize =
      typeof data.page_size === 'number' && data.page_size > 0
        ? data.page_size
        : pageSize;
    if (results.length < effectivePageSize) {
      break;
    }
    page++;
  }
}
/**
 * Map a Mixpanel Engage profile to an OpenPanel IClickhouseProfile.
 *
 * - `id` is `$distinct_id` as a string, without a leading `$device:`.
 * - `created_at` comes from `$created`; when that is missing or cannot be
 *   parsed as a date, the current time is used instead.
 * - `first_name`, `last_name`, `email` and `avatar` come from the matching
 *   `$`-prefixed Mixpanel properties (`$avatar`, then `$image`, for avatar).
 * - `properties` holds every remaining non-null property whose key does not
 *   start with `$`; object values are JSON-encoded, others stringified.
 *
 * @throws ZodError when `raw` does not match `zMixpanelRawProfile`.
 */
transformProfile(raw: MixpanelRawProfile): IClickhouseProfile {
  const parsed = zMixpanelRawProfile.parse(raw);
  const props = (parsed.$properties || {}) as Record<string, unknown>;
  const id = String(parsed.$distinct_id).replace(/^\$device:/, '');

  // An unparseable $created yields an Invalid Date; never hand that to the
  // date formatter, fall back to "now" exactly as for a missing value.
  const createdDate = props.$created
    ? new Date(String(props.$created))
    : new Date();
  const createdAt = formatClickhouseDate(
    Number.isNaN(createdDate.getTime()) ? new Date() : createdDate
  );

  const properties: Record<string, string> = {};
  for (const [key, value] of Object.entries(props)) {
    // `$`-prefixed keys are Mixpanel-reserved and are not copied as custom properties.
    if (key.startsWith('$') || value == null) {
      continue;
    }
    properties[key] =
      typeof value === 'object' ? JSON.stringify(value) : String(value);
  }

  return {
    id,
    project_id: this.projectId,
    first_name: String(props.$first_name ?? ''),
    last_name: String(props.$last_name ?? ''),
    email: String(props.$email ?? ''),
    avatar: String(props.$avatar ?? props.$image ?? ''),
    properties,
    created_at: createdAt,
    is_external: true,
  };
}
validate(rawEvent: MixpanelRawEvent): boolean {
const res = zMixpanelRawEvent.safeParse(rawEvent);
return res.success;
@@ -208,7 +411,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
const deviceId = props.$device_id;
const profileId = String(props.$user_id || props.distinct_id).replace(
/^\$device:/,
'',
''
);
// Build full URL from current_url and current_url_search (web only)
@@ -309,7 +512,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
project_id: projectId,
session_id: '', // Will be generated in SQL after import
properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
created_at: new Date(props.time * 1000).toISOString(),
created_at: formatClickhouseDate(new Date(props.time * 1000)),
country,
city,
region,
@@ -318,10 +521,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
os: uaInfo.os || props.$os,
os_version: uaInfo.osVersion || props.$osVersion,
browser: uaInfo.browser || props.$browser,
browser_version:
uaInfo.browserVersion || props.$browserVersion
? String(props.$browser_version)
: '',
browser_version: uaInfo.browserVersion || String(props.$browser_version ?? ''),
device: this.getDeviceType(props.mp_lib, uaInfo, props),
brand: uaInfo.brand || '',
model: uaInfo.model || '',
@@ -338,14 +538,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
sdk_version: this.version,
};
// TODO: Remove this
// Temporary fix for a client
const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
if (isMightBeScreenView && event.name === 'Loaded a Screen') {
event.name = 'screen_view';
event.path = isMightBeScreenView;
}
// TODO: Remove this
// This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
if (props.utm_source && !properties.__query?.utm_source) {
@@ -371,13 +563,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
private getDeviceType(
mp_lib: string,
uaInfo: UserAgentInfo,
props: Record<string, any>,
props: Record<string, any>
) {
// Normalize lib/os/browser data
const lib = (mp_lib || '').toLowerCase();
const os = String(props.$os || uaInfo.os || '').toLowerCase();
const browser = String(
props.$browser || uaInfo.browser || '',
props.$browser || uaInfo.browser || ''
).toLowerCase();
const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
@@ -431,11 +623,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return !this.isWebEvent(mp_lib);
}
private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
const props = rawEvent.properties as Record<string, any>;
return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
}
private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
// For mobile events, extract device information from Mixpanel properties
const os = props.$os || props.os || '';
@@ -446,19 +633,19 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return {
isServer: true,
os: os,
osVersion: osVersion,
os,
osVersion,
browser: '',
browserVersion: '',
device: device,
brand: brand,
model: model,
device,
brand,
model,
};
}
private stripMixpanelProperties(
properties: Record<string, any>,
searchParams: Record<string, string>,
searchParams: Record<string, string>
): Record<string, any> {
const strip = [
'time',
@@ -472,8 +659,8 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
];
const filtered = Object.fromEntries(
Object.entries(properties).filter(
([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key),
),
([key]) => !(key.match(/^(\$|mp_|utm_)/) || strip.includes(key))
)
);
// Parse JSON strings back to objects/arrays so toDots() can flatten them

View File

@@ -2,10 +2,13 @@ import { randomUUID } from 'node:crypto';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import { createBrotliDecompress, createGunzip } from 'node:zlib';
import { isSameDomain, parsePath } from '@openpanel/common';
import { generateDeviceId } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import {
generateDeviceId,
getReferrerWithQuery,
parseReferrer,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IUmamiImportConfig } from '@openpanel/validation';
import { parse } from 'csv-parse';
@@ -63,7 +66,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
constructor(
private readonly projectId: string,
private readonly config: IUmamiImportConfig,
private readonly logger?: ILogger,
private readonly logger?: ILogger
) {
super();
}
@@ -82,7 +85,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
signal?: AbortSignal;
maxBytes?: number;
maxRows?: number;
} = {},
} = {}
): AsyncGenerator<UmamiRawEvent, void, unknown> {
const { signal, maxBytes, maxRows } = opts;
const controller = new AbortController();
@@ -95,9 +98,9 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
}
const res = await fetch(url, { signal: controller.signal });
if (!res.ok || !res.body) {
if (!(res.ok && res.body)) {
throw new Error(
`Failed to fetch remote file: ${res.status} ${res.statusText}`,
`Failed to fetch remote file: ${res.status} ${res.statusText}`
);
}
@@ -108,15 +111,15 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if (
contentType &&
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
contentType,
contentType
)
) {
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
this.logger?.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
}
if (maxBytes && contentLen && contentLen > maxBytes) {
throw new Error(
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`
);
}
@@ -137,9 +140,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if (seenBytes > maxBytes) {
controller.abort();
body.destroy(
new Error(
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
),
new Error(`Stream exceeded size limit (${seenBytes} > ${maxBytes})`)
);
}
});
@@ -190,7 +191,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
throw new Error(
`Failed to parse remote file from ${url}: ${
err instanceof Error ? err.message : String(err)
}`,
}`
);
} finally {
controller.abort(); // ensure fetch stream is torn down
@@ -205,7 +206,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
const projectId =
this.config.projectMapper.find(
(mapper) => mapper.from === _rawEvent.website_id,
(mapper) => mapper.from === _rawEvent.website_id
)?.to || this.projectId;
const rawEvent = zUmamiRawEvent.parse(_rawEvent);
@@ -261,39 +262,50 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
}
// Add useful properties from Umami data
if (rawEvent.page_title) properties.__title = rawEvent.page_title;
if (rawEvent.screen) properties.__screen = rawEvent.screen;
if (rawEvent.language) properties.__language = rawEvent.language;
if (rawEvent.utm_source)
if (rawEvent.page_title) {
properties.__title = rawEvent.page_title;
}
if (rawEvent.screen) {
properties.__screen = rawEvent.screen;
}
if (rawEvent.language) {
properties.__language = rawEvent.language;
}
if (rawEvent.utm_source) {
properties = assocPath(
['__query', 'utm_source'],
rawEvent.utm_source,
properties,
properties
);
if (rawEvent.utm_medium)
}
if (rawEvent.utm_medium) {
properties = assocPath(
['__query', 'utm_medium'],
rawEvent.utm_medium,
properties,
properties
);
if (rawEvent.utm_campaign)
}
if (rawEvent.utm_campaign) {
properties = assocPath(
['__query', 'utm_campaign'],
rawEvent.utm_campaign,
properties,
properties
);
if (rawEvent.utm_content)
}
if (rawEvent.utm_content) {
properties = assocPath(
['__query', 'utm_content'],
rawEvent.utm_content,
properties,
properties
);
if (rawEvent.utm_term)
}
if (rawEvent.utm_term) {
properties = assocPath(
['__query', 'utm_term'],
rawEvent.utm_term,
properties,
properties
);
}
return {
id: rawEvent.event_id || randomUUID(),
@@ -302,8 +314,8 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
profile_id: profileId,
project_id: projectId,
session_id: rawEvent.session_id || '',
properties,
created_at: rawEvent.created_at.toISOString(),
properties: toDots(properties),
created_at: formatClickhouseDate(rawEvent.created_at),
country,
city,
region: this.mapRegion(region),
@@ -329,7 +341,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
}
mapRegion(region: string): string {
return region.replace(/^[A-Z]{2}\-/, '');
return region.replace(/^[A-Z]{2}-/, '');
}
mapDevice(device: string): string {