fix: redo how the importer works

This commit is contained in:
Carl-Gerhard Lindesvärd
2026-03-01 21:48:46 +01:00
parent 6251d143d1
commit 647ac2a4af
8 changed files with 993 additions and 984 deletions

View File

@@ -1,17 +1,16 @@
import { import {
type IClickhouseEvent,
type ImportSteps,
type Prisma,
backfillSessionsToProduction, backfillSessionsToProduction,
cleanupStagingData,
createSessionsStartEndEvents, createSessionsStartEndEvents,
db, db,
formatClickhouseDate, generateGapBasedSessionIds,
generateSessionIds,
getImportDateBounds, getImportDateBounds,
getImportProgress, type IClickhouseEvent,
type IClickhouseProfile,
insertImportBatch, insertImportBatch,
markImportComplete, insertProfilesBatch,
moveImportsToProduction, moveImportsToProduction,
type Prisma,
updateImportStatus, updateImportStatus,
} from '@openpanel/db'; } from '@openpanel/db';
import { MixpanelProvider, UmamiProvider } from '@openpanel/importer'; import { MixpanelProvider, UmamiProvider } from '@openpanel/importer';
@@ -22,294 +21,245 @@ import { logger } from '../utils/logger';
const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10); const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10);
/** function yieldToEventLoop(): Promise<void> {
* Yields control back to the event loop to prevent stalled jobs
*/
async function yieldToEventLoop(): Promise<void> {
return new Promise((resolve) => { return new Promise((resolve) => {
setTimeout(resolve, 100); setTimeout(resolve, 100);
}); });
} }
const PRODUCTION_STEPS = ['moving', 'backfilling_sessions'];
export async function importJob(job: Job<ImportQueuePayload>) { export async function importJob(job: Job<ImportQueuePayload>) {
const { importId } = job.data.payload; const { importId } = job.data.payload;
const record = await db.$primary().import.findUniqueOrThrow({ const record = await db.$primary().import.findUniqueOrThrow({
where: { id: importId }, where: { id: importId },
include: { include: { project: true },
project: true,
},
}); });
const jobLogger = logger.child({ const jobLogger = logger.child({ importId, config: record.config });
importId,
config: record.config,
});
type ValidStep = Exclude<ImportSteps, 'failed' | 'completed'>;
const steps: Record<ValidStep, number> = {
loading: 0,
generating_session_ids: 1,
creating_sessions: 2,
moving: 3,
backfilling_sessions: 4,
};
jobLogger.info('Starting import job'); jobLogger.info('Starting import job');
const providerInstance = createProvider(record, jobLogger); const providerInstance = createProvider(record, jobLogger);
const shouldGenerateSessionIds = providerInstance.shouldGenerateSessionIds();
try { try {
// Check if this is a resume operation const isRetry = record.currentStep !== null;
const isNewImport = record.currentStep === null; const hasReachedProduction =
isRetry && PRODUCTION_STEPS.includes(record.currentStep as string);
if (isNewImport) { // -------------------------------------------------------
await updateImportStatus(jobLogger, job, importId, { // STAGING PHASE: clean slate on failure, run from scratch
step: 'loading', // -------------------------------------------------------
}); if (!hasReachedProduction) {
} else { if (isRetry) {
jobLogger.info('Resuming import from previous state', { jobLogger.info(
currentStep: record.currentStep, 'Retry detected before production phase — cleaning staging data'
currentBatch: record.currentBatch, );
}); await cleanupStagingData(importId);
}
// Try to get a precomputed total for better progress reporting
const totalEvents = await providerInstance
.getTotalEventsCount()
.catch(() => -1);
let processedEvents = record.processedEvents;
const resumeLoadingFrom =
(record.currentStep === 'loading' && record.currentBatch) || undefined;
const resumeGeneratingSessionIdsFrom =
(record.currentStep === 'generating_session_ids' &&
record.currentBatch) ||
undefined;
const resumeCreatingSessionsFrom =
(record.currentStep === 'creating_sessions' && record.currentBatch) ||
undefined;
const resumeMovingFrom =
(record.currentStep === 'moving' && record.currentBatch) || undefined;
const resumeBackfillingSessionsFrom =
(record.currentStep === 'backfilling_sessions' && record.currentBatch) ||
undefined;
// Example:
// shouldRunStep(0) // currStep = 2 (should not run)
// shouldRunStep(1) // currStep = 2 (should not run)
// shouldRunStep(2) // currStep = 2 (should run)
// shouldRunStep(3) // currStep = 2 (should run)
const shouldRunStep = (step: ValidStep) => {
if (isNewImport) {
return true;
} }
const stepToRunIndex = steps[step]; // Phase 1: Load events into staging
const currentStepIndex = steps[record.currentStep as ValidStep]; await updateImportStatus(jobLogger, job, importId, { step: 'loading' });
return stepToRunIndex >= currentStepIndex;
};
async function whileBounds( const totalEvents = await providerInstance
from: string | undefined, .getTotalEventsCount()
callback: (from: string, to: string) => Promise<void>, .catch(() => -1);
) { let processedEvents = 0;
const bounds = await getImportDateBounds(importId, from); const eventBatch: IClickhouseEvent[] = [];
if (bounds.min && bounds.max) {
const start = new Date(bounds.min);
const end = new Date(bounds.max);
let cursor = new Date(start);
while (cursor < end) {
const next = new Date(cursor);
next.setDate(next.getDate() + 1);
await callback(
formatClickhouseDate(cursor, true),
formatClickhouseDate(next, true),
);
cursor = next;
// Yield control back to event loop after processing each day for await (const rawEvent of providerInstance.parseSource()) {
await yieldToEventLoop();
}
}
}
// Phase 1: Fetch & Transform - Process events in batches
if (shouldRunStep('loading')) {
const eventBatch: any = [];
for await (const rawEvent of providerInstance.parseSource(
resumeLoadingFrom,
)) {
// Validate event
if ( if (
!providerInstance.validate( !providerInstance.validate(
// @ts-expect-error // @ts-expect-error -- provider-specific raw type
rawEvent, rawEvent
) )
) { ) {
jobLogger.warn('Skipping invalid event', { rawEvent }); jobLogger.warn('Skipping invalid event', { rawEvent });
continue; continue;
} }
eventBatch.push(rawEvent); const transformed: IClickhouseEvent = providerInstance.transformEvent(
// @ts-expect-error -- provider-specific raw type
rawEvent
);
// Session IDs for providers that need them (e.g. Mixpanel) are generated
// in generateGapBasedSessionIds after loading, using gap-based logic.
eventBatch.push(transformed);
// Process batch when it reaches the batch size
if (eventBatch.length >= BATCH_SIZE) { if (eventBatch.length >= BATCH_SIZE) {
jobLogger.info('Processing batch', { batchSize: eventBatch.length }); await insertImportBatch(eventBatch, importId);
const transformedEvents: IClickhouseEvent[] = eventBatch.map(
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
processedEvents += eventBatch.length; processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '') const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString() .toISOString()
.split('T')[0]; .split('T')[0];
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'loading', step: 'loading',
batch: createdAt, batch: batchDate,
totalEvents, totalEvents,
processedEvents, processedEvents,
}); });
// Yield control back to event loop after processing each batch eventBatch.length = 0;
await yieldToEventLoop(); await yieldToEventLoop();
} }
} }
// Process remaining events in the last batch
if (eventBatch.length > 0) { if (eventBatch.length > 0) {
const transformedEvents = eventBatch.map( await insertImportBatch(eventBatch, importId);
(
// @ts-expect-error
event,
) => providerInstance!.transformEvent(event),
);
await insertImportBatch(transformedEvents, importId);
processedEvents += eventBatch.length; processedEvents += eventBatch.length;
eventBatch.length = 0;
const createdAt = new Date(transformedEvents[0]?.created_at || '') const batchDate = new Date(eventBatch[0]?.created_at || '')
.toISOString() .toISOString()
.split('T')[0]; .split('T')[0];
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'loading', step: 'loading',
batch: createdAt, batch: batchDate,
totalEvents, totalEvents,
processedEvents, processedEvents,
}); });
eventBatch.length = 0;
}
// Yield control back to event loop after processing final batch jobLogger.info('Loading complete', { processedEvents });
// Phase 1b: Load user profiles (Mixpanel only)
const profileBatchSize = 5000;
if (
'streamProfiles' in providerInstance &&
typeof (providerInstance as MixpanelProvider).streamProfiles ===
'function'
) {
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
});
const profileBatch: IClickhouseProfile[] = [];
let processedProfiles = 0;
for await (const rawProfile of (
providerInstance as MixpanelProvider
).streamProfiles()) {
const profile = (
providerInstance as MixpanelProvider
).transformProfile(rawProfile);
profileBatch.push(profile);
if (profileBatch.length >= profileBatchSize) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
});
profileBatch.length = 0;
await yieldToEventLoop();
}
}
if (profileBatch.length > 0) {
await insertProfilesBatch(profileBatch, record.projectId);
processedProfiles += profileBatch.length;
await updateImportStatus(jobLogger, job, importId, {
step: 'loading_profiles',
processedProfiles,
totalProfiles: processedProfiles,
});
}
jobLogger.info('Profile loading complete', { processedProfiles });
}
// Phase 2: Generate gap-based session IDs (Mixpanel etc.)
if (shouldGenerateSessionIds) {
await updateImportStatus(jobLogger, job, importId, {
step: 'generating_sessions',
});
await generateGapBasedSessionIds(importId);
await yieldToEventLoop(); await yieldToEventLoop();
jobLogger.info('Session ID generation complete');
}
// Phase 3: Create session_start / session_end events
await updateImportStatus(jobLogger, job, importId, {
step: 'creating_sessions',
batch: 'all sessions',
});
await createSessionsStartEndEvents(importId);
await yieldToEventLoop();
jobLogger.info('Session event creation complete');
}
// -------------------------------------------------------
// PRODUCTION PHASE: resume-safe, track progress per batch
// -------------------------------------------------------
// Phase 3: Move staging events to production (per-day)
const resumeMovingFrom =
hasReachedProduction && record.currentStep === 'moving'
? (record.currentBatch ?? undefined)
: undefined;
// currentBatch is the last successfully completed day — resume from the next day to avoid re-inserting it
const moveFromDate = (() => {
if (!resumeMovingFrom) return undefined;
const next = new Date(`${resumeMovingFrom}T12:00:00Z`);
next.setUTCDate(next.getUTCDate() + 1);
return next.toISOString().split('T')[0]!;
})();
const bounds = await getImportDateBounds(importId, moveFromDate);
if (bounds.min && bounds.max) {
const startDate = bounds.min.split(' ')[0]!;
const endDate = bounds.max.split(' ')[0]!;
const cursor = new Date(`${startDate}T12:00:00Z`);
const end = new Date(`${endDate}T12:00:00Z`);
while (cursor <= end) {
const dateStr = cursor.toISOString().split('T')[0]!;
await moveImportsToProduction(importId, dateStr);
await updateImportStatus(jobLogger, job, importId, {
step: 'moving',
batch: dateStr,
});
await yieldToEventLoop();
cursor.setUTCDate(cursor.getUTCDate() + 1);
} }
} }
// Phase 2: Generate session IDs if provider requires it jobLogger.info('Move to production complete');
if (
shouldRunStep('generating_session_ids') &&
providerInstance.shouldGenerateSessionIds()
) {
await whileBounds(resumeGeneratingSessionIdsFrom, async (from) => {
console.log('Generating session IDs', { from });
await generateSessionIds(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'generating_session_ids',
batch: from,
});
// Yield control back to event loop after processing each day // Phase 4: Backfill sessions table
await yieldToEventLoop();
});
jobLogger.info('Session ID generation complete');
}
// Phase 3-5: Process in daily batches for robustness
if (shouldRunStep('creating_sessions')) {
await whileBounds(resumeCreatingSessionsFrom, async (from) => {
await createSessionsStartEndEvents(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'creating_sessions',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
if (shouldRunStep('moving')) {
await whileBounds(resumeMovingFrom, async (from) => {
await moveImportsToProduction(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'moving',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
if (shouldRunStep('backfilling_sessions')) {
await whileBounds(resumeBackfillingSessionsFrom, async (from) => {
await backfillSessionsToProduction(importId, from);
await updateImportStatus(jobLogger, job, importId, {
step: 'backfilling_sessions',
batch: from,
});
// Yield control back to event loop after processing each day
await yieldToEventLoop();
});
}
await markImportComplete(importId);
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'completed', step: 'backfilling_sessions',
batch: 'all sessions',
}); });
jobLogger.info('Import marked as complete'); await backfillSessionsToProduction(importId);
await yieldToEventLoop();
// Get final progress jobLogger.info('Session backfill complete');
const finalProgress = await getImportProgress(importId);
jobLogger.info('Import job completed successfully', { // Done
totalEvents: finalProgress.totalEvents, await updateImportStatus(jobLogger, job, importId, { step: 'completed' });
insertedEvents: finalProgress.insertedEvents, jobLogger.info('Import completed');
status: finalProgress.status,
});
return { return { success: true };
success: true,
totalEvents: finalProgress.totalEvents,
processedEvents: finalProgress.insertedEvents,
};
} catch (error) { } catch (error) {
jobLogger.error('Import job failed', { error }); jobLogger.error('Import job failed', { error });
// Mark import as failed
try { try {
const errorMsg = error instanceof Error ? error.message : 'Unknown error'; const errorMsg = error instanceof Error ? error.message : 'Unknown error';
await updateImportStatus(jobLogger, job, importId, { await updateImportStatus(jobLogger, job, importId, {
step: 'failed', step: 'failed',
errorMessage: errorMsg, errorMessage: errorMsg,
}); });
jobLogger.warn('Import marked as failed', { error: errorMsg });
} catch (markError) { } catch (markError) {
jobLogger.error('Failed to mark import as failed', { error, markError }); jobLogger.error('Failed to mark import as failed', { error, markError });
} }
@@ -320,7 +270,7 @@ export async function importJob(job: Job<ImportQueuePayload>) {
function createProvider( function createProvider(
record: Prisma.ImportGetPayload<{ include: { project: true } }>, record: Prisma.ImportGetPayload<{ include: { project: true } }>,
jobLogger: ILogger, jobLogger: ILogger
) { ) {
const config = record.config; const config = record.config;
switch (config.provider) { switch (config.provider) {

View File

@@ -1,6 +1,5 @@
export * from './src/prisma-client'; export * from './src/prisma-client';
export * from './src/clickhouse/client'; export * from './src/clickhouse/client';
export * from './src/clickhouse/csv';
export * from './src/sql-builder'; export * from './src/sql-builder';
export * from './src/services/chart.service'; export * from './src/services/chart.service';
export * from './src/engine'; export * from './src/engine';

View File

@@ -1,11 +1,9 @@
import { Readable } from 'node:stream';
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client'; import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
import { ClickHouseLogLevel, createClient } from '@clickhouse/client'; import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
import sqlstring from 'sqlstring';
import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config'; import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config';
import { createLogger } from '@openpanel/logger'; import { createLogger } from '@openpanel/logger';
import type { IInterval } from '@openpanel/validation'; import type { IInterval } from '@openpanel/validation';
import sqlstring from 'sqlstring';
export { createClient }; export { createClient };
@@ -68,8 +66,11 @@ export const TABLE_NAMES = {
* Non-clustered mode = self-hosted environments * Non-clustered mode = self-hosted environments
*/ */
export function isClickhouseClustered(): boolean { export function isClickhouseClustered(): boolean {
if (process.env.CLICKHOUSE_CLUSTER === 'true' || process.env.CLICKHOUSE_CLUSTER === '1') { if (
return true process.env.CLICKHOUSE_CLUSTER === 'true' ||
process.env.CLICKHOUSE_CLUSTER === '1'
) {
return true;
} }
return !( return !(
@@ -97,21 +98,21 @@ function getClickhouseSettings(): ClickHouseSettings {
return { return {
distributed_product_mode: 'allow', distributed_product_mode: 'allow',
date_time_input_format: 'best_effort', date_time_input_format: 'best_effort',
...(!process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN ...(process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
? { ? {}
: {
query_plan_convert_any_join_to_semi_or_anti_join: 0, query_plan_convert_any_join_to_semi_or_anti_join: 0,
} }),
: {}),
...additionalSettings, ...additionalSettings,
}; };
} }
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = { export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
max_open_connections: 30, max_open_connections: 30,
request_timeout: 300000, request_timeout: 300_000,
keep_alive: { keep_alive: {
enabled: true, enabled: true,
idle_socket_ttl: 60000, idle_socket_ttl: 60_000,
}, },
compression: { compression: {
request: true, request: true,
@@ -138,7 +139,7 @@ const cleanQuery = (query?: string) =>
export async function withRetry<T>( export async function withRetry<T>(
operation: () => Promise<T>, operation: () => Promise<T>,
maxRetries = 3, maxRetries = 3,
baseDelay = 500, baseDelay = 500
): Promise<T> { ): Promise<T> {
let lastError: Error | undefined; let lastError: Error | undefined;
@@ -162,7 +163,7 @@ export async function withRetry<T>(
`Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`, `Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`,
{ {
error: error.message, error: error.message,
}, }
); );
await new Promise((resolve) => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
continue; continue;
@@ -213,7 +214,7 @@ export const ch = new Proxy(originalCh, {
export async function chQueryWithMeta<T extends Record<string, any>>( export async function chQueryWithMeta<T extends Record<string, any>>(
query: string, query: string,
clickhouseSettings?: ClickHouseSettings, clickhouseSettings?: ClickHouseSettings
): Promise<ResponseJSON<T>> { ): Promise<ResponseJSON<T>> {
const start = Date.now(); const start = Date.now();
const res = await ch.query({ const res = await ch.query({
@@ -249,44 +250,16 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
return response; return response;
} }
export async function chInsertCSV(tableName: string, rows: string[]) {
try {
const now = performance.now();
// Create a readable stream in binary mode for CSV (similar to EventBuffer)
const csvStream = Readable.from(rows.join('\n'), {
objectMode: false,
});
await ch.insert({
table: tableName,
values: csvStream,
format: 'CSV',
clickhouse_settings: {
format_csv_allow_double_quotes: 1,
format_csv_allow_single_quotes: 0,
},
});
logger.info('CSV Insert successful', {
elapsed: performance.now() - now,
rows: rows.length,
});
} catch (error) {
logger.error('CSV Insert failed:', error);
throw error;
}
}
export async function chQuery<T extends Record<string, any>>( export async function chQuery<T extends Record<string, any>>(
query: string, query: string,
clickhouseSettings?: ClickHouseSettings, clickhouseSettings?: ClickHouseSettings
): Promise<T[]> { ): Promise<T[]> {
return (await chQueryWithMeta<T>(query, clickhouseSettings)).data; return (await chQueryWithMeta<T>(query, clickhouseSettings)).data;
} }
export function formatClickhouseDate( export function formatClickhouseDate(
date: Date | string, date: Date | string,
skipTime = false, skipTime = false
): string { ): string {
if (skipTime) { if (skipTime) {
return new Date(date).toISOString().split('T')[0]!; return new Date(date).toISOString().split('T')[0]!;

View File

@@ -1,53 +0,0 @@
// ClickHouse Map(String, String) format in CSV uses single quotes, not JSON double quotes
// Format: '{'key1':'value1','key2':'value2'}'
// Single quotes inside values must be escaped with backslash: \'
// We also need to escape newlines and control characters to prevent CSV parsing issues
const escapeMapValue = (str: string) => {
return str
.replace(/\\/g, '\\\\') // Escape backslashes first
.replace(/'/g, "\\'") // Escape single quotes
.replace(/\n/g, '\\n') // Escape newlines
.replace(/\r/g, '\\r') // Escape carriage returns
.replace(/\t/g, '\\t') // Escape tabs
.replace(/\0/g, '\\0'); // Escape null bytes
};
export const csvEscapeJson = (
value: Record<string, unknown> | null | undefined,
): string => {
if (value == null) return '';
// Normalize to strings if your column is Map(String,String)
const normalized: Record<string, string> = Object.fromEntries(
Object.entries(value).map(([k, v]) => [
String(k),
v == null ? '' : String(v),
]),
);
// Empty object should return empty Map (without quotes, csvEscapeField will handle if needed)
if (Object.keys(normalized).length === 0) return '{}';
const pairs = Object.entries(normalized)
.map(([k, v]) => `'${escapeMapValue(k)}':'${escapeMapValue(v)}'`)
.join(',');
// Return Map format without outer quotes - csvEscapeField will handle CSV escaping
// This allows csvEscapeField to properly wrap/escape the entire field if it contains newlines/quotes
return csvEscapeField(`{${pairs}}`);
};
// Escape a CSV field - wrap in double quotes if it contains commas, quotes, or newlines
// Double quotes inside must be doubled (""), per CSV standard
export const csvEscapeField = (value: string | number): string => {
const str = String(value);
// If field contains commas, quotes, or newlines, it must be quoted
if (/[,"\n\r]/.test(str)) {
// Escape double quotes by doubling them
const escaped = str.replace(/"/g, '""');
return `"${escaped}"`;
}
return str;
};

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: '$mp_web_page_view', event: '$mp_web_page_view',
properties: { properties: {
time: 1746097970, time: 1_746_097_970,
distinct_id: '$device:123', distinct_id: '$device:123',
$browser: 'Chrome', $browser: 'Chrome',
$browser_version: 135, $browser_version: 135,
@@ -53,7 +53,7 @@ describe('mixpanel', () => {
$insert_id: 'source_id', $insert_id: 'source_id',
$lib_version: '2.60.0', $lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com', $mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1746078175363, $mp_api_timestamp_ms: 1_746_078_175_363,
$mp_autocapture: true, $mp_autocapture: true,
$os: 'Android', $os: 'Android',
$referrer: 'https://google.com/', $referrer: 'https://google.com/',
@@ -71,7 +71,7 @@ describe('mixpanel', () => {
gclid: 'oqneoqow', gclid: 'oqneoqow',
mp_country_code: 'IN', mp_country_code: 'IN',
mp_lib: 'web', mp_lib: 'web',
mp_processing_time_ms: 1746078175546, mp_processing_time_ms: 1_746_078_175_546,
mp_sent_by_lib_version: '2.60.0', mp_sent_by_lib_version: '2.60.0',
utm_medium: 'cpc', utm_medium: 'cpc',
utm_source: 'google', utm_source: 'google',
@@ -101,7 +101,7 @@ describe('mixpanel', () => {
__title: __title:
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2', 'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
}, },
created_at: '2025-05-01T11:12:50.000Z', created_at: '2025-05-01 11:12:50',
country: 'IN', country: 'IN',
city: 'Mumbai', city: 'Mumbai',
region: 'Maharashtra', region: 'Maharashtra',
@@ -110,7 +110,7 @@ describe('mixpanel', () => {
os: 'Android', os: 'Android',
os_version: undefined, os_version: undefined,
browser: 'Chrome', browser: 'Chrome',
browser_version: '', browser_version: '135',
device: 'mobile', device: 'mobile',
brand: '', brand: '',
model: '', model: '',
@@ -141,7 +141,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: 'custom_event', event: 'custom_event',
properties: { properties: {
time: 1746097970, time: 1_746_097_970,
distinct_id: '$device:123', distinct_id: '$device:123',
$device_id: '123', $device_id: '123',
$user_id: 'user123', $user_id: 'user123',
@@ -192,7 +192,7 @@ describe('mixpanel', () => {
const rawEvent = { const rawEvent = {
event: 'ec_search_error', event: 'ec_search_error',
properties: { properties: {
time: 1759947367, time: 1_759_947_367,
distinct_id: '3385916', distinct_id: '3385916',
$browser: 'Mobile Safari', $browser: 'Mobile Safari',
$browser_version: null, $browser_version: null,
@@ -207,7 +207,7 @@ describe('mixpanel', () => {
$insert_id: 'bclkaepeqcfuzt4v', $insert_id: 'bclkaepeqcfuzt4v',
$lib_version: '2.60.0', $lib_version: '2.60.0',
$mp_api_endpoint: 'api-js.mixpanel.com', $mp_api_endpoint: 'api-js.mixpanel.com',
$mp_api_timestamp_ms: 1759927570699, $mp_api_timestamp_ms: 1_759_927_570_699,
$os: 'iOS', $os: 'iOS',
$region: 'Karnataka', $region: 'Karnataka',
$screen_height: 852, $screen_height: 852,
@@ -225,7 +225,7 @@ describe('mixpanel', () => {
language: 'english', language: 'english',
mp_country_code: 'IN', mp_country_code: 'IN',
mp_lib: 'web', mp_lib: 'web',
mp_processing_time_ms: 1759927592421, mp_processing_time_ms: 1_759_927_592_421,
mp_sent_by_lib_version: '2.60.0', mp_sent_by_lib_version: '2.60.0',
os: 'web', os: 'web',
osVersion: osVersion:
@@ -249,15 +249,15 @@ describe('mixpanel', () => {
expect(res.id.length).toBeGreaterThan(30); expect(res.id.length).toBeGreaterThan(30);
expect(res.imported_at).toMatch( expect(res.imported_at).toMatch(
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
); );
expect(omit(['id', 'imported_at'], res)).toEqual({ expect(omit(['id', 'imported_at'], res)).toEqual({
brand: 'Apple', brand: 'Apple',
browser: 'GSA', browser: 'GSA',
browser_version: 'null', browser_version: '388.0.811331708',
city: 'Bengaluru', city: 'Bengaluru',
country: 'IN', country: 'IN',
created_at: '2025-10-08T18:16:07.000Z', created_at: '2025-10-08 18:16:07',
device: 'mobile', device: 'mobile',
device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c', device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
duration: 0, duration: 0,

View File

@@ -1,8 +1,13 @@
import { randomUUID } from 'node:crypto'; import { randomUUID } from 'node:crypto';
import { isSameDomain, parsePath, toDots } from '@openpanel/common'; import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server'; import {
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server'; getReferrerWithQuery,
import type { IClickhouseEvent } from '@openpanel/db'; parseReferrer,
parseUserAgent,
type UserAgentInfo,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { IClickhouseProfile } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger'; import type { ILogger } from '@openpanel/logger';
import type { IMixpanelImportConfig } from '@openpanel/validation'; import type { IMixpanelImportConfig } from '@openpanel/validation';
import { z } from 'zod'; import { z } from 'zod';
@@ -15,22 +20,88 @@ export const zMixpanelRawEvent = z.object({
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>; export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
/** Engage API profile: https://docs.mixpanel.com/docs/export-methods#exporting-profiles */
export const zMixpanelRawProfile = z.object({
$distinct_id: z.union([z.string(), z.number()]),
$properties: z.record(z.unknown()).optional().default({}),
});
export type MixpanelRawProfile = z.infer<typeof zMixpanelRawProfile>;
class MixpanelRateLimitError extends Error {
readonly retryAfterMs?: number;
constructor(message: string, retryAfterMs?: number) {
super(message);
this.name = 'MixpanelRateLimitError';
this.retryAfterMs = retryAfterMs;
}
}
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> { export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
provider = 'mixpanel'; provider = 'mixpanel';
version = '1.0.0'; version = '1.0.0';
private static readonly MAX_REQUESTS_PER_HOUR = 100;
private static readonly MIN_REQUEST_INTERVAL_MS = 334; // 3 QPS limit
private requestTimestamps: number[] = [];
private lastRequestTime = 0;
constructor( constructor(
private readonly projectId: string, private readonly projectId: string,
private readonly config: IMixpanelImportConfig, private readonly config: IMixpanelImportConfig,
private readonly logger?: ILogger, private readonly logger?: ILogger
) { ) {
super(); super();
} }
async getTotalEventsCount(): Promise<number> { private async waitForRateLimit(): Promise<void> {
const now = Date.now();
const oneHourAgo = now - 60 * 60 * 1000;
// Prune timestamps older than 1 hour
this.requestTimestamps = this.requestTimestamps.filter(
(t) => t > oneHourAgo
);
// Enforce per-second limit (3 QPS → min 334ms gap)
const timeSinceLast = now - this.lastRequestTime;
if (timeSinceLast < MixpanelProvider.MIN_REQUEST_INTERVAL_MS) {
const delay = MixpanelProvider.MIN_REQUEST_INTERVAL_MS - timeSinceLast;
await new Promise((resolve) => setTimeout(resolve, delay));
}
// Enforce hourly limit
if (
this.requestTimestamps.length >= MixpanelProvider.MAX_REQUESTS_PER_HOUR
) {
const oldestInWindow = this.requestTimestamps[0]!;
const waitUntil = oldestInWindow + 60 * 60 * 1000;
const waitMs = waitUntil - Date.now() + 1000; // +1s buffer
if (waitMs > 0) {
this.logger?.info(
`Rate limit: ${this.requestTimestamps.length} requests in the last hour, waiting ${Math.ceil(waitMs / 1000)}s`,
{
requestsInWindow: this.requestTimestamps.length,
waitMs,
}
);
await new Promise((resolve) => setTimeout(resolve, waitMs));
// Prune again after waiting
this.requestTimestamps = this.requestTimestamps.filter(
(t) => t > Date.now() - 60 * 60 * 1000
);
}
}
this.lastRequestTime = Date.now();
this.requestTimestamps.push(Date.now());
}
getTotalEventsCount(): Promise<number> {
// Mixpanel sucks and dont provide a good way to extract total event count within a period // Mixpanel sucks and dont provide a good way to extract total event count within a period
// jql would work but not accurate and will be deprecated end of 2025 // jql would work but not accurate and will be deprecated end of 2025
return -1; return Promise.resolve(-1);
} }
/** /**
@@ -42,13 +113,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
} }
async *parseSource( async *parseSource(
overrideFrom?: string, overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
yield* this.fetchEventsFromMixpanel(overrideFrom); yield* this.fetchEventsFromMixpanel(overrideFrom);
} }
private async *fetchEventsFromMixpanel( private async *fetchEventsFromMixpanel(
overrideFrom?: string, overrideFrom?: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const { serviceAccount, serviceSecret, projectId, from, to } = this.config; const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
@@ -58,20 +129,24 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
for (const [chunkFrom, chunkTo] of dateChunks) { for (const [chunkFrom, chunkTo] of dateChunks) {
let retries = 0; let retries = 0;
const maxRetries = 3; const maxRetries = 6;
while (retries <= maxRetries) { while (retries <= maxRetries) {
try { try {
await this.waitForRateLimit();
yield* this.fetchEventsForDateRange( yield* this.fetchEventsForDateRange(
serviceAccount, serviceAccount,
serviceSecret, serviceSecret,
projectId, projectId,
chunkFrom, chunkFrom,
chunkTo, chunkTo
); );
break; // Success, move to next chunk break; // Success, move to next chunk
} catch (error) { } catch (error) {
retries++; retries++;
const isRateLimit =
error instanceof MixpanelRateLimitError ||
(error instanceof Error && error.message.includes('429'));
const isLastRetry = retries > maxRetries; const isLastRetry = retries > maxRetries;
this.logger?.warn('Failed to fetch events for date range', { this.logger?.warn('Failed to fetch events for date range', {
@@ -80,22 +155,31 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
attempt: retries, attempt: retries,
maxRetries, maxRetries,
error: (error as Error).message, error: (error as Error).message,
isRateLimit,
willRetry: !isLastRetry, willRetry: !isLastRetry,
}); });
if (isLastRetry) { if (isLastRetry) {
// Final attempt failed, re-throw
throw new Error( throw new Error(
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`, `Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`
); );
} }
// Exponential backoff: wait before retrying let delay: number;
const delay = Math.min(1000 * 2 ** (retries - 1), 60_000); // Cap at 1 minute if (error instanceof MixpanelRateLimitError && error.retryAfterMs) {
delay = error.retryAfterMs;
} else if (isRateLimit) {
// 5min → 10min → 15min → 15min → 15min = 60min total
delay = Math.min(300_000 * 2 ** (retries - 1), 900_000);
} else {
delay = Math.min(1000 * 2 ** (retries - 1), 60_000);
}
this.logger?.info('Retrying after delay', { this.logger?.info('Retrying after delay', {
delayMs: delay, delayMs: delay,
chunkFrom, chunkFrom,
chunkTo, chunkTo,
isRateLimit,
}); });
await new Promise((resolve) => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
@@ -108,7 +192,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
serviceSecret: string, serviceSecret: string,
projectId: string, projectId: string,
from: string, from: string,
to: string, to: string
): AsyncGenerator<MixpanelRawEvent, void, unknown> { ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
const url = 'https://data.mixpanel.com/api/2.0/export'; const url = 'https://data.mixpanel.com/api/2.0/export';
@@ -134,9 +218,18 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
}, },
}); });
if (response.status === 429) {
const retryAfter = response.headers.get('Retry-After');
const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
throw new MixpanelRateLimitError(
'Mixpanel rate limit exceeded (429)',
retryAfterMs
);
}
if (!response.ok) { if (!response.ok) {
throw new Error( throw new Error(
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`, `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`
); );
} }
@@ -153,7 +246,9 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
while (true) { while (true) {
const { done, value } = await reader.read(); const { done, value } = await reader.read();
if (done) break; if (done) {
break;
}
buffer += decoder.decode(value, { stream: true }); buffer += decoder.decode(value, { stream: true });
@@ -187,7 +282,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
{ {
line: buffer.substring(0, 100), line: buffer.substring(0, 100),
error, error,
}, }
); );
} }
} }
@@ -196,6 +291,114 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
} }
} }
/**
 * Stream user profiles from the Mixpanel Engage (Query Profiles) API.
 *
 * Requests pages of up to 5,000 profiles, validates every row against
 * `zMixpanelRawProfile`, and yields the rows that pass. Rows that fail
 * validation are logged (truncated to 200 characters) and skipped.
 *
 * Each request first goes through `waitForRateLimit()`.
 *
 * @yields Each profile that passes schema validation.
 * @throws MixpanelRateLimitError when Mixpanel answers with HTTP 429. The
 *   error carries the `Retry-After` delay in milliseconds when that header
 *   holds a number of seconds, otherwise `undefined`.
 * @throws Error for any other non-OK response; the message includes the
 *   status and the response body.
 */
async *streamProfiles(): AsyncGenerator<MixpanelRawProfile, void, unknown> {
  const { serviceAccount, serviceSecret, projectId } = this.config;
  const requestedPageSize = 5000;

  // Loop-invariant request parts, built once instead of once per page.
  const url = `https://mixpanel.com/api/query/engage?project_id=${encodeURIComponent(projectId)}`;
  const authorization = `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`;

  let page = 0;
  // Pagination token returned by Mixpanel. The Engage API documents that
  // follow-up pages must be requested with the `session_id` of the previous
  // response; without it later pages are not guaranteed to belong to the
  // same result set.
  let sessionId: string | undefined;

  while (true) {
    await this.waitForRateLimit();

    const body = new URLSearchParams({
      page: String(page),
      page_size: String(requestedPageSize),
    });
    if (sessionId) {
      body.set('session_id', sessionId);
    }

    this.logger?.info('Fetching profiles from Mixpanel Engage', {
      page,
      page_size: requestedPageSize,
      projectId,
    });

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: authorization,
        Accept: 'application/json',
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      body: body.toString(),
    });

    if (response.status === 429) {
      // Retry-After may be delay-seconds or an HTTP-date. Only the numeric
      // form is converted; anything else is reported as "unknown" rather
      // than as NaN milliseconds.
      const retryAfter = response.headers.get('Retry-After');
      const retryAfterSeconds = retryAfter ? Number(retryAfter) : Number.NaN;
      const retryAfterMs = Number.isFinite(retryAfterSeconds)
        ? retryAfterSeconds * 1000
        : undefined;
      throw new MixpanelRateLimitError(
        'Mixpanel rate limit exceeded (429)',
        retryAfterMs
      );
    }

    if (!response.ok) {
      const text = await response.text();
      throw new Error(
        `Failed to fetch profiles from Mixpanel: ${response.status} ${response.statusText} - ${text}`
      );
    }

    const data = (await response.json()) as {
      results?: Array<{
        $distinct_id: string | number;
        $properties?: Record<string, unknown>;
      }>;
      page?: number;
      page_size?: number;
      session_id?: string;
      total?: number;
    };

    if (typeof data.session_id === 'string' && data.session_id !== '') {
      sessionId = data.session_id;
    }

    const results = data.results ?? [];
    for (const row of results) {
      const parsed = zMixpanelRawProfile.safeParse(row);
      if (parsed.success) {
        yield parsed.data;
      } else {
        this.logger?.warn('Skipping invalid Mixpanel profile', {
          row: JSON.stringify(row).slice(0, 200),
        });
      }
    }

    // Compare against the page size the server says it used. If Mixpanel
    // caps the page below the requested 5,000, checking against the
    // requested size would end the import after the first page.
    const effectivePageSize =
      typeof data.page_size === 'number' && data.page_size > 0
        ? data.page_size
        : requestedPageSize;
    if (results.length < effectivePageSize) {
      break;
    }
    page++;
  }
}
/**
 * Map a Mixpanel Engage profile to an OpenPanel `IClickhouseProfile`.
 *
 * - `id` is the distinct id as a string with a leading `$device:` removed.
 * - `created_at` comes from `$created`. When that property is missing or
 *   cannot be parsed as a date, the current time is used instead.
 * - `first_name`, `last_name`, `email` and `avatar` are read from the
 *   matching `$`-prefixed Mixpanel properties (`$avatar`, then `$image`,
 *   for the avatar) and default to an empty string.
 * - `properties` holds every remaining property whose key does not start
 *   with `$` and whose value is not null/undefined. Objects are
 *   JSON-stringified; everything else goes through `String()`.
 *
 * @param raw Profile row as yielded by the Engage API.
 * @returns The profile, flagged `is_external` and bound to `this.projectId`.
 * @throws When `raw` does not satisfy `zMixpanelRawProfile`.
 */
transformProfile(raw: MixpanelRawProfile): IClickhouseProfile {
  const parsed = zMixpanelRawProfile.parse(raw);
  const props = (parsed.$properties || {}) as Record<string, unknown>;
  const id = String(parsed.$distinct_id).replace(/^\$device:/, '');

  // An unparseable `$created` yields an Invalid Date. Fall back to "now",
  // the same default used when the property is absent, instead of handing
  // an Invalid Date to formatClickhouseDate.
  // NOTE(review): formatClickhouseDate is not visible here; presumably it
  // would throw or emit a garbage string for an Invalid Date. Confirm.
  const candidate = props.$created
    ? new Date(String(props.$created))
    : new Date();
  const createdAt = formatClickhouseDate(
    Number.isNaN(candidate.getTime()) ? new Date() : candidate
  );

  const properties: Record<string, string> = {};
  for (const [key, value] of Object.entries(props)) {
    // `$`-prefixed keys are Mixpanel's reserved properties; the ones that
    // matter are mapped to dedicated columns below.
    if (key.startsWith('$') || value == null) {
      continue;
    }
    properties[key] =
      typeof value === 'object' ? JSON.stringify(value) : String(value);
  }

  return {
    id,
    project_id: this.projectId,
    first_name: String(props.$first_name ?? ''),
    last_name: String(props.$last_name ?? ''),
    email: String(props.$email ?? ''),
    avatar: String(props.$avatar ?? props.$image ?? ''),
    properties,
    created_at: createdAt,
    is_external: true,
  };
}
validate(rawEvent: MixpanelRawEvent): boolean { validate(rawEvent: MixpanelRawEvent): boolean {
const res = zMixpanelRawEvent.safeParse(rawEvent); const res = zMixpanelRawEvent.safeParse(rawEvent);
return res.success; return res.success;
@@ -208,7 +411,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
const deviceId = props.$device_id; const deviceId = props.$device_id;
const profileId = String(props.$user_id || props.distinct_id).replace( const profileId = String(props.$user_id || props.distinct_id).replace(
/^\$device:/, /^\$device:/,
'', ''
); );
// Build full URL from current_url and current_url_search (web only) // Build full URL from current_url and current_url_search (web only)
@@ -309,7 +512,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
project_id: projectId, project_id: projectId,
session_id: '', // Will be generated in SQL after import session_id: '', // Will be generated in SQL after import
properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String) properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
created_at: new Date(props.time * 1000).toISOString(), created_at: formatClickhouseDate(new Date(props.time * 1000)),
country, country,
city, city,
region, region,
@@ -318,10 +521,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
os: uaInfo.os || props.$os, os: uaInfo.os || props.$os,
os_version: uaInfo.osVersion || props.$osVersion, os_version: uaInfo.osVersion || props.$osVersion,
browser: uaInfo.browser || props.$browser, browser: uaInfo.browser || props.$browser,
browser_version: browser_version: uaInfo.browserVersion || String(props.$browser_version ?? ''),
uaInfo.browserVersion || props.$browserVersion
? String(props.$browser_version)
: '',
device: this.getDeviceType(props.mp_lib, uaInfo, props), device: this.getDeviceType(props.mp_lib, uaInfo, props),
brand: uaInfo.brand || '', brand: uaInfo.brand || '',
model: uaInfo.model || '', model: uaInfo.model || '',
@@ -338,14 +538,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
sdk_version: this.version, sdk_version: this.version,
}; };
// TODO: Remove this
// Temporary fix for a client
const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
if (isMightBeScreenView && event.name === 'Loaded a Screen') {
event.name = 'screen_view';
event.path = isMightBeScreenView;
}
// TODO: Remove this // TODO: Remove this
// This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects) // This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
if (props.utm_source && !properties.__query?.utm_source) { if (props.utm_source && !properties.__query?.utm_source) {
@@ -371,13 +563,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
private getDeviceType( private getDeviceType(
mp_lib: string, mp_lib: string,
uaInfo: UserAgentInfo, uaInfo: UserAgentInfo,
props: Record<string, any>, props: Record<string, any>
) { ) {
// Normalize lib/os/browser data // Normalize lib/os/browser data
const lib = (mp_lib || '').toLowerCase(); const lib = (mp_lib || '').toLowerCase();
const os = String(props.$os || uaInfo.os || '').toLowerCase(); const os = String(props.$os || uaInfo.os || '').toLowerCase();
const browser = String( const browser = String(
props.$browser || uaInfo.browser || '', props.$browser || uaInfo.browser || ''
).toLowerCase(); ).toLowerCase();
const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad'; const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
@@ -431,11 +623,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return !this.isWebEvent(mp_lib); return !this.isWebEvent(mp_lib);
} }
private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
const props = rawEvent.properties as Record<string, any>;
return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
}
private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo { private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
// For mobile events, extract device information from Mixpanel properties // For mobile events, extract device information from Mixpanel properties
const os = props.$os || props.os || ''; const os = props.$os || props.os || '';
@@ -446,19 +633,19 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
return { return {
isServer: true, isServer: true,
os: os, os,
osVersion: osVersion, osVersion,
browser: '', browser: '',
browserVersion: '', browserVersion: '',
device: device, device,
brand: brand, brand,
model: model, model,
}; };
} }
private stripMixpanelProperties( private stripMixpanelProperties(
properties: Record<string, any>, properties: Record<string, any>,
searchParams: Record<string, string>, searchParams: Record<string, string>
): Record<string, any> { ): Record<string, any> {
const strip = [ const strip = [
'time', 'time',
@@ -472,8 +659,8 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
]; ];
const filtered = Object.fromEntries( const filtered = Object.fromEntries(
Object.entries(properties).filter( Object.entries(properties).filter(
([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key), ([key]) => !(key.match(/^(\$|mp_|utm_)/) || strip.includes(key))
), )
); );
// Parse JSON strings back to objects/arrays so toDots() can flatten them // Parse JSON strings back to objects/arrays so toDots() can flatten them

View File

@@ -2,10 +2,13 @@ import { randomUUID } from 'node:crypto';
import { Readable } from 'node:stream'; import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises'; import { pipeline } from 'node:stream/promises';
import { createBrotliDecompress, createGunzip } from 'node:zlib'; import { createBrotliDecompress, createGunzip } from 'node:zlib';
import { isSameDomain, parsePath } from '@openpanel/common'; import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { generateDeviceId } from '@openpanel/common/server'; import {
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server'; generateDeviceId,
import type { IClickhouseEvent } from '@openpanel/db'; getReferrerWithQuery,
parseReferrer,
} from '@openpanel/common/server';
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger'; import type { ILogger } from '@openpanel/logger';
import type { IUmamiImportConfig } from '@openpanel/validation'; import type { IUmamiImportConfig } from '@openpanel/validation';
import { parse } from 'csv-parse'; import { parse } from 'csv-parse';
@@ -63,7 +66,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
constructor( constructor(
private readonly projectId: string, private readonly projectId: string,
private readonly config: IUmamiImportConfig, private readonly config: IUmamiImportConfig,
private readonly logger?: ILogger, private readonly logger?: ILogger
) { ) {
super(); super();
} }
@@ -82,7 +85,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
signal?: AbortSignal; signal?: AbortSignal;
maxBytes?: number; maxBytes?: number;
maxRows?: number; maxRows?: number;
} = {}, } = {}
): AsyncGenerator<UmamiRawEvent, void, unknown> { ): AsyncGenerator<UmamiRawEvent, void, unknown> {
const { signal, maxBytes, maxRows } = opts; const { signal, maxBytes, maxRows } = opts;
const controller = new AbortController(); const controller = new AbortController();
@@ -95,9 +98,9 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
const res = await fetch(url, { signal: controller.signal }); const res = await fetch(url, { signal: controller.signal });
if (!res.ok || !res.body) { if (!(res.ok && res.body)) {
throw new Error( throw new Error(
`Failed to fetch remote file: ${res.status} ${res.statusText}`, `Failed to fetch remote file: ${res.status} ${res.statusText}`
); );
} }
@@ -108,15 +111,15 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if ( if (
contentType && contentType &&
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test( !/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
contentType, contentType
) )
) { ) {
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`); this.logger?.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
} }
if (maxBytes && contentLen && contentLen > maxBytes) { if (maxBytes && contentLen && contentLen > maxBytes) {
throw new Error( throw new Error(
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`, `Remote file exceeds size limit (${contentLen} > ${maxBytes})`
); );
} }
@@ -137,9 +140,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
if (seenBytes > maxBytes) { if (seenBytes > maxBytes) {
controller.abort(); controller.abort();
body.destroy( body.destroy(
new Error( new Error(`Stream exceeded size limit (${seenBytes} > ${maxBytes})`)
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
),
); );
} }
}); });
@@ -190,7 +191,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
throw new Error( throw new Error(
`Failed to parse remote file from ${url}: ${ `Failed to parse remote file from ${url}: ${
err instanceof Error ? err.message : String(err) err instanceof Error ? err.message : String(err)
}`, }`
); );
} finally { } finally {
controller.abort(); // ensure fetch stream is torn down controller.abort(); // ensure fetch stream is torn down
@@ -205,7 +206,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent { transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
const projectId = const projectId =
this.config.projectMapper.find( this.config.projectMapper.find(
(mapper) => mapper.from === _rawEvent.website_id, (mapper) => mapper.from === _rawEvent.website_id
)?.to || this.projectId; )?.to || this.projectId;
const rawEvent = zUmamiRawEvent.parse(_rawEvent); const rawEvent = zUmamiRawEvent.parse(_rawEvent);
@@ -261,39 +262,50 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
// Add useful properties from Umami data // Add useful properties from Umami data
if (rawEvent.page_title) properties.__title = rawEvent.page_title; if (rawEvent.page_title) {
if (rawEvent.screen) properties.__screen = rawEvent.screen; properties.__title = rawEvent.page_title;
if (rawEvent.language) properties.__language = rawEvent.language; }
if (rawEvent.utm_source) if (rawEvent.screen) {
properties.__screen = rawEvent.screen;
}
if (rawEvent.language) {
properties.__language = rawEvent.language;
}
if (rawEvent.utm_source) {
properties = assocPath( properties = assocPath(
['__query', 'utm_source'], ['__query', 'utm_source'],
rawEvent.utm_source, rawEvent.utm_source,
properties, properties
); );
if (rawEvent.utm_medium) }
if (rawEvent.utm_medium) {
properties = assocPath( properties = assocPath(
['__query', 'utm_medium'], ['__query', 'utm_medium'],
rawEvent.utm_medium, rawEvent.utm_medium,
properties, properties
); );
if (rawEvent.utm_campaign) }
if (rawEvent.utm_campaign) {
properties = assocPath( properties = assocPath(
['__query', 'utm_campaign'], ['__query', 'utm_campaign'],
rawEvent.utm_campaign, rawEvent.utm_campaign,
properties, properties
); );
if (rawEvent.utm_content) }
if (rawEvent.utm_content) {
properties = assocPath( properties = assocPath(
['__query', 'utm_content'], ['__query', 'utm_content'],
rawEvent.utm_content, rawEvent.utm_content,
properties, properties
); );
if (rawEvent.utm_term) }
if (rawEvent.utm_term) {
properties = assocPath( properties = assocPath(
['__query', 'utm_term'], ['__query', 'utm_term'],
rawEvent.utm_term, rawEvent.utm_term,
properties, properties
); );
}
return { return {
id: rawEvent.event_id || randomUUID(), id: rawEvent.event_id || randomUUID(),
@@ -302,8 +314,8 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
profile_id: profileId, profile_id: profileId,
project_id: projectId, project_id: projectId,
session_id: rawEvent.session_id || '', session_id: rawEvent.session_id || '',
properties, properties: toDots(properties),
created_at: rawEvent.created_at.toISOString(), created_at: formatClickhouseDate(rawEvent.created_at),
country, country,
city, city,
region: this.mapRegion(region), region: this.mapRegion(region),
@@ -329,7 +341,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
} }
mapRegion(region: string): string { mapRegion(region: string): string {
return region.replace(/^[A-Z]{2}\-/, ''); return region.replace(/^[A-Z]{2}-/, '');
} }
mapDevice(device: string): string { mapDevice(device: string): string {