fix: redo how the importer works
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
export * from './src/prisma-client';
|
||||
export * from './src/clickhouse/client';
|
||||
export * from './src/clickhouse/csv';
|
||||
export * from './src/sql-builder';
|
||||
export * from './src/services/chart.service';
|
||||
export * from './src/engine';
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import { Readable } from 'node:stream';
|
||||
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
|
||||
import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
|
||||
import sqlstring from 'sqlstring';
|
||||
|
||||
import type { NodeClickHouseClientConfigOptions } from '@clickhouse/client/dist/config';
|
||||
import { createLogger } from '@openpanel/logger';
|
||||
import type { IInterval } from '@openpanel/validation';
|
||||
import sqlstring from 'sqlstring';
|
||||
|
||||
export { createClient };
|
||||
|
||||
@@ -68,8 +66,11 @@ export const TABLE_NAMES = {
|
||||
* Non-clustered mode = self-hosted environments
|
||||
*/
|
||||
export function isClickhouseClustered(): boolean {
|
||||
if (process.env.CLICKHOUSE_CLUSTER === 'true' || process.env.CLICKHOUSE_CLUSTER === '1') {
|
||||
return true
|
||||
if (
|
||||
process.env.CLICKHOUSE_CLUSTER === 'true' ||
|
||||
process.env.CLICKHOUSE_CLUSTER === '1'
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return !(
|
||||
@@ -97,21 +98,21 @@ function getClickhouseSettings(): ClickHouseSettings {
|
||||
return {
|
||||
distributed_product_mode: 'allow',
|
||||
date_time_input_format: 'best_effort',
|
||||
...(!process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
|
||||
? {
|
||||
...(process.env.CLICKHOUSE_SETTINGS_REMOVE_CONVERT_ANY_JOIN
|
||||
? {}
|
||||
: {
|
||||
query_plan_convert_any_join_to_semi_or_anti_join: 0,
|
||||
}
|
||||
: {}),
|
||||
}),
|
||||
...additionalSettings,
|
||||
};
|
||||
}
|
||||
|
||||
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
|
||||
max_open_connections: 30,
|
||||
request_timeout: 300000,
|
||||
request_timeout: 300_000,
|
||||
keep_alive: {
|
||||
enabled: true,
|
||||
idle_socket_ttl: 60000,
|
||||
idle_socket_ttl: 60_000,
|
||||
},
|
||||
compression: {
|
||||
request: true,
|
||||
@@ -138,7 +139,7 @@ const cleanQuery = (query?: string) =>
|
||||
export async function withRetry<T>(
|
||||
operation: () => Promise<T>,
|
||||
maxRetries = 3,
|
||||
baseDelay = 500,
|
||||
baseDelay = 500
|
||||
): Promise<T> {
|
||||
let lastError: Error | undefined;
|
||||
|
||||
@@ -162,7 +163,7 @@ export async function withRetry<T>(
|
||||
`Attempt ${attempt + 1}/${maxRetries} failed, retrying in ${delay}ms`,
|
||||
{
|
||||
error: error.message,
|
||||
},
|
||||
}
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
continue;
|
||||
@@ -213,7 +214,7 @@ export const ch = new Proxy(originalCh, {
|
||||
|
||||
export async function chQueryWithMeta<T extends Record<string, any>>(
|
||||
query: string,
|
||||
clickhouseSettings?: ClickHouseSettings,
|
||||
clickhouseSettings?: ClickHouseSettings
|
||||
): Promise<ResponseJSON<T>> {
|
||||
const start = Date.now();
|
||||
const res = await ch.query({
|
||||
@@ -249,44 +250,16 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
 * Inserts pre-rendered CSV lines into a ClickHouse table.
 *
 * The rows are joined with newlines and handed to `ch.insert` as a single
 * non-object-mode stream using the `CSV` format, with double-quoted fields
 * enabled and single-quoted fields disabled.
 *
 * @param tableName - Target table passed straight to `ch.insert`.
 * @param rows - CSV lines, one per record, already escaped by the caller.
 * @throws Re-throws whatever `ch.insert` rejects with, after logging it.
 */
export async function chInsertCSV(tableName: string, rows: string[]) {
  try {
    const startedAt = performance.now();

    // Binary (non-object) stream so the payload is sent as raw CSV text.
    const payload = Readable.from(rows.join('\n'), {
      objectMode: false,
    });

    const csvSettings = {
      format_csv_allow_double_quotes: 1,
      format_csv_allow_single_quotes: 0,
    };

    await ch.insert({
      table: tableName,
      values: payload,
      format: 'CSV',
      clickhouse_settings: csvSettings,
    });

    const elapsed = performance.now() - startedAt;
    logger.info('CSV Insert successful', {
      elapsed,
      rows: rows.length,
    });
  } catch (error) {
    // Log for diagnostics, then let the caller decide how to recover.
    logger.error('CSV Insert failed:', error);
    throw error;
  }
}
|
||||
|
||||
export async function chQuery<T extends Record<string, any>>(
|
||||
query: string,
|
||||
clickhouseSettings?: ClickHouseSettings,
|
||||
clickhouseSettings?: ClickHouseSettings
|
||||
): Promise<T[]> {
|
||||
return (await chQueryWithMeta<T>(query, clickhouseSettings)).data;
|
||||
}
|
||||
|
||||
export function formatClickhouseDate(
|
||||
date: Date | string,
|
||||
skipTime = false,
|
||||
skipTime = false
|
||||
): string {
|
||||
if (skipTime) {
|
||||
return new Date(date).toISOString().split('T')[0]!;
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
// ClickHouse Map(String, String) format in CSV uses single quotes, not JSON double quotes
// Format: '{'key1':'value1','key2':'value2'}'
// Single quotes inside values must be escaped with backslash: \'
// We also need to escape newlines and control characters to prevent CSV parsing issues
const escapeMapValue = (str: string): string => {
  return str
    .replace(/\\/g, '\\\\') // Escape backslashes first so later escapes are not doubled
    .replace(/'/g, "\\'") // Escape single quotes
    .replace(/\n/g, '\\n') // Escape newlines
    .replace(/\r/g, '\\r') // Escape carriage returns
    .replace(/\t/g, '\\t') // Escape tabs
    .replace(/\0/g, '\\0'); // Escape null bytes
};

/**
 * Renders a single map value as a string.
 *
 * `null`/`undefined` become the empty string. Objects and arrays are
 * JSON-encoded because `String()` would collapse them to "[object Object]"
 * (or a lossy comma join) and silently destroy the data. Everything else goes
 * through `String()`.
 */
const stringifyMapValue = (value: unknown): string => {
  if (value == null) {
    return '';
  }
  if (typeof value === 'object') {
    try {
      return JSON.stringify(value) ?? '';
    } catch {
      // Circular structures / BigInt members: fall back to the old behaviour.
      return String(value);
    }
  }
  return String(value);
};

/**
 * Serialises a record as a ClickHouse `Map(String, String)` literal suitable
 * for a CSV field.
 *
 * @param value - Record to serialise; `null`/`undefined` yield an empty field.
 * @returns `''` for nullish input, `'{}'` for an empty record, otherwise the
 *   single-quoted map literal passed through {@link csvEscapeField}.
 */
export const csvEscapeJson = (
  value: Record<string, unknown> | null | undefined
): string => {
  if (value == null) {
    return '';
  }

  const entries = Object.entries(value);

  // Empty object should return empty Map (nothing in it needs CSV quoting)
  if (entries.length === 0) {
    return '{}';
  }

  const pairs = entries
    .map(
      ([k, v]) =>
        `'${escapeMapValue(String(k))}':'${escapeMapValue(stringifyMapValue(v))}'`
    )
    .join(',');

  // Build the Map literal without outer quotes and let csvEscapeField wrap or
  // escape the whole field if it contains commas, quotes, or newlines.
  return csvEscapeField(`{${pairs}}`);
};

/**
 * Escapes a single CSV field.
 *
 * The field is wrapped in double quotes when it contains a comma, a double
 * quote, or a line break; embedded double quotes are doubled (""), per the
 * CSV convention. Other values are returned unchanged.
 */
export const csvEscapeField = (value: string | number): string => {
  const str = String(value);

  if (/[,"\n\r]/.test(str)) {
    const escaped = str.replace(/"/g, '""');
    return `"${escaped}"`;
  }

  return str;
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -39,7 +39,7 @@ describe('mixpanel', () => {
|
||||
const rawEvent = {
|
||||
event: '$mp_web_page_view',
|
||||
properties: {
|
||||
time: 1746097970,
|
||||
time: 1_746_097_970,
|
||||
distinct_id: '$device:123',
|
||||
$browser: 'Chrome',
|
||||
$browser_version: 135,
|
||||
@@ -53,7 +53,7 @@ describe('mixpanel', () => {
|
||||
$insert_id: 'source_id',
|
||||
$lib_version: '2.60.0',
|
||||
$mp_api_endpoint: 'api-js.mixpanel.com',
|
||||
$mp_api_timestamp_ms: 1746078175363,
|
||||
$mp_api_timestamp_ms: 1_746_078_175_363,
|
||||
$mp_autocapture: true,
|
||||
$os: 'Android',
|
||||
$referrer: 'https://google.com/',
|
||||
@@ -71,7 +71,7 @@ describe('mixpanel', () => {
|
||||
gclid: 'oqneoqow',
|
||||
mp_country_code: 'IN',
|
||||
mp_lib: 'web',
|
||||
mp_processing_time_ms: 1746078175546,
|
||||
mp_processing_time_ms: 1_746_078_175_546,
|
||||
mp_sent_by_lib_version: '2.60.0',
|
||||
utm_medium: 'cpc',
|
||||
utm_source: 'google',
|
||||
@@ -101,7 +101,7 @@ describe('mixpanel', () => {
|
||||
__title:
|
||||
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
|
||||
},
|
||||
created_at: '2025-05-01T11:12:50.000Z',
|
||||
created_at: '2025-05-01 11:12:50',
|
||||
country: 'IN',
|
||||
city: 'Mumbai',
|
||||
region: 'Maharashtra',
|
||||
@@ -110,7 +110,7 @@ describe('mixpanel', () => {
|
||||
os: 'Android',
|
||||
os_version: undefined,
|
||||
browser: 'Chrome',
|
||||
browser_version: '',
|
||||
browser_version: '135',
|
||||
device: 'mobile',
|
||||
brand: '',
|
||||
model: '',
|
||||
@@ -141,7 +141,7 @@ describe('mixpanel', () => {
|
||||
const rawEvent = {
|
||||
event: 'custom_event',
|
||||
properties: {
|
||||
time: 1746097970,
|
||||
time: 1_746_097_970,
|
||||
distinct_id: '$device:123',
|
||||
$device_id: '123',
|
||||
$user_id: 'user123',
|
||||
@@ -192,7 +192,7 @@ describe('mixpanel', () => {
|
||||
const rawEvent = {
|
||||
event: 'ec_search_error',
|
||||
properties: {
|
||||
time: 1759947367,
|
||||
time: 1_759_947_367,
|
||||
distinct_id: '3385916',
|
||||
$browser: 'Mobile Safari',
|
||||
$browser_version: null,
|
||||
@@ -207,7 +207,7 @@ describe('mixpanel', () => {
|
||||
$insert_id: 'bclkaepeqcfuzt4v',
|
||||
$lib_version: '2.60.0',
|
||||
$mp_api_endpoint: 'api-js.mixpanel.com',
|
||||
$mp_api_timestamp_ms: 1759927570699,
|
||||
$mp_api_timestamp_ms: 1_759_927_570_699,
|
||||
$os: 'iOS',
|
||||
$region: 'Karnataka',
|
||||
$screen_height: 852,
|
||||
@@ -225,7 +225,7 @@ describe('mixpanel', () => {
|
||||
language: 'english',
|
||||
mp_country_code: 'IN',
|
||||
mp_lib: 'web',
|
||||
mp_processing_time_ms: 1759927592421,
|
||||
mp_processing_time_ms: 1_759_927_592_421,
|
||||
mp_sent_by_lib_version: '2.60.0',
|
||||
os: 'web',
|
||||
osVersion:
|
||||
@@ -249,15 +249,15 @@ describe('mixpanel', () => {
|
||||
|
||||
expect(res.id.length).toBeGreaterThan(30);
|
||||
expect(res.imported_at).toMatch(
|
||||
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
|
||||
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
|
||||
);
|
||||
expect(omit(['id', 'imported_at'], res)).toEqual({
|
||||
brand: 'Apple',
|
||||
browser: 'GSA',
|
||||
browser_version: 'null',
|
||||
browser_version: '388.0.811331708',
|
||||
city: 'Bengaluru',
|
||||
country: 'IN',
|
||||
created_at: '2025-10-08T18:16:07.000Z',
|
||||
created_at: '2025-10-08 18:16:07',
|
||||
device: 'mobile',
|
||||
device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
|
||||
duration: 0,
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
|
||||
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
|
||||
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
|
||||
import type { IClickhouseEvent } from '@openpanel/db';
|
||||
import {
|
||||
getReferrerWithQuery,
|
||||
parseReferrer,
|
||||
parseUserAgent,
|
||||
type UserAgentInfo,
|
||||
} from '@openpanel/common/server';
|
||||
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
|
||||
import type { IClickhouseProfile } from '@openpanel/db';
|
||||
import type { ILogger } from '@openpanel/logger';
|
||||
import type { IMixpanelImportConfig } from '@openpanel/validation';
|
||||
import { z } from 'zod';
|
||||
@@ -15,22 +20,88 @@ export const zMixpanelRawEvent = z.object({
|
||||
|
||||
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
|
||||
|
||||
/**
 * Engage API profile: https://docs.mixpanel.com/docs/export-methods#exporting-profiles
 *
 * Schema for one raw profile row. `$distinct_id` may be a string or a number;
 * `$properties` is an optional free-form record that defaults to `{}`.
 */
|
||||
export const zMixpanelRawProfile = z.object({
|
||||
$distinct_id: z.union([z.string(), z.number()]),
|
||||
$properties: z.record(z.unknown()).optional().default({}),
|
||||
});
|
||||
// Type inferred from the schema above.
export type MixpanelRawProfile = z.infer<typeof zMixpanelRawProfile>;
|
||||
|
||||
/**
 * Error signalling that a request was rejected because of rate limiting.
 *
 * `retryAfterMs`, when set, is the number of milliseconds the caller was asked
 * to wait before retrying. It is only ever a finite, non-negative number:
 * anything else (for example `NaN` produced by parsing a non-numeric
 * `Retry-After` header) is stored as `undefined` so it can never be handed to
 * a timer as a bogus delay.
 */
class MixpanelRateLimitError extends Error {
  readonly retryAfterMs?: number;

  /**
   * @param message - Human-readable description of the failure.
   * @param retryAfterMs - Suggested wait in milliseconds; ignored unless it is
   *   a finite number greater than or equal to zero.
   */
  constructor(message: string, retryAfterMs?: number) {
    super(message);
    this.name = 'MixpanelRateLimitError';
    this.retryAfterMs =
      typeof retryAfterMs === 'number' &&
      Number.isFinite(retryAfterMs) &&
      retryAfterMs >= 0
        ? retryAfterMs
        : undefined;
  }
}
|
||||
|
||||
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
provider = 'mixpanel';
|
||||
version = '1.0.0';
|
||||
|
||||
private static readonly MAX_REQUESTS_PER_HOUR = 100;
|
||||
private static readonly MIN_REQUEST_INTERVAL_MS = 334; // 3 QPS limit
|
||||
private requestTimestamps: number[] = [];
|
||||
private lastRequestTime = 0;
|
||||
|
||||
constructor(
|
||||
private readonly projectId: string,
|
||||
private readonly config: IMixpanelImportConfig,
|
||||
private readonly logger?: ILogger,
|
||||
private readonly logger?: ILogger
|
||||
) {
|
||||
super();
|
||||
}
|
||||
|
||||
async getTotalEventsCount(): Promise<number> {
|
||||
/**
 * Blocks until another request may be sent, then records that request.
 *
 * Two limits are applied in order: a minimum gap between consecutive
 * requests (`MIN_REQUEST_INTERVAL_MS`) and a cap on requests inside a sliding
 * one-hour window (`MAX_REQUESTS_PER_HOUR`). When the hourly cap is reached
 * the method sleeps until the oldest recorded request leaves the window,
 * plus a one-second buffer.
 */
private async waitForRateLimit(): Promise<void> {
  const HOUR_MS = 60 * 60 * 1000;
  const pause = (ms: number) =>
    new Promise((resolve) => setTimeout(resolve, ms));

  const startedAt = Date.now();
  const windowStart = startedAt - HOUR_MS;

  // Drop bookkeeping for requests that are no longer inside the window.
  this.requestTimestamps = this.requestTimestamps.filter(
    (stamp) => stamp > windowStart
  );

  // Per-second limit: keep at least MIN_REQUEST_INTERVAL_MS between requests.
  const gap = startedAt - this.lastRequestTime;
  if (gap < MixpanelProvider.MIN_REQUEST_INTERVAL_MS) {
    await pause(MixpanelProvider.MIN_REQUEST_INTERVAL_MS - gap);
  }

  // Hourly limit: wait for the oldest request to age out of the window.
  const quotaReached =
    this.requestTimestamps.length >= MixpanelProvider.MAX_REQUESTS_PER_HOUR;
  if (quotaReached) {
    const oldest = this.requestTimestamps[0]!;
    const waitMs = oldest + HOUR_MS - Date.now() + 1000; // +1s buffer

    if (waitMs > 0) {
      const requestsInWindow = this.requestTimestamps.length;
      this.logger?.info(
        `Rate limit: ${requestsInWindow} requests in the last hour, waiting ${Math.ceil(waitMs / 1000)}s`,
        {
          requestsInWindow,
          waitMs,
        }
      );
      await pause(waitMs);

      // The window has moved while sleeping, so prune once more.
      this.requestTimestamps = this.requestTimestamps.filter(
        (stamp) => stamp > Date.now() - HOUR_MS
      );
    }
  }

  this.lastRequestTime = Date.now();
  this.requestTimestamps.push(Date.now());
}
|
||||
|
||||
getTotalEventsCount(): Promise<number> {
|
||||
// Mixpanel doesn't provide a good way to get the total event count within a period.
|
||||
// JQL would work, but it is not accurate and will be deprecated at the end of 2025.
|
||||
return -1;
|
||||
return Promise.resolve(-1);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -42,13 +113,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
}
|
||||
|
||||
async *parseSource(
|
||||
overrideFrom?: string,
|
||||
overrideFrom?: string
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
yield* this.fetchEventsFromMixpanel(overrideFrom);
|
||||
}
|
||||
|
||||
private async *fetchEventsFromMixpanel(
|
||||
overrideFrom?: string,
|
||||
overrideFrom?: string
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
|
||||
|
||||
@@ -58,20 +129,24 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
|
||||
for (const [chunkFrom, chunkTo] of dateChunks) {
|
||||
let retries = 0;
|
||||
const maxRetries = 3;
|
||||
const maxRetries = 6;
|
||||
|
||||
while (retries <= maxRetries) {
|
||||
try {
|
||||
await this.waitForRateLimit();
|
||||
yield* this.fetchEventsForDateRange(
|
||||
serviceAccount,
|
||||
serviceSecret,
|
||||
projectId,
|
||||
chunkFrom,
|
||||
chunkTo,
|
||||
chunkTo
|
||||
);
|
||||
break; // Success, move to next chunk
|
||||
} catch (error) {
|
||||
retries++;
|
||||
const isRateLimit =
|
||||
error instanceof MixpanelRateLimitError ||
|
||||
(error instanceof Error && error.message.includes('429'));
|
||||
const isLastRetry = retries > maxRetries;
|
||||
|
||||
this.logger?.warn('Failed to fetch events for date range', {
|
||||
@@ -80,22 +155,31 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
attempt: retries,
|
||||
maxRetries,
|
||||
error: (error as Error).message,
|
||||
isRateLimit,
|
||||
willRetry: !isLastRetry,
|
||||
});
|
||||
|
||||
if (isLastRetry) {
|
||||
// Final attempt failed, re-throw
|
||||
throw new Error(
|
||||
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`,
|
||||
`Failed to fetch Mixpanel events for ${chunkFrom} to ${chunkTo} after ${maxRetries} retries: ${(error as Error).message}`
|
||||
);
|
||||
}
|
||||
|
||||
// Exponential backoff: wait before retrying
|
||||
const delay = Math.min(1000 * 2 ** (retries - 1), 60_000); // Cap at 1 minute
|
||||
let delay: number;
|
||||
if (error instanceof MixpanelRateLimitError && error.retryAfterMs) {
|
||||
delay = error.retryAfterMs;
|
||||
} else if (isRateLimit) {
|
||||
// 5min → 10min → 15min (cap) for every later retry; six waits with maxRetries = 6 add up to 75min
|
||||
delay = Math.min(300_000 * 2 ** (retries - 1), 900_000);
|
||||
} else {
|
||||
delay = Math.min(1000 * 2 ** (retries - 1), 60_000);
|
||||
}
|
||||
|
||||
this.logger?.info('Retrying after delay', {
|
||||
delayMs: delay,
|
||||
chunkFrom,
|
||||
chunkTo,
|
||||
isRateLimit,
|
||||
});
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
@@ -108,7 +192,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
serviceSecret: string,
|
||||
projectId: string,
|
||||
from: string,
|
||||
to: string,
|
||||
to: string
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
const url = 'https://data.mixpanel.com/api/2.0/export';
|
||||
|
||||
@@ -134,9 +218,18 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
},
|
||||
});
|
||||
|
||||
if (response.status === 429) {
|
||||
const retryAfter = response.headers.get('Retry-After');
|
||||
const retryAfterMs = retryAfter ? Number(retryAfter) * 1000 : undefined;
|
||||
throw new MixpanelRateLimitError(
|
||||
'Mixpanel rate limit exceeded (429)',
|
||||
retryAfterMs
|
||||
);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(
|
||||
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
|
||||
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`
|
||||
);
|
||||
}
|
||||
|
||||
@@ -153,7 +246,9 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
|
||||
if (done) break;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
|
||||
@@ -187,7 +282,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
{
|
||||
line: buffer.substring(0, 100),
|
||||
error,
|
||||
},
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -196,6 +291,114 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Stream user profiles from the Mixpanel Engage API.
 *
 * Requests pages of up to 5k profiles, validates every row against
 * `zMixpanelRawProfile`, yields the valid ones and logs + skips the rest.
 *
 * Paging details:
 * - The `session_id` returned by a response is sent back on the following
 *   requests; the Engage API uses it to page through one consistent result
 *   set.
 * - The loop stops when a page holds fewer rows than the page size the
 *   server reports (`page_size` in the response), falling back to the
 *   requested size when the server does not report one. Comparing against
 *   the requested size alone would end the export after the first page
 *   whenever the server applies a smaller page size.
 *
 * @throws MixpanelRateLimitError on HTTP 429 (with the `Retry-After` delay in
 *   milliseconds when the header holds a valid number of seconds).
 * @throws Error on any other non-2xx response.
 */
async *streamProfiles(): AsyncGenerator<MixpanelRawProfile, void, unknown> {
  const { serviceAccount, serviceSecret, projectId } = this.config;
  const pageSize = 5000;
  let page = 0;
  let sessionId: string | undefined;

  while (true) {
    await this.waitForRateLimit();

    const url = `https://mixpanel.com/api/query/engage?project_id=${encodeURIComponent(projectId)}`;
    const body = new URLSearchParams({
      page: String(page),
      page_size: String(pageSize),
    });
    if (sessionId) {
      // Continue the result set opened by the previous request.
      body.set('session_id', sessionId);
    }

    this.logger?.info('Fetching profiles from Mixpanel Engage', {
      page,
      page_size: pageSize,
      projectId,
    });

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
        Accept: 'application/json',
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      body: body.toString(),
    });

    if (response.status === 429) {
      // Retry-After may be absent, empty, or an HTTP-date; only a plain
      // non-negative number of seconds is converted, anything else is dropped
      // instead of becoming NaN.
      const retryAfter = response.headers.get('Retry-After');
      const retryAfterSeconds =
        retryAfter === null || retryAfter.trim() === ''
          ? Number.NaN
          : Number(retryAfter);
      const retryAfterMs =
        Number.isFinite(retryAfterSeconds) && retryAfterSeconds >= 0
          ? retryAfterSeconds * 1000
          : undefined;
      throw new MixpanelRateLimitError(
        'Mixpanel rate limit exceeded (429)',
        retryAfterMs
      );
    }

    if (!response.ok) {
      const text = await response.text();
      throw new Error(
        `Failed to fetch profiles from Mixpanel: ${response.status} ${response.statusText} - ${text}`
      );
    }

    const data = (await response.json()) as {
      results?: Array<{
        $distinct_id: string | number;
        $properties?: Record<string, unknown>;
      }>;
      page?: number;
      page_size?: number;
      session_id?: string;
      total?: number;
    };

    if (typeof data.session_id === 'string' && data.session_id !== '') {
      sessionId = data.session_id;
    }

    const results = Array.isArray(data.results) ? data.results : [];
    for (const row of results) {
      const parsed = zMixpanelRawProfile.safeParse(row);
      if (parsed.success) {
        yield parsed.data;
      } else {
        this.logger?.warn('Skipping invalid Mixpanel profile', {
          row: JSON.stringify(row).slice(0, 200),
        });
      }
    }

    // Trust the page size the server actually used when it reports one.
    const effectivePageSize =
      typeof data.page_size === 'number' && data.page_size > 0
        ? data.page_size
        : pageSize;
    if (results.length < effectivePageSize) {
      break;
    }
    page++;
  }
}
|
||||
|
||||
/**
 * Map a Mixpanel Engage profile to an OpenPanel `IClickhouseProfile`.
 *
 * - The id is `$distinct_id` as a string with a leading `$device:` removed.
 * - `$first_name`, `$last_name`, `$email` and `$avatar` (or `$image`) fill the
 *   dedicated columns; missing values become empty strings.
 * - Every other property whose key does not start with `$` and whose value is
 *   not null/undefined is copied into `properties`; objects are JSON-encoded,
 *   everything else goes through `String()`.
 * - `created_at` comes from `$created`. When `$created` is missing or cannot
 *   be parsed as a date the current time is used, so one malformed profile
 *   does not abort the import with a date-formatting error.
 *
 * @param raw - Raw profile row; re-validated with `zMixpanelRawProfile`.
 * @throws ZodError when `raw` does not match `zMixpanelRawProfile`.
 */
transformProfile(raw: MixpanelRawProfile): IClickhouseProfile {
  const parsed = zMixpanelRawProfile.parse(raw);
  const props = (parsed.$properties || {}) as Record<string, unknown>;

  const id = String(parsed.$distinct_id).replace(/^\$device:/, '');

  // NOTE(review): a `$created` without a timezone offset is interpreted by
  // `Date` as local time — confirm that matches how Mixpanel exports it.
  const createdCandidate = props.$created
    ? new Date(String(props.$created))
    : new Date();
  const createdAt = formatClickhouseDate(
    Number.isNaN(createdCandidate.getTime()) ? new Date() : createdCandidate
  );

  const properties: Record<string, string> = {};
  for (const [key, value] of Object.entries(props)) {
    // `$`-prefixed keys are Mixpanel's reserved properties; skip them here.
    if (key.startsWith('$') || value == null) {
      continue;
    }
    properties[key] =
      typeof value === 'object' ? JSON.stringify(value) : String(value);
  }

  return {
    id,
    project_id: this.projectId,
    first_name: String(props.$first_name ?? ''),
    last_name: String(props.$last_name ?? ''),
    email: String(props.$email ?? ''),
    avatar: String(props.$avatar ?? props.$image ?? ''),
    properties,
    created_at: createdAt,
    is_external: true,
  };
}
|
||||
|
||||
validate(rawEvent: MixpanelRawEvent): boolean {
|
||||
const res = zMixpanelRawEvent.safeParse(rawEvent);
|
||||
return res.success;
|
||||
@@ -208,7 +411,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
const deviceId = props.$device_id;
|
||||
const profileId = String(props.$user_id || props.distinct_id).replace(
|
||||
/^\$device:/,
|
||||
'',
|
||||
''
|
||||
);
|
||||
|
||||
// Build full URL from current_url and current_url_search (web only)
|
||||
@@ -309,7 +512,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
project_id: projectId,
|
||||
session_id: '', // Will be generated in SQL after import
|
||||
properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
|
||||
created_at: new Date(props.time * 1000).toISOString(),
|
||||
created_at: formatClickhouseDate(new Date(props.time * 1000)),
|
||||
country,
|
||||
city,
|
||||
region,
|
||||
@@ -318,10 +521,7 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
os: uaInfo.os || props.$os,
|
||||
os_version: uaInfo.osVersion || props.$osVersion,
|
||||
browser: uaInfo.browser || props.$browser,
|
||||
browser_version:
|
||||
uaInfo.browserVersion || props.$browserVersion
|
||||
? String(props.$browser_version)
|
||||
: '',
|
||||
browser_version: uaInfo.browserVersion || String(props.$browser_version ?? ''),
|
||||
device: this.getDeviceType(props.mp_lib, uaInfo, props),
|
||||
brand: uaInfo.brand || '',
|
||||
model: uaInfo.model || '',
|
||||
@@ -338,14 +538,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
sdk_version: this.version,
|
||||
};
|
||||
|
||||
// TODO: Remove this
|
||||
// Temporary fix for a client
|
||||
const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
|
||||
if (isMightBeScreenView && event.name === 'Loaded a Screen') {
|
||||
event.name = 'screen_view';
|
||||
event.path = isMightBeScreenView;
|
||||
}
|
||||
|
||||
// TODO: Remove this
|
||||
// This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
|
||||
if (props.utm_source && !properties.__query?.utm_source) {
|
||||
@@ -371,13 +563,13 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
private getDeviceType(
|
||||
mp_lib: string,
|
||||
uaInfo: UserAgentInfo,
|
||||
props: Record<string, any>,
|
||||
props: Record<string, any>
|
||||
) {
|
||||
// Normalize lib/os/browser data
|
||||
const lib = (mp_lib || '').toLowerCase();
|
||||
const os = String(props.$os || uaInfo.os || '').toLowerCase();
|
||||
const browser = String(
|
||||
props.$browser || uaInfo.browser || '',
|
||||
props.$browser || uaInfo.browser || ''
|
||||
).toLowerCase();
|
||||
|
||||
const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
|
||||
@@ -431,11 +623,6 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
return !this.isWebEvent(mp_lib);
|
||||
}
|
||||
|
||||
/**
 * Returns the first property key made up solely of upper-case letters, the
 * digits 1-9 and underscores, or `undefined` when no key matches.
 *
 * NOTE(review): the character class is `1-9`, so a key containing `0` never
 * matches — confirm whether `0-9` was intended.
 */
private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
  const screenKeyPattern = /^[A-Z1-9_]+$/;
  const props = rawEvent.properties as Record<string, any>;

  for (const candidate of Object.keys(props)) {
    if (screenKeyPattern.test(candidate)) {
      return candidate;
    }
  }
  return undefined;
}
|
||||
|
||||
private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
|
||||
// For mobile events, extract device information from Mixpanel properties
|
||||
const os = props.$os || props.os || '';
|
||||
@@ -446,19 +633,19 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
|
||||
return {
|
||||
isServer: true,
|
||||
os: os,
|
||||
osVersion: osVersion,
|
||||
os,
|
||||
osVersion,
|
||||
browser: '',
|
||||
browserVersion: '',
|
||||
device: device,
|
||||
brand: brand,
|
||||
model: model,
|
||||
device,
|
||||
brand,
|
||||
model,
|
||||
};
|
||||
}
|
||||
|
||||
private stripMixpanelProperties(
|
||||
properties: Record<string, any>,
|
||||
searchParams: Record<string, string>,
|
||||
searchParams: Record<string, string>
|
||||
): Record<string, any> {
|
||||
const strip = [
|
||||
'time',
|
||||
@@ -472,8 +659,8 @@ export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
];
|
||||
const filtered = Object.fromEntries(
|
||||
Object.entries(properties).filter(
|
||||
([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key),
|
||||
),
|
||||
([key]) => !(key.match(/^(\$|mp_|utm_)/) || strip.includes(key))
|
||||
)
|
||||
);
|
||||
|
||||
// Parse JSON strings back to objects/arrays so toDots() can flatten them
|
||||
|
||||
@@ -2,10 +2,13 @@ import { randomUUID } from 'node:crypto';
|
||||
import { Readable } from 'node:stream';
|
||||
import { pipeline } from 'node:stream/promises';
|
||||
import { createBrotliDecompress, createGunzip } from 'node:zlib';
|
||||
import { isSameDomain, parsePath } from '@openpanel/common';
|
||||
import { generateDeviceId } from '@openpanel/common/server';
|
||||
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
|
||||
import type { IClickhouseEvent } from '@openpanel/db';
|
||||
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
|
||||
import {
|
||||
generateDeviceId,
|
||||
getReferrerWithQuery,
|
||||
parseReferrer,
|
||||
} from '@openpanel/common/server';
|
||||
import { formatClickhouseDate, type IClickhouseEvent } from '@openpanel/db';
|
||||
import type { ILogger } from '@openpanel/logger';
|
||||
import type { IUmamiImportConfig } from '@openpanel/validation';
|
||||
import { parse } from 'csv-parse';
|
||||
@@ -63,7 +66,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
constructor(
|
||||
private readonly projectId: string,
|
||||
private readonly config: IUmamiImportConfig,
|
||||
private readonly logger?: ILogger,
|
||||
private readonly logger?: ILogger
|
||||
) {
|
||||
super();
|
||||
}
|
||||
@@ -82,7 +85,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
signal?: AbortSignal;
|
||||
maxBytes?: number;
|
||||
maxRows?: number;
|
||||
} = {},
|
||||
} = {}
|
||||
): AsyncGenerator<UmamiRawEvent, void, unknown> {
|
||||
const { signal, maxBytes, maxRows } = opts;
|
||||
const controller = new AbortController();
|
||||
@@ -95,9 +98,9 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
}
|
||||
|
||||
const res = await fetch(url, { signal: controller.signal });
|
||||
if (!res.ok || !res.body) {
|
||||
if (!(res.ok && res.body)) {
|
||||
throw new Error(
|
||||
`Failed to fetch remote file: ${res.status} ${res.statusText}`,
|
||||
`Failed to fetch remote file: ${res.status} ${res.statusText}`
|
||||
);
|
||||
}
|
||||
|
||||
@@ -108,15 +111,15 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
if (
|
||||
contentType &&
|
||||
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
|
||||
contentType,
|
||||
contentType
|
||||
)
|
||||
) {
|
||||
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
|
||||
this.logger?.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
|
||||
}
|
||||
|
||||
if (maxBytes && contentLen && contentLen > maxBytes) {
|
||||
throw new Error(
|
||||
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
|
||||
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`
|
||||
);
|
||||
}
|
||||
|
||||
@@ -137,9 +140,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
if (seenBytes > maxBytes) {
|
||||
controller.abort();
|
||||
body.destroy(
|
||||
new Error(
|
||||
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
|
||||
),
|
||||
new Error(`Stream exceeded size limit (${seenBytes} > ${maxBytes})`)
|
||||
);
|
||||
}
|
||||
});
|
||||
@@ -190,7 +191,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
throw new Error(
|
||||
`Failed to parse remote file from ${url}: ${
|
||||
err instanceof Error ? err.message : String(err)
|
||||
}`,
|
||||
}`
|
||||
);
|
||||
} finally {
|
||||
controller.abort(); // ensure fetch stream is torn down
|
||||
@@ -205,7 +206,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
|
||||
const projectId =
|
||||
this.config.projectMapper.find(
|
||||
(mapper) => mapper.from === _rawEvent.website_id,
|
||||
(mapper) => mapper.from === _rawEvent.website_id
|
||||
)?.to || this.projectId;
|
||||
|
||||
const rawEvent = zUmamiRawEvent.parse(_rawEvent);
|
||||
@@ -261,39 +262,50 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
}
|
||||
|
||||
// Add useful properties from Umami data
|
||||
if (rawEvent.page_title) properties.__title = rawEvent.page_title;
|
||||
if (rawEvent.screen) properties.__screen = rawEvent.screen;
|
||||
if (rawEvent.language) properties.__language = rawEvent.language;
|
||||
if (rawEvent.utm_source)
|
||||
if (rawEvent.page_title) {
|
||||
properties.__title = rawEvent.page_title;
|
||||
}
|
||||
if (rawEvent.screen) {
|
||||
properties.__screen = rawEvent.screen;
|
||||
}
|
||||
if (rawEvent.language) {
|
||||
properties.__language = rawEvent.language;
|
||||
}
|
||||
if (rawEvent.utm_source) {
|
||||
properties = assocPath(
|
||||
['__query', 'utm_source'],
|
||||
rawEvent.utm_source,
|
||||
properties,
|
||||
properties
|
||||
);
|
||||
if (rawEvent.utm_medium)
|
||||
}
|
||||
if (rawEvent.utm_medium) {
|
||||
properties = assocPath(
|
||||
['__query', 'utm_medium'],
|
||||
rawEvent.utm_medium,
|
||||
properties,
|
||||
properties
|
||||
);
|
||||
if (rawEvent.utm_campaign)
|
||||
}
|
||||
if (rawEvent.utm_campaign) {
|
||||
properties = assocPath(
|
||||
['__query', 'utm_campaign'],
|
||||
rawEvent.utm_campaign,
|
||||
properties,
|
||||
properties
|
||||
);
|
||||
if (rawEvent.utm_content)
|
||||
}
|
||||
if (rawEvent.utm_content) {
|
||||
properties = assocPath(
|
||||
['__query', 'utm_content'],
|
||||
rawEvent.utm_content,
|
||||
properties,
|
||||
properties
|
||||
);
|
||||
if (rawEvent.utm_term)
|
||||
}
|
||||
if (rawEvent.utm_term) {
|
||||
properties = assocPath(
|
||||
['__query', 'utm_term'],
|
||||
rawEvent.utm_term,
|
||||
properties,
|
||||
properties
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
id: rawEvent.event_id || randomUUID(),
|
||||
@@ -302,8 +314,8 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
profile_id: profileId,
|
||||
project_id: projectId,
|
||||
session_id: rawEvent.session_id || '',
|
||||
properties,
|
||||
created_at: rawEvent.created_at.toISOString(),
|
||||
properties: toDots(properties),
|
||||
created_at: formatClickhouseDate(rawEvent.created_at),
|
||||
country,
|
||||
city,
|
||||
region: this.mapRegion(region),
|
||||
@@ -329,7 +341,7 @@ export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
}
|
||||
|
||||
mapRegion(region: string): string {
|
||||
return region.replace(/^[A-Z]{2}\-/, '');
|
||||
return region.replace(/^[A-Z]{2}-/, '');
|
||||
}
|
||||
|
||||
mapDevice(device: string): string {
|
||||
|
||||
Reference in New Issue
Block a user