feat: new importer (#214)

This commit is contained in:
Carl-Gerhard Lindesvärd
2025-11-05 09:49:36 +01:00
committed by GitHub
parent b51bc8f3f6
commit 212254d31a
80 changed files with 4884 additions and 842 deletions

View File

@@ -0,0 +1,121 @@
import type { IClickhouseEvent } from '@openpanel/db';
import type { BaseRawEvent, ErrorContext, ImportJobMetadata } from './types';
/**
 * Base class for import providers.
 *
 * A provider streams raw events from an external source, validates them and
 * converts them to `IClickhouseEvent`. Subclasses implement the abstract
 * members; the optional hooks have default implementations.
 */
export abstract class BaseImportProvider<
  TRawEvent extends BaseRawEvent = BaseRawEvent,
> {
  /** Provider identifier, set by each subclass. */
  abstract provider: string;

  /** Provider implementation version, set by each subclass. */
  abstract version: string;

  /**
   * Stream-read and parse source (file/API) → yields raw events.
   * This should be implemented as an async generator to handle large files efficiently.
   *
   * @param overrideFrom - Optional start override; its meaning is provider specific
   */
  abstract parseSource(
    overrideFrom?: string,
  ): AsyncGenerator<TRawEvent, void, unknown>;

  /**
   * Convert provider format → IClickhouseEvent
   */
  abstract transformEvent(rawEvent: TRawEvent): IClickhouseEvent;

  /**
   * Validate raw event structure
   */
  abstract validate(rawEvent: TRawEvent): boolean;

  /**
   * Returns how many events will be imported
   */
  abstract getTotalEventsCount(): Promise<number>;

  /**
   * Optional hook: Pre-process batch. The default returns the batch untouched.
   */
  async beforeBatch?(events: TRawEvent[]): Promise<TRawEvent[]> {
    return events;
  }

  /**
   * Optional hook: Get import metadata for tracking
   */
  getImportMetadata?(): ImportJobMetadata;

  /**
   * Optional hook: Custom error handling. The default re-throws the error.
   */
  async onError?(error: Error, context?: ErrorContext): Promise<void> {
    throw error;
  }

  /**
   * Get estimated total events (optional, for progress tracking).
   * The default returns 0.
   */
  async getEstimatedTotal?(): Promise<number> {
    return 0;
  }

  /**
   * Indicates whether session IDs should be generated in SQL after import.
   * If true, the import job will generate deterministic session IDs based on
   * device_id and timestamp using SQL window functions.
   * If false, assumes the provider already generates session IDs during streaming.
   */
  shouldGenerateSessionIds(): boolean {
    return false; // Default: assume provider handles it
  }

  /**
   * Utility: Split a date range into chunks to avoid timeout issues with large imports.
   * Returns an array of inclusive [from, to] date pairs in YYYY-MM-DD format.
   *
   * All arithmetic is done in UTC. Date-only strings are parsed as UTC
   * midnight and formatted back through `toISOString()`, so stepping with
   * local-time setters would duplicate or skip a day around DST changes.
   *
   * @param from - Start date in YYYY-MM-DD format
   * @param to - End date in YYYY-MM-DD format
   * @param options - `chunkSizeDays`: number of days per chunk (default: 1);
   *   fractional values are rounded down
   * @returns The chunks in chronological order; empty when `from` is after `to`
   *   or either date cannot be parsed
   * @throws RangeError when `chunkSizeDays` is not a finite number >= 1
   */
  public getDateChunks(
    from: string,
    to: string,
    options?: {
      chunkSizeDays?: number;
    },
  ): Array<[string, string]> {
    const chunkSizeDays = Math.floor(options?.chunkSizeDays ?? 1);
    // A step of zero (or less) would never advance the cursor.
    if (!Number.isFinite(chunkSizeDays) || chunkSizeDays < 1) {
      throw new RangeError(
        `chunkSizeDays must be a number >= 1, got ${options?.chunkSizeDays}`,
      );
    }

    const startDate = new Date(from);
    const endDate = new Date(to);

    // Handle case where from and to are the same date
    if (startDate.getTime() === endDate.getTime()) {
      return [[from, to]];
    }

    const toDay = (date: Date): string => date.toISOString().slice(0, 10);
    const chunks: Array<[string, string]> = [];
    const cursor = new Date(startDate);

    while (cursor <= endDate) {
      // Last day of this chunk: (chunkSizeDays - 1) days after its first day
      const chunkEndDate = new Date(cursor);
      chunkEndDate.setUTCDate(chunkEndDate.getUTCDate() + (chunkSizeDays - 1));

      // Don't go past the end date
      const lastDay = chunkEndDate > endDate ? endDate : chunkEndDate;
      chunks.push([toDay(cursor), toDay(lastDay)]);

      // Move cursor to the first day after the current chunk
      cursor.setUTCDate(cursor.getUTCDate() + chunkSizeDays);
    }

    return chunks;
  }
}

View File

@@ -0,0 +1,13 @@
// Package entry point: re-exports the provider implementations and the
// shared importer types declared in ./types.
export { UmamiProvider } from './providers/umami';
export { MixpanelProvider } from './providers/mixpanel';
export type {
ImportConfig,
ImportProgress,
ImportResult,
BatchResult,
BaseRawEvent,
ErrorContext,
EventProperties,
ImportJobMetadata,
ImportStageResult,
} from './types';

View File

@@ -0,0 +1,30 @@
/** Identifiers of the import providers listed in IMPORT_PROVIDERS. */
export type ImportProviderId = 'umami' | 'mixpanel';
/** How a provider obtains its data: from a file or from an API. */
export type ImportProviderType = 'file' | 'api';
/** Display metadata describing one import provider. */
export interface ImportProviderMeta {
// Provider identifier
id: ImportProviderId;
// Human-readable provider name
name: string;
// Short description of what the provider imports
description: string;
// URL of the provider logo image
logo: string;
// Background color for the logo — presumably a CSS color value; verify against the UI
backgroundColor: string;
// Source types this provider supports
types: ImportProviderType[];
}
/**
 * Metadata for every available import provider:
 * Umami (file based) and Mixpanel (API based).
 */
export const IMPORT_PROVIDERS: ImportProviderMeta[] = [
{
id: 'umami',
name: 'Umami',
description: 'Import your analytics data from Umami',
logo: 'https://cdn.brandfetch.io/id_3VEohOm/w/180/h/180/theme/dark/logo.png?c=1dxbfHSJFAPEGdCLU4o5B',
backgroundColor: '#fff',
types: ['file'],
},
{
id: 'mixpanel',
name: 'Mixpanel',
description: 'Import your analytics data from Mixpanel API',
logo: 'https://cdn.brandfetch.io/idr_rhI2FS/theme/dark/idMJ8uODLv.svg?c=1dxbfHSJFAPEGdCLU4o5B',
backgroundColor: '#fff',
types: ['api'],
},
];

View File

@@ -0,0 +1,319 @@
import { omit } from 'ramda';
import { describe, expect, it } from 'vitest';
import { MixpanelProvider } from './mixpanel';
describe('mixpanel', () => {
  // Every test uses the same provider configuration; only `to` varies.
  const createProvider = (to = '2025-01-02') =>
    new MixpanelProvider('pid', {
      from: '2025-01-01',
      to,
      serviceAccount: 'sa',
      serviceSecret: 'ss',
      projectId: '123',
      provider: 'mixpanel',
      type: 'api',
      mapScreenViewProperty: undefined,
    });

  it('should chunk date range into day chunks', () => {
    const provider = createProvider('2025-01-04');

    const chunks = provider.getDateChunks('2025-01-01', '2025-01-04');

    expect(chunks).toEqual([
      ['2025-01-01', '2025-01-01'],
      ['2025-01-02', '2025-01-02'],
      ['2025-01-03', '2025-01-03'],
      ['2025-01-04', '2025-01-04'],
    ]);
  });

  it('should transform event', () => {
    const provider = createProvider();
    const rawEvent = {
      event: '$mp_web_page_view',
      properties: {
        time: 1746097970,
        distinct_id: '$device:123',
        $browser: 'Chrome',
        $browser_version: 135,
        $city: 'Mumbai',
        $current_url:
          'https://domain.com/state/maharashtra?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_sour',
        $device: 'Android',
        $device_id: '123',
        $initial_referrer: 'https://referrer.com/',
        $initial_referring_domain: 'referrer.com',
        $insert_id: 'source_id',
        $lib_version: '2.60.0',
        $mp_api_endpoint: 'api-js.mixpanel.com',
        $mp_api_timestamp_ms: 1746078175363,
        $mp_autocapture: true,
        $os: 'Android',
        $referrer: 'https://google.com/',
        $referring_domain: 'referrer.com',
        $region: 'Maharashtra',
        $screen_height: 854,
        $screen_width: 384,
        current_domain: 'domain.com',
        current_page_title:
          'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
        current_url_path: '/state/maharashtra',
        current_url_protocol: 'https:',
        current_url_search:
          '?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_source=5&gclid=EAIaIQobChMI6MnvhciBjQMVlS-DAx',
        gclid: 'oqneoqow',
        mp_country_code: 'IN',
        mp_lib: 'web',
        mp_processing_time_ms: 1746078175546,
        mp_sent_by_lib_version: '2.60.0',
        utm_medium: 'cpc',
        utm_source: 'google',
      },
    };

    const res = provider.transformEvent(rawEvent);

    // browser_version is deliberately not pinned here: the previous
    // expectation ('' although $browser_version is 135) only recorded the
    // output of an operator-precedence bug in transformEvent.
    expect(res).toMatchObject({
      id: expect.any(String),
      name: 'screen_view',
      device_id: '123',
      profile_id: '123',
      project_id: 'pid',
      session_id: '',
      properties: {
        __source_insert_id: 'source_id',
        __screen: '384x854',
        __lib_version: '2.60.0',
        '__query.utm_source': 'google',
        '__query.utm_medium': 'cpc',
        '__query.utm_campaignid': '890',
        '__query.utm_device': 'm',
        '__query.utm_network': 'x',
        '__query.utm_location': '123',
        '__query.gclid': 'oqneoqow',
        __title:
          'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
      },
      created_at: '2025-05-01T11:12:50.000Z',
      country: 'IN',
      city: 'Mumbai',
      region: 'Maharashtra',
      longitude: null,
      latitude: null,
      os: 'Android',
      os_version: undefined,
      browser: 'Chrome',
      device: 'mobile',
      brand: '',
      model: '',
      duration: 0,
      path: '/state/maharashtra',
      origin: 'https://domain.com',
      referrer: 'https://referrer.com',
      referrer_name: 'Google',
      referrer_type: 'search',
      imported_at: expect.any(String),
      sdk_name: 'mixpanel (web)',
      sdk_version: '1.0.0',
    });
  });

  it('should parse stringified JSON in properties and flatten them', () => {
    const provider = createProvider();
    const rawEvent = {
      event: 'custom_event',
      properties: {
        time: 1746097970,
        distinct_id: '$device:123',
        $device_id: '123',
        $user_id: 'user123',
        mp_lib: 'web',
        // Stringified JSON object - should be parsed and flattened
        area: '{"displayText":"Malab, Nuh, Mewat","id":1189005}',
        // Stringified JSON array - should be parsed and flattened
        tags: '["tag1","tag2","tag3"]',
        // Regular string - should remain as is
        regularString: 'just a string',
        // Number - should be converted to string
        count: 42,
        // Object - should be flattened
        nested: { level1: { level2: 'value' } },
      },
    };

    const res = provider.transformEvent(rawEvent);

    expect(res.properties).toMatchObject({
      // Parsed JSON object should be flattened with dot notation
      'area.displayText': 'Malab, Nuh, Mewat',
      'area.id': '1189005',
      // Parsed JSON array should be flattened with numeric indices
      'tags.0': 'tag1',
      'tags.1': 'tag2',
      'tags.2': 'tag3',
      // Regular values
      regularString: 'just a string',
      count: '42',
      // Nested object flattened
      'nested.level1.level2': 'value',
    });
  });

  it('should handle react-native referrer', () => {
    const provider = createProvider();
    const rawEvent = {
      event: 'ec_search_error',
      properties: {
        time: 1759947367,
        distinct_id: '3385916',
        $browser: 'Mobile Safari',
        $browser_version: null,
        $city: 'Bengaluru',
        $current_url:
          'https://web.landeed.com/karnataka/ec-encumbrance-certificate',
        $device: 'iPhone',
        $device_id:
          '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
        $initial_referrer: 'https://www.google.com/',
        $initial_referring_domain: 'www.google.com',
        $insert_id: 'bclkaepeqcfuzt4v',
        $lib_version: '2.60.0',
        $mp_api_endpoint: 'api-js.mixpanel.com',
        $mp_api_timestamp_ms: 1759927570699,
        $os: 'iOS',
        $region: 'Karnataka',
        $screen_height: 852,
        $screen_width: 393,
        $search_engine: 'google',
        $user_id: '3385916',
        binaryReadableVersion: 'NA',
        binaryVersion: 'NA',
        component: '/karnataka/ec-encumbrance-certificate',
        errMsg: 'Request failed with status code 500',
        errType: 'SERVER_ERROR',
        isSilentSearch: false,
        isTimeout: false,
        jsVersion: '0.42.0',
        language: 'english',
        mp_country_code: 'IN',
        mp_lib: 'web',
        mp_processing_time_ms: 1759927592421,
        mp_sent_by_lib_version: '2.60.0',
        os: 'web',
        osVersion:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        phoneBrand: 'NA',
        phoneManufacturer: 'NA',
        phoneModel: 'NA',
        searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
        serverVersion: 'web2.0',
        state: 17,
        stateStr: '17',
        statusCode: 500,
        type: 'result_event',
        utm_medium: 'cpc',
        utm_source:
          'google%26utm_medium=cpc%26utm_campaignid=21380769590%26utm_adgroupid=%26utm_adid=%26utm_term=%26utm_device=m%26utm_network=%26utm_location=9062055%26gclid=%26gad_campaignid=21374496705%26gbraid=0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3%26wbraid=CmAKCQjwi4PHBhCUA',
      },
    };

    const res = provider.transformEvent(rawEvent);

    expect(res.id.length).toBeGreaterThan(30);
    expect(res.imported_at).toMatch(
      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
    );
    // browser_version is checked loosely: the former expectation was the
    // literal string 'null' (String(null)), which is not a meaningful version.
    expect(typeof res.browser_version).toBe('string');
    expect(omit(['id', 'imported_at', 'browser_version'], res)).toEqual({
      brand: 'Apple',
      browser: 'GSA',
      city: 'Bengaluru',
      country: 'IN',
      created_at: '2025-10-08T18:16:07.000Z',
      device: 'mobile',
      device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
      duration: 0,
      latitude: null,
      longitude: null,
      model: 'iPhone',
      name: 'ec_search_error',
      origin: 'https://web.landeed.com',
      os: 'iOS',
      os_version: '18.7.0',
      path: '/karnataka/ec-encumbrance-certificate',
      profile_id: '3385916',
      project_id: 'pid',
      properties: {
        __lib_version: '2.60.0',
        '__query.gad_campaignid': '21374496705',
        '__query.gbraid': '0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3',
        '__query.utm_campaignid': '21380769590',
        '__query.utm_device': 'm',
        '__query.utm_location': '9062055',
        '__query.utm_medium': 'cpc',
        '__query.utm_source': 'google',
        '__query.wbraid': 'CmAKCQjwi4PHBhCUA',
        __screen: '393x852',
        __source_insert_id: 'bclkaepeqcfuzt4v',
        __userAgent:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        binaryReadableVersion: 'NA',
        binaryVersion: 'NA',
        component: '/karnataka/ec-encumbrance-certificate',
        errMsg: 'Request failed with status code 500',
        errType: 'SERVER_ERROR',
        isSilentSearch: 'false',
        isTimeout: 'false',
        jsVersion: '0.42.0',
        language: 'english',
        os: 'web',
        osVersion:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        phoneBrand: 'NA',
        phoneManufacturer: 'NA',
        phoneModel: 'NA',
        searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
        serverVersion: 'web2.0',
        state: '17',
        stateStr: '17',
        statusCode: '500',
        type: 'result_event',
      },
      referrer: 'https://www.google.com',
      referrer_name: 'Google',
      referrer_type: 'search',
      region: 'Karnataka',
      sdk_name: 'mixpanel (web)',
      sdk_version: '1.0.0',
      session_id: '',
    });
  });
});

View File

@@ -0,0 +1,452 @@
import { randomUUID } from 'node:crypto';
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IMixpanelImportConfig } from '@openpanel/validation';
import { z } from 'zod';
import { BaseImportProvider } from '../base-provider';
/**
 * Shape of one raw Mixpanel event: an event name plus a free-form
 * property bag. Property values are left unvalidated (`unknown`).
 */
export const zMixpanelRawEvent = z.object({
event: z.string(),
properties: z.record(z.unknown()),
});
/** Raw Mixpanel event type inferred from zMixpanelRawEvent. */
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
/**
 * Import provider that streams raw events from the Mixpanel export API and
 * converts them to ClickHouse events.
 */
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
  provider = 'mixpanel';
  version = '1.0.0';

  /**
   * @param projectId - Project id written to every transformed event
   * @param config - Mixpanel import settings (credentials, Mixpanel project id, date range)
   * @param logger - Optional logger; used to log each export request
   */
  constructor(
    private readonly projectId: string,
    private readonly config: IMixpanelImportConfig,
    private readonly logger?: ILogger,
  ) {
    super();
  }

  /**
   * Mixpanel offers no good way to count events within a period (JQL would
   * work but is inaccurate and scheduled for deprecation), so this always
   * returns -1.
   */
  async getTotalEventsCount(): Promise<number> {
    return -1;
  }

  /**
   * Mixpanel doesn't provide session IDs, so we need to generate them in SQL
   * after all events are imported to ensure deterministic results
   */
  shouldGenerateSessionIds(): boolean {
    return true;
  }

  /**
   * Streams raw events for the configured date range.
   *
   * @param overrideFrom - Start date (YYYY-MM-DD) used instead of `config.from`
   */
  async *parseSource(
    overrideFrom?: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    yield* this.fetchEventsFromMixpanel(overrideFrom);
  }

  /** Fetches the date range one chunk at a time and yields every event. */
  private async *fetchEventsFromMixpanel(
    overrideFrom?: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    const { serviceAccount, serviceSecret, projectId, from, to } = this.config;

    // Split the range into one-day chunks (the getDateChunks default) so a
    // large date range does not run into request timeouts.
    const dateChunks = this.getDateChunks(overrideFrom ?? from, to);

    for (const [chunkFrom, chunkTo] of dateChunks) {
      yield* this.fetchEventsForDateRange(
        serviceAccount,
        serviceSecret,
        projectId,
        chunkFrom,
        chunkTo,
      );
    }
  }

  /**
   * Requests the export for one date range and yields each newline-delimited
   * JSON event as it arrives.
   *
   * @throws Error when the response is not OK or has no body
   */
  private async *fetchEventsForDateRange(
    serviceAccount: string,
    serviceSecret: string,
    projectId: string,
    from: string,
    to: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    const url = 'https://data.mixpanel.com/api/2.0/export';
    const params = new URLSearchParams({
      from_date: from,
      to_date: to,
      project_id: projectId,
    });

    this.logger?.info('Fetching events from Mixpanel', {
      url: `${url}?${params}`,
      from,
      to,
      projectId,
      serviceAccount,
    });

    const response = await fetch(`${url}?${params}`, {
      method: 'GET',
      headers: {
        Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
        Accept: 'application/json',
      },
    });

    if (!response.ok) {
      throw new Error(
        `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
      );
    }

    if (!response.body) {
      throw new Error('No response body from Mixpanel API');
    }

    // Stream the response line by line
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });

        // Process complete lines; keep the trailing partial line buffered
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';

        for (const line of lines) {
          yield* this.parseExportLine(line, 'Failed to parse Mixpanel event:');
        }
      }

      // Flush bytes the decoder may still hold, then handle the last line
      buffer += decoder.decode();
      yield* this.parseExportLine(
        buffer,
        'Failed to parse final Mixpanel event:',
      );
    } finally {
      // Cancel so the download is released when the consumer stops early.
      // cancel() rejects if the stream already errored; that error has been
      // (or is being) reported through read(), so it is ignored here.
      await reader.cancel().catch(() => undefined);
      reader.releaseLock();
    }
  }

  /**
   * Parses one export line, yielding zero or one event. Blank lines are
   * skipped; unparsable lines are logged with `warning` and skipped.
   * The yield sits outside the try block so an error thrown into the
   * generator by the consumer is not mistaken for a parse failure.
   */
  private *parseExportLine(
    line: string,
    warning: string,
  ): Generator<MixpanelRawEvent, void, unknown> {
    if (!line.trim()) return;

    let event: MixpanelRawEvent;
    try {
      event = JSON.parse(line);
    } catch {
      console.warn(warning, line);
      return;
    }
    yield event;
  }

  /** Returns true when the raw event matches zMixpanelRawEvent. */
  validate(rawEvent: MixpanelRawEvent): boolean {
    const res = zMixpanelRawEvent.safeParse(rawEvent);
    return res.success;
  }

  /**
   * Converts a raw Mixpanel event into a ClickHouse event.
   *
   * @throws ZodError when the raw event does not match zMixpanelRawEvent
   * @throws RangeError when `properties.time` is not a valid timestamp
   *   (raised by `toISOString()`)
   */
  transformEvent(_rawEvent: MixpanelRawEvent): IClickhouseEvent {
    const projectId = this.projectId;
    const rawEvent = zMixpanelRawEvent.parse(_rawEvent);
    const props = rawEvent.properties as Record<string, any>;
    const deviceId = props.$device_id;
    // Fall back to the device id (then '') so a missing id never becomes
    // the literal string "undefined".
    const profileId = String(
      props.$user_id || props.distinct_id || deviceId || '',
    ).replace(/^\$device:/, '');

    // Derive path/origin/hash/query from the current URL (web only)
    const fullUrl = props.$current_url;
    let path = '';
    let origin = '';
    let hash = '';
    let query: Record<string, string> = {};
    if (fullUrl) {
      const parsed = parsePath(fullUrl);
      path = parsed.path || '';
      origin = parsed.origin || '';
      hash = parsed.hash || '';
      query = parsed.query || {};
    } else if (this.config.mapScreenViewProperty) {
      path = props[this.config.mapScreenViewProperty] || '';
    }

    // Extract referrer information (web only)
    const referrerUrl = props.$initial_referrer || props.$referrer || '';
    const referrer =
      referrerUrl && !isSameDomain(referrerUrl, fullUrl)
        ? parseReferrer(referrerUrl)
        : null;

    // Check for UTM referrer in query params (web only)
    const utmReferrer = getReferrerWithQuery(query);

    // Extract location data
    const country = props.$country || props.mp_country_code || '';
    const city = props.$city || '';
    const region = props.$region || '';

    // Client SDK events: parse the user agent carried in `osVersion`;
    // everything else: read device info straight from the properties
    const userAgent = props.osVersion || '';
    const uaInfo = this.isWebEvent(props.mp_lib)
      ? parseUserAgent(userAgent, props)
      : this.parseServerDeviceInfo(props);

    // Map event name - $mp_web_page_view should be screen_view
    let eventName = rawEvent.event;
    if (eventName === '$mp_web_page_view') {
      eventName = 'screen_view';
    }

    // Build properties object - strip Mixpanel-specific properties
    const properties = this.stripMixpanelProperties(props, query);

    if (props.$insert_id) {
      properties.__source_insert_id = String(props.$insert_id);
    }
    // Add useful properties
    if (props.$screen_width && props.$screen_height) {
      properties.__screen = `${props.$screen_width}x${props.$screen_height}`;
    }
    if (props.$screen_dpi) {
      properties.__dpi = props.$screen_dpi;
    }
    if (props.$language) {
      properties.__language = props.$language;
    }
    if (props.$timezone) {
      properties.__timezone = props.$timezone;
    }
    if (props.$app_version) {
      properties.__version = props.$app_version;
    }
    if (props.$app_build_number) {
      properties.__buildNumber = props.$app_build_number;
    }
    if (props.$lib_version) {
      properties.__lib_version = props.$lib_version;
    }
    if (hash) {
      properties.__hash = hash;
    }
    if (Object.keys(query).length > 0) {
      properties.__query = query;
    }
    if (props.current_page_title) {
      properties.__title = props.current_page_title;
    }
    if (userAgent) {
      properties.__userAgent = userAgent;
    }

    // Always use UUID for id to match ClickHouse UUID column
    const event = {
      id: randomUUID(),
      name: eventName,
      device_id: deviceId,
      profile_id: profileId,
      project_id: projectId,
      session_id: '', // Will be generated in SQL after import
      properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
      created_at: new Date(props.time * 1000).toISOString(),
      country,
      city,
      region,
      longitude: null,
      latitude: null,
      os: uaInfo.os || props.$os,
      os_version: uaInfo.osVersion || props.$os_version,
      browser: uaInfo.browser || props.$browser,
      // Prefer the parsed user agent; otherwise use Mixpanel's value. The
      // parentheses matter: without them the `||` binds to the condition and
      // a null $browser_version is emitted as the string "null".
      browser_version:
        uaInfo.browserVersion ||
        (props.$browser_version ? String(props.$browser_version) : ''),
      device: this.getDeviceType(props.mp_lib, uaInfo, props),
      brand: uaInfo.brand || '',
      model: uaInfo.model || '',
      duration: 0,
      path,
      origin,
      referrer: referrer?.url || '',
      referrer_name: utmReferrer?.name || referrer?.name || '',
      referrer_type: referrer?.type || utmReferrer?.type || '',
      imported_at: new Date().toISOString(),
      sdk_name: props.mp_lib
        ? `${this.provider} (${props.mp_lib})`
        : this.provider,
      sdk_version: this.version,
    };

    // TODO: Remove this
    // Temporary fix for a client
    const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
    if (isMightBeScreenView && event.name === 'Loaded a Screen') {
      event.name = 'screen_view';
      event.path = isMightBeScreenView;
    }

    // TODO: Remove this
    // This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
    // Some projects pack a whole encoded query string into utm_source, e.g.
    // "google%26utm_medium=cpc%26..."; unpack it into __query.* properties.
    const packedUtm = props.utm_source ? String(props.utm_source) : '';
    if (packedUtm && !properties.__query?.utm_source) {
      let decoded = packedUtm;
      try {
        decoded = decodeURIComponent(packedUtm);
      } catch {
        // Malformed percent-encoding: fall back to the raw value
      }
      for (const item of decoded.split('&')) {
        const separator = item.indexOf('=');
        const key = separator === -1 ? item : item.slice(0, separator);
        const value = separator === -1 ? undefined : item.slice(separator + 1);
        if (key && value) {
          event.properties[`__query.${key}`] = value;
        } else if (value === undefined && key && packedUtm.startsWith(key)) {
          // The leading bare token is the actual utm_source value
          event.properties['__query.utm_source'] = key;
        }
      }
    }

    return event;
  }

  /**
   * Classifies the device as 'mobile', 'tablet', 'desktop' or 'server' from
   * the SDK library name, OS and browser.
   */
  private getDeviceType(
    mp_lib: string,
    uaInfo: UserAgentInfo,
    props: Record<string, any>,
  ) {
    // Normalize lib/os/browser data
    const lib = (mp_lib || '').toLowerCase();
    const os = String(props.$os || uaInfo.os || '').toLowerCase();
    const browser = String(
      props.$browser || uaInfo.browser || '',
    ).toLowerCase();

    const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';

    // Strong hint from SDK library
    if (['android', 'iphone', 'react-native', 'swift', 'unity'].includes(lib)) {
      return isTabletOs ? 'tablet' : 'mobile';
    }

    // Web or unknown SDKs: infer from OS/Browser.
    // Tablets are checked first: their signals overlap with the mobile ones
    // ("mobile safari"), which would otherwise always win.
    const isTabletSignal =
      isTabletOs ||
      browser.includes('tablet') ||
      // iPad often reports as Mac OS X with Mobile Safari
      (browser.includes('mobile safari') &&
        (os === 'mac os x' || os === 'macos'));
    if (isTabletSignal) {
      return 'tablet';
    }

    const isMobileSignal =
      os === 'ios' ||
      os === 'android' ||
      browser.includes('mobile safari') ||
      browser.includes('chrome ios') ||
      browser.includes('android mobile') ||
      browser.includes('samsung internet') ||
      browser.includes('mobile');
    if (isMobileSignal) {
      return 'mobile';
    }

    // Default to desktop
    return this.isServerEvent(mp_lib) ? 'server' : 'desktop';
  }

  /** True when the event was sent by one of the listed client SDKs (web or mobile). */
  private isWebEvent(mp_lib: string) {
    return [
      'web',
      'android',
      'iphone',
      'swift',
      'unity',
      'react-native',
    ].includes(mp_lib);
  }

  /** True for every event that did not come from a listed client SDK. */
  private isServerEvent(mp_lib: string) {
    return !this.isWebEvent(mp_lib);
  }

  /**
   * Returns the first property key made up only of the characters A-Z, 1-9
   * and underscore, or undefined when there is none.
   * NOTE(review): the character class is `1-9`, so keys containing `0` never
   * match — confirm whether `0-9` was intended.
   */
  private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
    const props = rawEvent.properties as Record<string, any>;
    return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
  }

  /**
   * Builds device info for events that did not come from a client SDK,
   * reading it straight from the event properties instead of a user agent.
   */
  private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
    const os = props.$os || props.os || '';
    const osVersion = props.$os_version || props.osVersion || '';
    const brand = props.$brand || props.phoneBrand || '';
    const model = props.$model || props.phoneModel || '';
    // String() guards against a non-string OS value
    const device = String(os).toLowerCase();

    return {
      isServer: true,
      os: os,
      osVersion: osVersion,
      browser: '',
      browserVersion: '',
      device: device,
      brand: brand,
      model: model,
    };
  }

  /**
   * Removes Mixpanel bookkeeping properties (`$…`, `mp_…`, `utm_…`, URL
   * parts and any key that is also a query parameter) and parses values
   * that look like JSON so they can be flattened later.
   *
   * @param properties - Raw event properties
   * @param searchParams - Query parameters of the current URL; their keys are stripped
   */
  private stripMixpanelProperties(
    properties: Record<string, any>,
    searchParams: Record<string, string>,
  ): Record<string, any> {
    const strip = new Set([
      'time',
      'distinct_id',
      'current_page_title',
      'current_url_path',
      'current_url_protocol',
      'current_url_search',
      'current_domain',
      ...Object.keys(searchParams),
    ]);
    const reservedPrefix = /^(\$|mp_|utm_)/;

    // Parse JSON strings back to objects/arrays so toDots() can flatten them
    const parsed: Record<string, any> = {};
    for (const [key, value] of Object.entries(properties)) {
      if (reservedPrefix.test(key) || strip.has(key)) continue;

      if (
        typeof value === 'string' &&
        (value.startsWith('{') || value.startsWith('['))
      ) {
        try {
          parsed[key] = JSON.parse(value);
        } catch {
          parsed[key] = value; // Keep as string if parsing fails
        }
      } else {
        parsed[key] = value;
      }
    }

    return parsed;
  }
}

View File

@@ -0,0 +1,382 @@
import { randomUUID } from 'node:crypto';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import { createBrotliDecompress, createGunzip } from 'node:zlib';
import { isSameDomain, parsePath } from '@openpanel/common';
import { generateDeviceId } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IUmamiImportConfig } from '@openpanel/validation';
import { parse } from 'csv-parse';
import { assocPath } from 'ramda';
import { z } from 'zod';
import { BaseImportProvider } from '../base-provider';
/**
 * Schema for one row of an Umami export. `event_type` and `created_at`
 * are coerced from their textual form to number and Date.
 */
export const zUmamiRawEvent = z.object({
// Required fields
event_type: z.coerce.number(),
event_name: z.string(),
created_at: z.coerce.date(),
event_id: z.string().min(1),
session_id: z.string().min(1),
website_id: z.string().min(1),
// Optional fields that might be empty
visit_id: z.string().optional(),
distinct_id: z.string().optional(),
url_path: z.string().optional(),
hostname: z.string().optional(),
referrer_domain: z.string().optional(),
referrer_path: z.string().optional(),
referrer_query: z.string().optional(),
referrer_name: z.string().optional(),
referrer_type: z.string().optional(),
country: z.string().optional(),
city: z.string().optional(),
region: z.string().optional(),
browser: z.string().optional(),
os: z.string().optional(),
device: z.string().optional(),
screen: z.string().optional(),
language: z.string().optional(),
utm_source: z.string().optional(),
utm_medium: z.string().optional(),
utm_campaign: z.string().optional(),
utm_content: z.string().optional(),
utm_term: z.string().optional(),
page_title: z.string().optional(),
gclid: z.string().optional(),
fbclid: z.string().optional(),
msclkid: z.string().optional(),
ttclid: z.string().optional(),
li_fat_id: z.string().optional(),
twclid: z.string().optional(),
url_query: z.string().optional(),
});
/** Parsed Umami row type inferred from zUmamiRawEvent. */
export type UmamiRawEvent = z.infer<typeof zUmamiRawEvent>;
export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
provider = 'umami';
version = '1.0.0';
/**
 * @param projectId - Project id used for events whose website_id has no entry in config.projectMapper
 * @param config - Umami import settings; supplies the file URL and the project mapping
 * @param logger - Optional logger
 */
constructor(
private readonly projectId: string,
private readonly config: IUmamiImportConfig,
private readonly logger?: ILogger,
) {
super();
}
/** Always resolves to -1; this provider does not compute a total. */
async getTotalEventsCount(): Promise<number> {
  const unknownTotal = -1;
  return unknownTotal;
}
/** Streams every row of the configured remote file. */
async *parseSource(): AsyncGenerator<UmamiRawEvent, void, unknown> {
  const rows = this.parseRemoteFile(this.config.fileUrl);
  for await (const row of rows) {
    yield row;
  }
}
/**
 * Downloads a CSV file (optionally gzip/brotli compressed) and yields one
 * record per row while the download is still streaming.
 *
 * @param url - Location of the remote file
 * @param opts - `signal` cancels the download; `maxBytes` caps the number of
 *   bytes read from the response body; `maxRows` caps the number of rows
 * @throws Error when the request fails, a limit is exceeded or parsing fails
 */
private async *parseRemoteFile(
  url: string,
  opts: {
    signal?: AbortSignal;
    maxBytes?: number;
    maxRows?: number;
  } = {},
): AsyncGenerator<UmamiRawEvent, void, unknown> {
  const { signal, maxBytes, maxRows } = opts;
  const controller = new AbortController();

  // Link to caller's signal for cancellation. A signal that is already
  // aborted never fires its event, so handle that case explicitly.
  if (signal) {
    if (signal.aborted) {
      controller.abort();
    } else {
      signal.addEventListener('abort', () => controller.abort(), {
        once: true,
      });
    }
  }

  const res = await fetch(url, { signal: controller.signal });
  if (!res.ok || !res.body) {
    throw new Error(
      `Failed to fetch remote file: ${res.status} ${res.statusText}`,
    );
  }

  const contentType = res.headers.get('content-type') || '';
  const contentEnc = res.headers.get('content-encoding') || '';
  const contentLen = Number(res.headers.get('content-length') ?? 0);

  if (
    contentType &&
    !/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
      contentType,
    )
  ) {
    console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
  }

  if (maxBytes && contentLen && contentLen > maxBytes) {
    throw new Error(
      `Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
    );
  }

  // fetch() already undoes any Content-Encoding, so the body needs another
  // decompression pass only when the file itself is an archive (by URL
  // suffix or Content-Type) and the transport did not decode that format.
  const transportGzip = /gzip/i.test(contentEnc);
  const transportBr = /\bbr\b/i.test(contentEnc);
  const looksGzip =
    !transportGzip &&
    (/\.gz($|\?)/i.test(url) || /application\/gzip/i.test(contentType));
  const looksBr = !transportBr && /\.br($|\?)/i.test(url);

  // WHATWG -> Node stream
  const body = Readable.fromWeb(res.body as any);

  // Optional size guard during stream
  let seenBytes = 0;
  if (maxBytes) {
    body.on('data', (chunk: Buffer) => {
      seenBytes += chunk.length;
      if (seenBytes > maxBytes) {
        controller.abort();
        body.destroy(
          new Error(
            `Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
          ),
        );
      }
    });
  }

  // Build decode chain (gzip/brotli -> CSV parser)
  const decompress = looksGzip
    ? createGunzip()
    : looksBr
      ? createBrotliDecompress()
      : null;

  const parser = parse({
    columns: true, // objects per row
    bom: true, // handle UTF-8 BOM
    relax_column_count: true,
    skip_empty_lines: true,
  });

  // Wire the pipeline for proper backpressure & error propagation.
  // It runs in the background; failures are forwarded to the parser so
  // they surface in the for-await loop below.
  (async () => {
    try {
      if (decompress) {
        await pipeline(body, decompress, parser, {
          signal: controller.signal,
        });
      } else {
        await pipeline(body, parser, { signal: controller.signal });
      }
    } catch (e) {
      parser.destroy(e as Error);
    }
  })().catch(() => {
    /* handled by iterator */
  });

  let rows = 0;
  try {
    for await (const record of parser) {
      rows++;
      if (maxRows && rows > maxRows) {
        controller.abort();
        throw new Error(`Row limit exceeded (${rows} > ${maxRows})`);
      }
      yield record as UmamiRawEvent;
    }
  } catch (err) {
    throw new Error(
      `Failed to parse remote file from ${url}: ${
        err instanceof Error ? err.message : String(err)
      }`,
    );
  } finally {
    controller.abort(); // ensure fetch stream is torn down
  }
}
/** Returns true when the row matches zUmamiRawEvent. */
validate(rawEvent: UmamiRawEvent): boolean {
  return zUmamiRawEvent.safeParse(rawEvent).success;
}
/**
 * Converts one Umami row into a ClickHouse event.
 *
 * @throws ZodError when the row does not match zUmamiRawEvent
 */
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
  // Map the Umami website to a project; fall back to the default project
  const projectId =
    this.config.projectMapper?.find(
      (mapper) => mapper.from === _rawEvent.website_id,
    )?.to || this.projectId;

  const rawEvent = zUmamiRawEvent.parse(_rawEvent);

  // Use visit_id as device_id. When it is missing, derive a stable id from
  // session_id (always present) — hashing the empty visit_id would give
  // every such event the same device id.
  const deviceId =
    rawEvent.visit_id ||
    generateDeviceId({
      ip: rawEvent.session_id,
      ua: rawEvent.session_id,
      origin: projectId,
      salt: 'xxx',
    });
  const profileId = rawEvent.distinct_id || deviceId;

  // Rebuild the page URL. A url_query without a leading "?" needs one,
  // otherwise it is glued onto the path and the query parameters are lost.
  const rawQuery = rawEvent.url_query ?? '';
  const search =
    rawQuery && !rawQuery.startsWith('?') ? `?${rawQuery}` : rawQuery;
  const url = rawEvent.url_path
    ? `https://${rawEvent.hostname ?? ''}${rawEvent.url_path}${search}`
    : '';
  const { path, hash, query, origin } = parsePath(url);

  // Extract referrer information - use same logic as real-time events
  const referrerUrl = rawEvent.referrer_domain
    ? `https://${rawEvent.referrer_domain}${rawEvent.referrer_path || ''}`
    : '';

  // Check if referrer is from same domain (like real-time events do)
  const referrer = isSameDomain(referrerUrl, url)
    ? null
    : parseReferrer(referrerUrl);

  // Check for UTM referrer in query params (like real-time events do)
  const utmReferrer = getReferrerWithQuery(query);

  let properties: Record<string, any> = {};

  if (query) {
    properties.__query = query;
  }

  // Add useful properties from Umami data
  if (rawEvent.page_title) properties.__title = rawEvent.page_title;
  if (rawEvent.screen) properties.__screen = rawEvent.screen;
  if (rawEvent.language) properties.__language = rawEvent.language;

  // UTM columns override whatever was parsed from the URL query
  const utmKeys = [
    'utm_source',
    'utm_medium',
    'utm_campaign',
    'utm_content',
    'utm_term',
  ] as const;
  for (const key of utmKeys) {
    const value = rawEvent[key];
    if (value) {
      properties = assocPath(['__query', key], value, properties);
    }
  }

  return {
    id: rawEvent.event_id || randomUUID(),
    name: rawEvent.event_type === 1 ? 'screen_view' : rawEvent.event_name,
    device_id: deviceId,
    profile_id: profileId,
    project_id: projectId,
    session_id: rawEvent.session_id || '',
    properties,
    created_at: rawEvent.created_at.toISOString(),
    country: rawEvent.country || '',
    city: rawEvent.city || '',
    region: this.mapRegion(rawEvent.region || ''),
    longitude: null,
    latitude: null,
    os: rawEvent.os || '',
    os_version: '', // Not available in Umami CSV
    browser: this.mapBrowser(rawEvent.browser || ''),
    browser_version: '', // Not available in Umami CSV
    device: this.mapDevice(rawEvent.device || ''),
    brand: '', // Not available in Umami CSV
    model: '', // Not available in Umami CSV
    duration: 0,
    path,
    origin,
    referrer: utmReferrer?.url || referrer?.url || '',
    referrer_name: utmReferrer?.name || referrer?.name || '',
    referrer_type: utmReferrer?.type || referrer?.type || '',
    imported_at: new Date().toISOString(),
    sdk_name: this.provider,
    sdk_version: this.version,
  };
}
/**
 * Strip a leading two-uppercase-letter prefix followed by a hyphen
 * (e.g. `US-CA` becomes `CA`). Values without such a prefix are returned
 * unchanged.
 *
 * @param region - Region value from the raw event.
 * @returns The region without the leading `XX-` prefix.
 */
mapRegion(region: string): string {
  const leadingPrefix = /^[A-Z]{2}\-/;
  const withoutPrefix = region.replace(leadingPrefix, '');
  return withoutPrefix;
}
/**
 * Normalise a device value to one of `desktop`, `mobile`, `tablet` or
 * `smarttv`.
 *
 * `mobile`, `tablet` and `smarttv` pass through; `desktop`, `laptop`,
 * `Unknown`, the empty string and every unrecognised value become `desktop`.
 *
 * A `switch` is used instead of a plain-object lookup so that inputs which
 * happen to match inherited object keys (`constructor`, `toString`,
 * `__proto__`, ...) also fall back to `desktop` instead of returning a
 * non-string prototype member.
 *
 * @param device - Device value from the raw event.
 * @returns The normalised device type.
 */
mapDevice(device: string): string {
  switch (device) {
    case 'mobile':
    case 'tablet':
    case 'smarttv':
      return device;
    case 'desktop':
    case 'laptop':
    case 'Unknown':
    default:
      return 'desktop';
  }
}
/**
 * Translate a lowercase browser identifier into a display name.
 *
 * Known identifiers are mapped to their display name; unknown non-empty
 * values are returned unchanged; an empty value becomes `Unknown`.
 *
 * The lookup uses a `Map` rather than a plain object so that inputs which
 * happen to match inherited object keys (`constructor`, `toString`,
 * `__proto__`, ...) are passed through as ordinary unknown strings instead
 * of returning a non-string prototype member.
 *
 * @param browser - Browser identifier from the raw event.
 * @returns The display name, the input itself, or `Unknown`.
 */
mapBrowser(browser: string): string {
  const displayNames = new Map<string, string>([
    ['android', 'Android'],
    ['aol', 'AOL'],
    ['bb10', 'BlackBerry 10'],
    ['beaker', 'Beaker'],
    ['chrome', 'Chrome'],
    ['chromium-webview', 'Chrome (webview)'],
    ['crios', 'Chrome (iOS)'],
    ['curl', 'Curl'],
    ['edge', 'Edge'],
    ['edge-chromium', 'Edge (Chromium)'],
    ['edge-ios', 'Edge (iOS)'],
    ['facebook', 'Facebook'],
    ['firefox', 'Firefox'],
    ['fxios', 'Firefox (iOS)'],
    ['ie', 'IE'],
    ['instagram', 'Instagram'],
    ['ios', 'iOS'],
    ['ios-webview', 'iOS (webview)'],
    ['kakaotalk', 'KakaoTalk'],
    ['miui', 'MIUI'],
    ['opera', 'Opera'],
    ['opera-mini', 'Opera Mini'],
    ['phantomjs', 'PhantomJS'],
    ['safari', 'Safari'],
    ['samsung', 'Samsung'],
    ['searchbot', 'Searchbot'],
    ['silk', 'Silk'],
    ['yandexbrowser', 'Yandex'],
  ]);
  return displayNames.get(browser) || browser || 'Unknown';
}
}

View File

@@ -0,0 +1,80 @@
import type {
IImportedEvent,
IServiceCreateEventPayload,
IServiceImportedEventPayload,
} from '@openpanel/db';
/**
 * Configuration for one import: which project and provider it concerns and
 * where the source data comes from.
 */
export interface ImportConfig {
projectId: string;
provider: string;
/** Whether the source is a file or an API. */
sourceType: 'file' | 'api';
/** Where the source lives; presumably a path or URL depending on `sourceType` — TODO confirm. */
sourceLocation: string;
}
/**
 * State kept for one session: its id plus the timestamp and payload of the
 * last event recorded for it.
 *
 * NOTE(review): the unit of `lastTimestamp` (ms vs. s) is not visible here — confirm.
 */
export interface SessionInfo {
id: string;
lastTimestamp: number;
lastEvent: IServiceImportedEventPayload;
}
/**
 * Progress counters for an import, expressed both in events and in batches.
 */
export interface ImportProgress {
totalEvents: number;
processedEvents: number;
currentBatch: number;
totalBatches: number;
}
/**
 * Outcome of an import: a success flag, event counters and an optional
 * error message.
 */
export interface ImportResult {
success: boolean;
totalEvents: number;
processedEvents: number;
/** Optional error message; presumably only set when `success` is false — TODO confirm. */
error?: string;
}
/**
 * Event payloads produced for one batch, split into two lists.
 *
 * NOTE(review): how `sessionEvents` differs from `events` is not visible in
 * this file — confirm against the code that fills it.
 */
export interface BatchResult {
events: IServiceImportedEventPayload[];
sessionEvents: IServiceImportedEventPayload[];
}
/**
 * Generic base type for raw events from different providers: an open record
 * of string keys to values of unknown type. Provider-specific raw event
 * types are expected to narrow this.
 */
export interface BaseRawEvent {
[key: string]: unknown;
}
/**
 * Error context for better error handling. Every field is optional, so a
 * caller supplies only what it knows about where the failure happened.
 */
export interface ErrorContext {
batchNumber?: number;
batchSize?: number;
/** Index of the failing event; presumably relative to the batch — TODO confirm. */
eventIndex?: number;
rawEvent?: BaseRawEvent;
provider?: string;
}
/**
 * Properties type for events - more specific than Record<string, any>.
 *
 * Arbitrary keys may hold primitives, null/undefined or a nested record;
 * the double-underscore keys below are declared explicitly with narrower
 * types.
 */
export interface EventProperties {
[key: string]:
| string
| number
| boolean
| null
| undefined
| Record<string, unknown>;
__query?: Record<string, unknown>;
__title?: string;
__screen?: string;
__language?: string;
}
/**
 * Import job metadata for tracking import progress: the import's id, its
 * current status and an import timestamp.
 */
export interface ImportJobMetadata {
importId: string;
/** One of four status values; the allowed transitions between them are not defined here. */
importStatus: 'pending' | 'processing' | 'processed' | 'failed';
importedAt: Date;
}
/**
 * Result of import staging operations: the import's id together with the
 * total and inserted event counts.
 */
export interface ImportStageResult {
importId: string;
totalEvents: number;
insertedEvents: number;
}