feat: new importer (#214)

This commit is contained in:
Carl-Gerhard Lindesvärd
2025-11-05 09:49:36 +01:00
committed by GitHub
parent b51bc8f3f6
commit 212254d31a
80 changed files with 4884 additions and 842 deletions

View File

@@ -1,33 +0,0 @@
{
"name": "@openpanel/cli",
"version": "0.0.1-beta",
"type": "module",
"module": "index.ts",
"bin": {
"openpanel": "dist/bin/cli.js"
},
"scripts": {
"build": "rm -rf dist && tsup",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@openpanel/common": "workspace:*",
"arg": "^5.0.2",
"glob": "^10.4.3",
"inquirer": "^9.3.5",
"p-limit": "^6.1.0",
"progress": "^2.0.3",
"ramda": "^0.29.1",
"zod": "catalog:"
},
"devDependencies": {
"@openpanel/db": "workspace:^",
"@openpanel/sdk": "workspace:*",
"@openpanel/tsconfig": "workspace:*",
"@types/node": "catalog:",
"@types/progress": "^2.0.7",
"@types/ramda": "^0.30.1",
"tsup": "^7.2.0",
"typescript": "catalog:"
}
}

View File

@@ -1,24 +0,0 @@
import arg from 'arg';
import importer from './importer';

/**
 * CLI entry point: parses the top-level command and dispatches to the
 * matching sub-command handler. Unknown flags are tolerated (`permissive`)
 * so sub-commands can parse their own options.
 */
function cli() {
  const args = arg(
    {
      '--help': Boolean,
    },
    {
      permissive: true,
    },
  );

  const [command] = args._;
  if (command === 'import') {
    return importer();
  }
}

cli();

View File

@@ -1,467 +0,0 @@
import { randomUUID } from 'node:crypto';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import readline from 'node:readline';
import zlib from 'node:zlib';
import Progress from 'progress';
import { assocPath, prop, uniqBy } from 'ramda';
import { isSameDomain, parsePath } from '@openpanel/common';
import type { IImportedEvent } from '@openpanel/db';
// Events are accumulated into batches of this size before being uploaded.
const BATCH_SIZE = 30_000;
// Milliseconds to pause after each successful upload (simple rate limiting).
const SLEEP_TIME = 20;
// Upper bound on batches POSTed to the API concurrently.
const MAX_CONCURRENT_REQUESTS = 8;

// Shape of one event in a Mixpanel JSON export. Only the properties this
// importer reads are declared; everything else flows through the index
// signature and is preserved (minus internal keys) by stripMixpanelProperties.
type IMixpanelEvent = {
  event: string;
  properties: {
    [key: string]: unknown;
    time: number; // Unix timestamp in seconds (multiplied by 1000 downstream)
    $current_url?: string;
    distinct_id?: string;
    $device_id?: string;
    country_code?: string;
    $region?: string;
    $city?: string;
    $os?: string;
    $browser?: string;
    $browser_version?: string;
    $initial_referrer?: string;
    $search_engine?: string;
  };
};
/**
 * Removes Mixpanel-internal keys from a property bag: anything prefixed with
 * `$` or `mp_`, plus the reserved `time` and `distinct_id` fields. Returns a
 * new object; the input is not mutated.
 */
function stripMixpanelProperties(obj: Record<string, unknown>) {
  const internalPrefix = /^(\$|mp_)/;
  const reservedKeys = ['time', 'distinct_id'];
  const cleaned: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(obj)) {
    if (!internalPrefix.test(key) && !reservedKeys.includes(key)) {
      cleaned[key] = value;
    }
  }
  return cleaned;
}
/**
 * Incrementally parses a stream containing one or more top-level JSON values,
 * yielding each value as soon as its braces balance out. Values that fail
 * JSON.parse are logged and skipped rather than aborting the stream.
 *
 * NOTE(review): balance is tracked by counting `{` / `}` characters per line,
 * so a brace inside a string literal would skew the count — assumes Mixpanel
 * exports never contain such values; confirm against real dumps.
 */
async function* parseJsonStream(
  fileStream: fs.ReadStream,
): AsyncGenerator<any, void, unknown> {
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Number.POSITIVE_INFINITY, // treat \r\n as one line break
  });
  let buffer = ''; // text of the value currently being assembled
  let bracketCount = 0; // open-minus-close brace balance
  for await (const line of rl) {
    buffer += line;
    bracketCount +=
      (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
    // Balanced again: buffer should now hold one complete JSON value.
    if (bracketCount === 0 && buffer.trim()) {
      try {
        const json = JSON.parse(buffer);
        yield json;
      } catch (error) {
        // Malformed chunk: report and keep going with the next value.
        console.log('Warning: Failed to parse JSON');
        console.log('Buffer:', buffer);
      }
      buffer = '';
    }
  }
  // Flush whatever is left once the stream ends.
  if (buffer.trim()) {
    try {
      const json = JSON.parse(buffer);
      yield json;
    } catch (error) {
      console.log('Warning: Failed to parse remaining JSON');
      console.log('Buffer:', buffer);
    }
  }
}
// A reconstructed visitor session: consecutive events attributed to the same
// device and/or profile with no gap longer than thirty minutes between them.
interface Session {
  start: number; // epoch ms of the session's first event
  end: number; // epoch ms of the most recent event added so far
  profileId?: string;
  deviceId?: string;
  sessionId: string;
  firstEvent?: IImportedEvent;
  lastEvent?: IImportedEvent;
  events: IImportedEvent[];
}

/**
 * Rebuilds sessions from a flat list of imported events using a 30-minute
 * inactivity window. Sessions are tracked independently per device id and per
 * profile id; when one event links an open device session to a different open
 * profile session, the two are merged into one session that keeps the device
 * session's id.
 *
 * NOTE: sorts `events` in place by created_at as a side effect.
 *
 * NOTE(review): events whose device_id equals their profile_id can never open
 * a new session (both creation branches exclude that case) — they only join
 * already-open sessions. Presumably intentional for Mixpanel "$device:"-style
 * ids, but worth confirming.
 */
function generateSessionEvents(events: IImportedEvent[]): Session[] {
  let sessionList: Session[] = [];
  // Most recently opened session per device id / per profile id.
  const lastSessionByDevice: Record<string, Session> = {};
  const lastSessionByProfile: Record<string, Session> = {};
  const thirtyMinutes = 30 * 60 * 1000;
  events.sort(
    (a, b) =>
      new Date(a.created_at).getTime() - new Date(b.created_at).getTime(),
  );
  for (const event of events) {
    const eventTime = new Date(event.created_at).getTime();
    let deviceSession = event.device_id
      ? lastSessionByDevice[event.device_id]
      : undefined;
    let profileSession = event.profile_id
      ? lastSessionByProfile[event.profile_id]
      : undefined;
    // Open a new device session when this device has none yet, or its last
    // one has been idle longer than the window.
    if (
      event.device_id &&
      event.device_id !== event.profile_id &&
      (!deviceSession || eventTime > deviceSession.end + thirtyMinutes)
    ) {
      deviceSession = {
        start: eventTime,
        end: eventTime,
        deviceId: event.device_id,
        sessionId: randomUUID(),
        firstEvent: event,
        events: [event],
      };
      lastSessionByDevice[event.device_id] = deviceSession;
      sessionList.push(deviceSession);
    } else if (deviceSession) {
      // Still inside the window: extend the open device session.
      deviceSession.end = eventTime;
      deviceSession.lastEvent = event;
      deviceSession.events.push(event);
    }
    // Same logic, keyed by profile id.
    if (
      event.profile_id &&
      event.device_id !== event.profile_id &&
      (!profileSession || eventTime > profileSession.end + thirtyMinutes)
    ) {
      profileSession = {
        start: eventTime,
        end: eventTime,
        profileId: event.profile_id,
        sessionId: randomUUID(),
        firstEvent: event,
        events: [event],
      };
      lastSessionByProfile[event.profile_id] = profileSession;
      sessionList.push(profileSession);
    } else if (profileSession) {
      profileSession.end = eventTime;
      profileSession.lastEvent = event;
      profileSession.events.push(event);
    }
    // This event bridges two distinct open sessions (one per key): merge them
    // into a single session and replace both entries in the result list.
    // Duplicated events created here are de-duplicated later (uniqBy id).
    if (
      deviceSession &&
      profileSession &&
      deviceSession.sessionId !== profileSession.sessionId
    ) {
      const unifiedSession = {
        ...deviceSession,
        ...profileSession,
        events: [...deviceSession.events, ...profileSession.events],
        start: Math.min(deviceSession.start, profileSession.start),
        end: Math.max(deviceSession.end, profileSession.end),
        sessionId: deviceSession.sessionId,
      };
      lastSessionByDevice[event.device_id] = unifiedSession;
      lastSessionByProfile[event.profile_id] = unifiedSession;
      sessionList = sessionList.filter(
        (session) =>
          session.sessionId !== deviceSession?.sessionId &&
          session.sessionId !== profileSession?.sessionId,
      );
      sessionList.push(unifiedSession);
    }
  }
  return sessionList;
}
/**
 * Maps one Mixpanel export event onto OpenPanel's imported-event shape.
 * Non-internal Mixpanel properties are carried over; `session_id` and
 * `project_id` are left blank to be filled in by later pipeline stages.
 */
function createEventObject(event: IMixpanelEvent): IImportedEvent {
  const props = event.properties;
  const url = parsePath(props.$current_url);

  // Drop the referrer when it is missing, marked as a direct visit, or
  // points back at the same domain as the current page.
  const rawReferrer = props.$initial_referrer;
  const referrer =
    !rawReferrer ||
    rawReferrer === '$direct' ||
    isSameDomain(rawReferrer, props.$current_url)
      ? ''
      : rawReferrer;

  // Mixpanel prefixes anonymous ids with "$device:"; strip that prefix so
  // the profile id lines up with the device id.
  const profileId = props.distinct_id
    ? String(props.distinct_id).replace(/^\$device:/, '')
    : (props.$device_id ?? '');

  return {
    id: randomUUID(),
    name: event.event,
    profile_id: profileId,
    device_id: props.$device_id ?? '',
    created_at: new Date(props.time * 1000).toISOString(), // time is seconds
    properties: {
      ...stripMixpanelProperties(props),
      ...(props.$current_url
        ? {
            __query: url.query,
            __hash: url.hash,
          }
        : {}),
    },
    country: props.country_code ?? '',
    region: props.$region ?? '',
    city: props.$city ?? '',
    os: props.$os ?? '',
    os_version: '',
    browser: props.$browser ?? '',
    browser_version: props.$browser_version
      ? String(props.$browser_version)
      : '',
    referrer,
    referrer_type: props.$search_engine ? 'search' : '',
    referrer_name: props.$search_engine ?? '',
    session_id: '',
    project_id: '',
    path: url.path,
    origin: url.origin,
    longitude: null,
    latitude: null,
    duration: 0,
    // An event without a URL is assumed to come from a server-side SDK.
    device: props.$current_url ? '' : 'server',
    brand: '',
    model: '',
    sdk_name: '',
    sdk_version: '',
  };
}
/**
 * Type guard for raw parsed values: accepts anything that has a string
 * `event` name and a `properties` object with a numeric `time`.
 */
function isMixpanelEvent(event: any): event is IMixpanelEvent {
  if (typeof event !== 'object' || event === null) {
    return false;
  }
  const properties = event?.properties;
  return (
    typeof event?.event === 'string' &&
    typeof properties === 'object' &&
    properties !== null &&
    typeof properties.time === 'number'
  );
}
/**
 * Streams one export file, converts every recognizable Mixpanel event into
 * the imported-event shape, and returns them all. Values that do not look
 * like Mixpanel events are logged and skipped.
 */
async function processFile(file: string): Promise<IImportedEvent[]> {
  const stream = fs.createReadStream(file);
  const collected: IImportedEvent[] = [];

  const handle = (candidate: unknown) => {
    if (isMixpanelEvent(candidate)) {
      collected.push(createEventObject(candidate));
    } else {
      console.log('Not a Mixpanel event', candidate);
    }
  };

  for await (const parsed of parseJsonStream(stream)) {
    if (Array.isArray(parsed)) {
      for (const entry of parsed) {
        handle(entry);
      }
    } else {
      handle(parsed);
    }
  }

  return collected;
}
/**
 * Expands raw events into their final, session-aware form:
 * - each reconstructed session contributes a synthetic `session_start`
 *   (stamped 1s before its first event) and `session_end` (1s after its
 *   last event), both with fresh ids;
 * - the session's own events are re-stamped with the session id and
 *   de-duplicated by event id (merged device/profile sessions can share
 *   the same event instance);
 * - events carrying neither profile_id nor device_id cannot be sessionized
 *   and are appended unchanged.
 */
function processEvents(events: IImportedEvent[]): IImportedEvent[] {
  const sessions = generateSessionEvents(events);
  const processedEvents = sessions.flatMap((session) =>
    [
      // Synthetic session_start — only when the session recorded a first event.
      session.firstEvent && {
        ...session.firstEvent,
        id: randomUUID(),
        created_at: new Date(
          new Date(session.firstEvent.created_at).getTime() - 1000,
        ).toISOString(),
        session_id: session.sessionId,
        name: 'session_start',
      },
      // The session's events, stamped with the session id and de-duplicated.
      ...uniqBy(
        prop('id'),
        session.events.map((event) =>
          assocPath(['session_id'], session.sessionId, event),
        ),
      ),
      // Synthetic session_end — only when a last event was recorded.
      session.lastEvent && {
        ...session.lastEvent,
        id: randomUUID(),
        created_at: new Date(
          new Date(session.lastEvent.created_at).getTime() + 1000,
        ).toISOString(),
        session_id: session.sessionId,
        name: 'session_end',
      },
    ].filter((item): item is IImportedEvent => !!item),
  );
  return [
    ...processedEvents,
    // Fully anonymous events bypass sessionization entirely.
    ...events.filter((event) => {
      return !event.profile_id && !event.device_id;
    }),
  ];
}
/**
 * Gzips and POSTs a batch of events to the import endpoint, authenticated
 * with the project's client id/secret.
 *
 * Retries once after a 1s pause; if the retry also fails, the batch is
 * written to `$TMPDIR/openpanel/` so the data can be replayed manually
 * instead of being lost.
 */
async function sendBatchToAPI(
  batch: IImportedEvent[],
  {
    apiUrl,
    clientId,
    clientSecret,
  }: {
    apiUrl: string;
    clientId: string;
    clientSecret: string;
  },
) {
  async function request() {
    const res = await fetch(`${apiUrl}/import/events`, {
      method: 'POST',
      headers: {
        'Content-Encoding': 'gzip',
        'Content-Type': 'application/json',
        'openpanel-client-id': clientId,
        'openpanel-client-secret': clientSecret,
      },
      body: Buffer.from(zlib.gzipSync(JSON.stringify(batch))),
    });
    if (!res.ok) {
      throw new Error(`Failed to send batch: ${await res.text()}`);
    }
    // Small pause after a successful upload to avoid hammering the API.
    await new Promise((resolve) => setTimeout(resolve, SLEEP_TIME));
  }
  try {
    await request();
  } catch (e) {
    console.log('Error sending batch, retrying...');
    await new Promise((resolve) => setTimeout(resolve, 1000));
    try {
      await request();
    } catch (e) {
      console.log('Error sending batch, skipping...');
      // Bug fix: writeFileSync does not create intermediate directories, so
      // writing "openpanel/..." under tmpdir threw ENOENT when the folder
      // did not exist — losing exactly the batch we were trying to preserve.
      const failedDir = path.join(os.tmpdir(), 'openpanel');
      fs.mkdirSync(failedDir, { recursive: true });
      fs.writeFileSync(
        path.join(
          failedDir,
          `failed-import-batch-${batch[0]?.created_at ? new Date(batch[0]?.created_at).toISOString() : Date.now()}.json`,
        ),
        JSON.stringify(batch, null, 2),
      );
    }
  }
}
/**
 * Runs the full pipeline over every file: read -> sessionize -> batch ->
 * gzip-POST to the API, with a progress bar. Batches are accumulated up to
 * BATCH_SIZE events and uploaded MAX_CONCURRENT_REQUESTS at a time.
 */
async function processFiles({
  files,
  apiUrl,
  clientId,
  clientSecret,
}: {
  files: string[];
  apiUrl: string;
  clientId: string;
  clientSecret: string;
}) {
  const progress = new Progress(
    'Processing (:current/:total) :file [:bar] :percent | :savedEvents saved events | :status',
    {
      total: files.length,
      width: 20,
    },
  );
  let savedEvents = 0;
  let currentBatch: IImportedEvent[] = [];
  let apiBatching: IImportedEvent[][] = [];
  const credentials = { apiUrl, clientId, clientSecret };

  // Upload all pending batches in parallel and reset the queue.
  const flush = async () => {
    if (apiBatching.length === 0) {
      return;
    }
    await Promise.all(
      apiBatching.map((batch) => sendBatchToAPI(batch, credentials)),
    );
    apiBatching = [];
  };

  for (const file of files) {
    progress.tick({
      file,
      savedEvents,
      status: 'reading file',
    });
    const events = await processFile(file);
    progress.render({
      file,
      savedEvents,
      status: 'processing events',
    });
    const processedEvents = processEvents(events);
    for (const event of processedEvents) {
      currentBatch.push(event);
      if (currentBatch.length >= BATCH_SIZE) {
        apiBatching.push(currentBatch);
        savedEvents += currentBatch.length;
        progress.render({ file, savedEvents, status: 'saving events' });
        currentBatch = [];
      }
      if (apiBatching.length >= MAX_CONCURRENT_REQUESTS) {
        await flush();
      }
    }
  }

  // Queue the final partial batch, then flush everything still pending.
  // Bug fix: previously, full batches left in `apiBatching` when the file
  // loop ended (fewer than MAX_CONCURRENT_REQUESTS pending) were never sent
  // — up to 7 x BATCH_SIZE events could be silently dropped.
  if (currentBatch.length > 0) {
    apiBatching.push(currentBatch);
    savedEvents += currentBatch.length;
    currentBatch = [];
  }
  await flush();
  progress.render({ file: 'Complete', savedEvents, status: 'Complete' });
}
/**
 * Public entry point for the import command: validates that there is work to
 * do, runs the pipeline over `files`, and reports the elapsed wall time.
 */
export async function importFiles({
  files,
  apiUrl,
  clientId,
  clientSecret,
}: {
  files: string[];
  apiUrl: string;
  clientId: string;
  clientSecret: string;
}) {
  if (files.length === 0) {
    console.log('No files found');
    return;
  }

  console.log(`Found ${files.length} files to process`);

  const startedAt = Date.now();
  await processFiles({
    files,
    apiUrl,
    clientId,
    clientSecret,
  });
  const elapsedSeconds = (Date.now() - startedAt) / 1000;

  console.log(`\nProcessing completed in ${elapsedSeconds} seconds`);
}

View File

@@ -1,59 +0,0 @@
import path from 'node:path';
import arg from 'arg';
import { glob } from 'glob';
import { importFiles } from './importer';
/**
 * `openpanel import` sub-command: resolves the --glob pattern, optionally
 * slices the sorted file list with --from/--to, and either prints the plan
 * (--dry-run) or hands the files to importFiles.
 */
export default async function importer() {
  const args = arg(
    {
      '--glob': String,
      '--api-url': String,
      '--client-id': String,
      '--client-secret': String,
      '--dry-run': Boolean,
      '--from': Number,
      '--to': Number,
    },
    {
      permissive: true,
    },
  );

  const globPattern = args['--glob'];
  if (!globPattern) {
    throw new Error('Missing --glob argument');
  }
  const clientId = args['--client-id'];
  if (!clientId) {
    throw new Error('Missing --client-id argument');
  }
  const clientSecret = args['--client-secret'];
  if (!clientSecret) {
    throw new Error('Missing --client-secret argument');
  }

  const fileMatcher = path.resolve(process.cwd(), globPattern);
  const allFiles = await glob([fileMatcher], { root: '/' });
  allFiles.sort((a, b) => a.localeCompare(b));

  // --from/--to select a window of the sorted list (useful for resuming).
  const files = allFiles.slice(
    args['--from'] ?? 0,
    args['--to'] ?? Number.MAX_SAFE_INTEGER,
  );

  if (args['--dry-run']) {
    files.forEach((file, index) => {
      console.log(`Would import (index: ${index}): ${file}`);
    });
    return;
  }

  return importFiles({
    files,
    clientId,
    clientSecret,
    apiUrl: args['--api-url'] ?? 'https://api.openpanel.dev',
  });
}

View File

@@ -1,8 +0,0 @@
{
"extends": "@openpanel/tsconfig/base.json",
"compilerOptions": {
"incremental": false,
"outDir": "dist"
},
"exclude": ["dist"]
}

View File

@@ -1,11 +0,0 @@
import { defineConfig } from 'tsup';

// Bundler config for the CLI: builds src/cli.ts as both CJS and ESM,
// minified, with type declarations, into a cleaned output directory.
export default defineConfig({
  entry: ['src/cli.ts'],
  format: ['cjs', 'esm'],
  dts: true, // emit .d.ts files
  splitting: false, // one output file per format
  sourcemap: false,
  clean: true, // wipe the output dir before each build
  minify: true,
});

View File

@@ -3,9 +3,14 @@
"version": "0.0.1",
"type": "module",
"main": "index.ts",
"exports": {
".": "./index.ts",
"./server": "./server/index.ts"
},
"scripts": {
"test": "vitest",
"typecheck": "tsc --noEmit"
"typecheck": "tsc --noEmit",
"gen:referrers": "jiti scripts/get-referrers.ts && biome format --write ./server/referrers/index.ts"
},
"dependencies": {
"@openpanel/constants": "workspace:*",

View File

@@ -0,0 +1,96 @@
import fs from 'node:fs';
import path from 'node:path';
import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// extras
const extraReferrers = {
'zoom.us': { type: 'social', name: 'Zoom' },
'apple.com': { type: 'tech', name: 'Apple' },
'adobe.com': { type: 'tech', name: 'Adobe' },
'figma.com': { type: 'tech', name: 'Figma' },
'wix.com': { type: 'commerce', name: 'Wix' },
'gmail.com': { type: 'email', name: 'Gmail' },
'notion.so': { type: 'tech', name: 'Notion' },
'ebay.com': { type: 'commerce', name: 'eBay' },
'github.com': { type: 'tech', name: 'GitHub' },
'gitlab.com': { type: 'tech', name: 'GitLab' },
'slack.com': { type: 'social', name: 'Slack' },
'etsy.com': { type: 'commerce', name: 'Etsy' },
'bsky.app': { type: 'social', name: 'Bluesky' },
'twitch.tv': { type: 'content', name: 'Twitch' },
'dropbox.com': { type: 'tech', name: 'Dropbox' },
'outlook.com': { type: 'email', name: 'Outlook' },
'medium.com': { type: 'content', name: 'Medium' },
'paypal.com': { type: 'commerce', name: 'PayPal' },
'discord.com': { type: 'social', name: 'Discord' },
'stripe.com': { type: 'commerce', name: 'Stripe' },
'spotify.com': { type: 'content', name: 'Spotify' },
'netflix.com': { type: 'content', name: 'Netflix' },
'whatsapp.com': { type: 'social', name: 'WhatsApp' },
'shopify.com': { type: 'commerce', name: 'Shopify' },
'microsoft.com': { type: 'tech', name: 'Microsoft' },
'alibaba.com': { type: 'commerce', name: 'Alibaba' },
'telegram.org': { type: 'social', name: 'Telegram' },
'substack.com': { type: 'content', name: 'Substack' },
'salesforce.com': { type: 'tech', name: 'Salesforce' },
'instagram.com': { type: 'social', name: 'Instagram' },
'wikipedia.org': { type: 'content', name: 'Wikipedia' },
'mastodon.social': { type: 'social', name: 'Mastodon' },
'office.com': { type: 'tech', name: 'Microsoft Office' },
'squarespace.com': { type: 'commerce', name: 'Squarespace' },
'stackoverflow.com': { type: 'tech', name: 'Stack Overflow' },
'teams.microsoft.com': { type: 'social', name: 'Microsoft Teams' },
};
/**
 * Flattens the snowplow referer database ({ type: { name: { domains } } })
 * into a domain-keyed lookup: { "google.com": { type, name }, ... }.
 */
function transform(data: any) {
  const byDomain: Record<string, unknown> = {};
  for (const [type, names] of Object.entries<any>(data ?? {})) {
    for (const [name, entry] of Object.entries<any>(names ?? {})) {
      for (const domain of entry.domains ?? []) {
        byDomain[domain] = {
          type,
          name,
        };
      }
    }
  }
  return byDomain;
}
/**
 * Downloads the latest snowplow referer database, merges in the hand-picked
 * extras above, and writes the result out as a generated TypeScript module.
 *
 * NOTE(review): this writes to worker/src/referrers/index.ts, while the
 * package.json "gen:referrers" script formats ./server/referrers/index.ts —
 * one of the two paths looks stale; confirm which is correct.
 */
async function main() {
  // Get document, or throw exception on error
  try {
    const data = await fetch(
      'https://s3-eu-west-1.amazonaws.com/snowplow-hosted-assets/third-party/referer-parser/referers-latest.json',
    ).then((res) => res.json());
    fs.writeFileSync(
      path.resolve(__dirname, '../../worker/src/referrers/index.ts'),
      [
        '// This file is generated by the script get-referrers.ts',
        '',
        '// The data is fetch from snowplow-referer-parser https://github.com/snowplow-referer-parser/referer-parser',
        `// The orginal referers.yml is based on Piwik's SearchEngines.php and Socials.php, copyright 2012 Matthieu Aubry and available under the GNU General Public License v3.`,
        '',
        // Local extras override the fetched data on domain collisions.
        `const referrers: Record<string, { type: string, name: string }> = ${JSON.stringify(
          {
            ...transform(data),
            ...extraReferrers,
          },
        )} as const;`,
        'export default referrers;',
      ].join('\n'),
      'utf-8',
    );
  } catch (e) {
    // Best-effort generator script: log the failure and exit quietly.
    console.log(e);
  }
}

main();

View File

@@ -1,3 +1,5 @@
export * from './crypto';
export * from './profileId';
export * from './parser-user-agent';
export * from './parse-referrer';
export * from './id';

View File

@@ -0,0 +1,117 @@
import { describe, expect, it } from 'vitest';
import { getReferrerWithQuery, parseReferrer } from './parse-referrer';
// Tests for hostname-based referrer resolution (parseReferrer).
describe('parseReferrer', () => {
  it('should handle undefined or empty URLs', () => {
    expect(parseReferrer(undefined)).toEqual({
      name: '',
      type: '',
      url: '',
    });
    expect(parseReferrer('')).toEqual({
      name: '',
      type: '',
      url: '',
    });
  });
  it('should parse valid referrer URLs', () => {
    expect(parseReferrer('https://google.com/search?q=test')).toEqual({
      name: 'Google',
      type: 'search',
      url: 'https://google.com/search?q=test',
    });
  });
  // The lookup must succeed with and without a leading "www." prefix.
  it('should handle www prefix in hostnames', () => {
    expect(parseReferrer('https://www.twitter.com/user')).toEqual({
      name: 'Twitter',
      type: 'social',
      url: 'https://www.twitter.com/user',
    });
    expect(parseReferrer('https://twitter.com/user')).toEqual({
      name: 'Twitter',
      type: 'social',
      url: 'https://twitter.com/user',
    });
  });
  it('should handle unknown referrers', () => {
    expect(parseReferrer('https://unknown-site.com')).toEqual({
      name: '',
      type: '',
      url: 'https://unknown-site.com',
    });
  });
  it('should handle invalid URLs', () => {
    expect(parseReferrer('not-a-url')).toEqual({
      name: '',
      type: '',
      url: 'not-a-url',
    });
  });
});

// Tests for query-parameter-based resolution (utm_source / ref / utm_referrer).
describe('getReferrerWithQuery', () => {
  it('should handle undefined or empty query', () => {
    expect(getReferrerWithQuery(undefined)).toBeNull();
    expect(getReferrerWithQuery({})).toBeNull();
  });
  it('should parse utm_source parameter', () => {
    expect(getReferrerWithQuery({ utm_source: 'google' })).toEqual({
      name: 'Google',
      type: 'search',
      url: '',
    });
  });
  it('should parse ref parameter', () => {
    expect(getReferrerWithQuery({ ref: 'facebook' })).toEqual({
      name: 'Facebook',
      type: 'social',
      url: '',
    });
  });
  it('should parse utm_referrer parameter', () => {
    expect(getReferrerWithQuery({ utm_referrer: 'twitter' })).toEqual({
      name: 'Twitter',
      type: 'social',
      url: '',
    });
  });
  it('should handle case-insensitive matching', () => {
    expect(getReferrerWithQuery({ utm_source: 'GoOgLe' })).toEqual({
      name: 'Google',
      type: 'search',
      url: '',
    });
  });
  // Unknown sources are echoed back verbatim with an empty type.
  it('should handle unknown sources', () => {
    expect(getReferrerWithQuery({ utm_source: 'unknown-source' })).toEqual({
      name: 'unknown-source',
      type: '',
      url: '',
    });
  });
  it('should prioritize utm_source over ref and utm_referrer', () => {
    expect(
      getReferrerWithQuery({
        utm_source: 'google',
        ref: 'facebook',
        utm_referrer: 'twitter',
      }),
    ).toEqual({
      name: 'Google',
      type: 'search',
      url: '',
    });
  });
});

View File

@@ -0,0 +1,66 @@
import { stripTrailingSlash } from '../src/string';
import referrers from './referrers';
/**
 * Extracts the hostname from a URL string; returns '' for missing or
 * unparseable input instead of throwing.
 */
function getHostname(url: string | undefined) {
  if (!url) {
    return '';
  }
  try {
    const parsed = new URL(url);
    return parsed.hostname;
  } catch (e) {
    // Not a valid absolute URL.
    return '';
  }
}
/**
 * Resolves a referrer URL against the referrer database. Looks up the exact
 * hostname first, then falls back to the hostname with a leading "www."
 * stripped. Unknown referrers yield empty name/type but keep the URL
 * (trailing slash removed).
 */
export function parseReferrer(url: string | undefined) {
  const hostname = getHostname(url);
  // Bug fix: anchor the www strip. The previous `replace('www.', '')`
  // removed the first "www." found anywhere in the hostname (e.g.
  // "x.www.example.com" -> "x.example.com"), not just the prefix.
  const match =
    referrers[hostname] ?? referrers[hostname.replace(/^www\./, '')];
  return {
    name: match?.name ?? '',
    type: match?.type ?? '',
    url: stripTrailingSlash(url ?? ''),
  };
}
/**
 * Resolves a referrer from tracking query parameters. `utm_source` wins over
 * `ref`, which wins over `utm_referrer` (chained with `??`, so an empty
 * string still short-circuits the chain). Known sources are looked up in the
 * referrer database — directly, with a ".com" suffix, or by display name —
 * and unknown sources are echoed back with an empty type. Returns null when
 * no query or no usable source is present.
 */
export function getReferrerWithQuery(
  query: Record<string, string> | undefined,
) {
  if (!query) {
    return null;
  }

  const raw = query.utm_source ?? query.ref ?? query.utm_referrer ?? '';
  const source = raw.toLowerCase();
  if (!source) {
    return null;
  }

  const match =
    referrers[source] ||
    referrers[`${source}.com`] ||
    Object.values(referrers).find(
      (entry) => entry.name.toLowerCase() === source,
    );

  return match
    ? {
        name: match.name,
        type: match.type,
        url: '',
      }
    : {
        name: source,
        type: '',
        url: '',
      };
}

View File

@@ -68,6 +68,7 @@ const parse = (ua: string): UAParser.IResult => {
return res;
};
export type UserAgentInfo = ReturnType<typeof parseUserAgent>;
export function parseUserAgent(
ua?: string | null,
overrides?: Record<string, unknown>,
@@ -80,13 +81,35 @@ export function parseUserAgent(
}
return {
os: overrides?.__os || res.os.name,
osVersion: overrides?.__osVersion || res.os.version,
browser: overrides?.__browser || res.browser.name,
browserVersion: overrides?.__browserVersion || res.browser.version,
device: overrides?.__device || res.device.type || getDevice(ua),
brand: overrides?.__brand || res.device.vendor,
model: overrides?.__model || res.device.model,
os:
typeof overrides?.__os === 'string' && overrides?.__os
? overrides?.__os
: res.os.name,
osVersion:
typeof overrides?.__osVersion === 'string' && overrides?.__osVersion
? overrides?.__osVersion
: res.os.version,
browser:
typeof overrides?.__browser === 'string' && overrides?.__browser
? overrides?.__browser
: res.browser.name,
browserVersion:
typeof overrides?.__browserVersion === 'string' &&
overrides?.__browserVersion
? overrides?.__browserVersion
: res.browser.version,
device:
typeof overrides?.__device === 'string' && overrides?.__device
? overrides?.__device
: res.device.type || getDevice(ua),
brand:
typeof overrides?.__brand === 'string' && overrides?.__brand
? overrides?.__brand
: res.device.vendor,
model:
typeof overrides?.__model === 'string' && overrides?.__model
? overrides?.__model
: res.device.model,
isServer: false,
} as const;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
# Snowplow Referer Parser
The file index.ts in this directory is generated from Snowplow's referer database: [Snowplow Referer Parser](https://github.com/snowplow-referer-parser/referer-parser).
The original [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml) is based on Piwik's SearchEngines.php and Socials.php, copyright 2012 Matthieu Aubry and available under the GNU General Public License v3.

View File

@@ -0,0 +1,48 @@
import { describe, expect, it } from 'vitest';
import { toDots } from './object';
// Tests for the toDots flattener: nested objects/arrays become dot-joined
// keys with stringified values.
describe('toDots', () => {
  it('should convert an object to a dot object', () => {
    const obj = {
      a: 1,
      b: 2,
      array: ['1', '2', '3'],
      arrayWithObjects: [{ a: 1 }, { b: 2 }, { c: 3 }],
      objectWithArrays: { a: [1, 2, 3] },
      null: null,
      undefined: undefined,
      empty: '',
      jsonString: '{"a": 1, "b": 2}',
    };
    // null / undefined / '' entries are dropped; embedded JSON strings are
    // parsed and flattened like nested objects.
    expect(toDots(obj)).toEqual({
      a: '1',
      b: '2',
      'array.0': '1',
      'array.1': '2',
      'array.2': '3',
      'arrayWithObjects.0.a': '1',
      'arrayWithObjects.1.b': '2',
      'arrayWithObjects.2.c': '3',
      'objectWithArrays.a.0': '1',
      'objectWithArrays.a.1': '2',
      'objectWithArrays.a.2': '3',
      'jsonString.a': '1',
      'jsonString.b': '2',
    });
  });
  it('should handle malformed JSON strings gracefully', () => {
    const obj = {
      validJson: '{"key":"value"}',
      malformedJson: '{"key":"unterminated string',
      startsWithBrace: '{not json at all',
      startsWithBracket: '[also not json',
      regularString: 'normal string',
    };
    // Strings that look like truncated JSON are skipped entirely.
    expect(toDots(obj)).toEqual({
      'validJson.key': 'value',
      regularString: 'normal string',
    });
  });
});

View File

@@ -1,5 +1,18 @@
import { anyPass, assocPath, isEmpty, isNil, reject } from 'ramda';
/**
 * Cheap structural check: does the string look like a complete JSON object
 * or array (matching open/close delimiters)? Does not validate the contents.
 */
function isValidJsonString(value: string): boolean {
  const looksLikeObject = value.startsWith('{') && value.endsWith('}');
  const looksLikeArray = value.startsWith('[') && value.endsWith(']');
  return looksLikeObject || looksLikeArray;
}
/**
 * True when the string starts like a JSON object or array but does not end
 * with the matching closing delimiter — i.e. a truncated JSON payload.
 */
function isMalformedJsonString(value: string): boolean {
  if (value.startsWith('{')) {
    return !value.endsWith('}');
  }
  if (value.startsWith('[')) {
    return !value.endsWith(']');
  }
  return false;
}
export function toDots(
obj: Record<string, unknown>,
path = '',
@@ -19,10 +32,28 @@ export function toDots(
};
}
if (value === undefined || value === null) {
if (value === undefined || value === null || value === '') {
return acc;
}
if (typeof value === 'string' && isMalformedJsonString(value)) {
// Skip it
return acc;
}
// Fix nested json strings - but catch parse errors for malformed JSON
if (typeof value === 'string' && isValidJsonString(value)) {
try {
return {
...acc,
...toDots(JSON.parse(value), `${path}${key}.`),
};
} catch {
// Skip it
return acc;
}
}
const cleanedValue =
typeof value === 'string'
? removeInvalidSurrogates(value).trim()

View File

@@ -0,0 +1,47 @@
CREATE TABLE IF NOT EXISTS events_imports_replicated ON CLUSTER '{cluster}' (
`id` UUID DEFAULT generateUUIDv4(),
`name` LowCardinality(String),
`sdk_name` LowCardinality(String),
`sdk_version` LowCardinality(String),
`device_id` String CODEC(ZSTD(3)),
`profile_id` String CODEC(ZSTD(3)),
`project_id` String CODEC(ZSTD(3)),
`session_id` String CODEC(LZ4),
`path` String CODEC(ZSTD(3)),
`origin` String CODEC(ZSTD(3)),
`referrer` String CODEC(ZSTD(3)),
`referrer_name` String CODEC(ZSTD(3)),
`referrer_type` LowCardinality(String),
`duration` UInt64 CODEC(Delta(4), LZ4),
`properties` Map(String, String) CODEC(ZSTD(3)),
`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3)),
`country` LowCardinality(FixedString(2)),
`city` String,
`region` LowCardinality(String),
`longitude` Nullable(Float32) CODEC(Gorilla, LZ4),
`latitude` Nullable(Float32) CODEC(Gorilla, LZ4),
`os` LowCardinality(String),
`os_version` LowCardinality(String),
`browser` LowCardinality(String),
`browser_version` LowCardinality(String),
`device` LowCardinality(String),
`brand` LowCardinality(String),
`model` LowCardinality(String),
`imported_at` Nullable(DateTime) CODEC(Delta(4), LZ4),
`import_id` String CODEC(ZSTD(3)),
`import_status` LowCardinality(String) DEFAULT 'pending',
`imported_at_meta` DateTime DEFAULT now()
)
ENGINE = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/openpanel/v1/{table}', '{replica}')
PARTITION BY toYYYYMM(imported_at_meta)
ORDER BY (import_id, created_at)
SETTINGS index_granularity = 8192;
---
CREATE TABLE IF NOT EXISTS events_imports ON CLUSTER '{cluster}' AS events_imports_replicated
ENGINE = Distributed('{cluster}', currentDatabase(), events_imports_replicated, cityHash64(import_id));
---
ALTER TABLE events_imports_replicated ON CLUSTER '{cluster}' MODIFY TTL imported_at_meta + INTERVAL 7 DAY;

View File

@@ -0,0 +1,90 @@
import fs from 'node:fs';
import path from 'node:path';
import { TABLE_NAMES } from '../src/clickhouse/client';
import {
createTable,
modifyTTL,
runClickhouseMigrationCommands,
} from '../src/clickhouse/migration';
import { getIsCluster } from './helpers';
/**
 * Migration: creates the `events_imports` staging table — same columns as
 * the main events table plus import-tracking metadata — partitioned by
 * import month and ordered by (import_id, created_at), then attaches a
 * 7-day TTL so staged rows clean themselves up. The generated SQL is also
 * written to a sibling .sql file; pass --dry to only write the file.
 */
export async function up() {
  const isClustered = getIsCluster();
  const sqls: string[] = [
    ...createTable({
      name: 'events_imports',
      columns: [
        // Same columns as events table
        '`id` UUID DEFAULT generateUUIDv4()',
        '`name` LowCardinality(String)',
        '`sdk_name` LowCardinality(String)',
        '`sdk_version` LowCardinality(String)',
        '`device_id` String CODEC(ZSTD(3))',
        '`profile_id` String CODEC(ZSTD(3))',
        '`project_id` String CODEC(ZSTD(3))',
        '`session_id` String CODEC(LZ4)',
        '`path` String CODEC(ZSTD(3))',
        '`origin` String CODEC(ZSTD(3))',
        '`referrer` String CODEC(ZSTD(3))',
        '`referrer_name` String CODEC(ZSTD(3))',
        '`referrer_type` LowCardinality(String)',
        '`duration` UInt64 CODEC(Delta(4), LZ4)',
        '`properties` Map(String, String) CODEC(ZSTD(3))',
        '`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3))',
        '`country` LowCardinality(FixedString(2))',
        '`city` String',
        '`region` LowCardinality(String)',
        '`longitude` Nullable(Float32) CODEC(Gorilla, LZ4)',
        '`latitude` Nullable(Float32) CODEC(Gorilla, LZ4)',
        '`os` LowCardinality(String)',
        '`os_version` LowCardinality(String)',
        '`browser` LowCardinality(String)',
        '`browser_version` LowCardinality(String)',
        '`device` LowCardinality(String)',
        '`brand` LowCardinality(String)',
        '`model` LowCardinality(String)',
        '`imported_at` Nullable(DateTime) CODEC(Delta(4), LZ4)',
        // Additional metadata columns for import tracking
        '`import_id` String CODEC(ZSTD(3))',
        "`import_status` LowCardinality(String) DEFAULT 'pending'",
        '`imported_at_meta` DateTime DEFAULT now()',
      ],
      orderBy: ['import_id', 'created_at'],
      partitionBy: 'toYYYYMM(imported_at_meta)',
      settings: {
        index_granularity: 8192,
      },
      distributionHash: 'cityHash64(import_id)',
      replicatedVersion: '1',
      isClustered,
    }),
  ];
  // Add TTL policy for auto-cleanup after 7 days
  sqls.push(
    modifyTTL({
      tableName: 'events_imports',
      isClustered,
      ttl: 'imported_at_meta + INTERVAL 7 DAY',
    }),
  );
  // Write the normalized statements next to this migration file, separated
  // by "---" markers, so the SQL can be reviewed and versioned.
  fs.writeFileSync(
    path.join(__filename.replace('.ts', '.sql')),
    sqls
      .map((sql) =>
        sql
          .trim()
          .replace(/;$/, '')
          .replace(/\n{2,}/g, '\n')
          .concat(';'),
      )
      .join('\n\n---\n\n'),
  );
  if (!process.argv.includes('--dry')) {
    await runClickhouseMigrationCommands(sqls);
  }
}

View File

@@ -1,5 +1,6 @@
export * from './src/prisma-client';
export * from './src/clickhouse/client';
export * from './src/clickhouse/csv';
export * from './src/sql-builder';
export * from './src/services/chart.service';
export * from './src/services/clients.service';
@@ -23,5 +24,6 @@ export * from './src/services/access.service';
export * from './src/buffers';
export * from './src/types';
export * from './src/clickhouse/query-builder';
export * from './src/services/import.service';
export * from './src/services/overview.service';
export * from './src/session-context';

View File

@@ -13,7 +13,7 @@
"with-env": "dotenv -e ../../.env -c --"
},
"dependencies": {
"@clickhouse/client": "^1.2.0",
"@clickhouse/client": "^1.12.1",
"@openpanel/common": "workspace:*",
"@openpanel/constants": "workspace:*",
"@openpanel/json": "workspace:*",

View File

@@ -0,0 +1,22 @@
-- CreateTable
CREATE TABLE "public"."imports" (
"id" UUID NOT NULL DEFAULT gen_random_uuid(),
"projectId" TEXT NOT NULL,
"provider" TEXT NOT NULL,
"sourceType" TEXT NOT NULL,
"sourceLocation" TEXT NOT NULL,
"jobId" TEXT,
"status" TEXT NOT NULL,
"config" JSONB NOT NULL DEFAULT '{}',
"totalEvents" INTEGER NOT NULL DEFAULT 0,
"processedEvents" INTEGER NOT NULL DEFAULT 0,
"errorMessage" TEXT,
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"completedAt" TIMESTAMP(3),
"updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "imports_pkey" PRIMARY KEY ("id")
);
-- AddForeignKey
ALTER TABLE "public"."imports" ADD CONSTRAINT "imports_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "public"."projects"("id") ON DELETE CASCADE ON UPDATE CASCADE;

View File

@@ -0,0 +1,13 @@
/*
Warnings:
- You are about to drop the column `provider` on the `imports` table. All the data in the column will be lost.
- You are about to drop the column `sourceLocation` on the `imports` table. All the data in the column will be lost.
- You are about to drop the column `sourceType` on the `imports` table. All the data in the column will be lost.
*/
-- AlterTable
ALTER TABLE "public"."imports" DROP COLUMN "provider",
DROP COLUMN "sourceLocation",
DROP COLUMN "sourceType",
ALTER COLUMN "config" DROP DEFAULT;

View File

@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "public"."imports" ADD COLUMN "statusMessage" TEXT;

View File

@@ -0,0 +1,3 @@
-- AlterTable
ALTER TABLE "public"."imports" ADD COLUMN "currentBatch" INTEGER NOT NULL DEFAULT 0,
ADD COLUMN "currentStep" TEXT;

View File

@@ -0,0 +1,14 @@
/*
Warnings:
- Changed the type of `status` on the `imports` table. No cast exists, the column would be dropped and recreated, which cannot be done if there is data, since the column is required.
- Made the column `currentStep` on table `imports` required. This step will fail if there are existing NULL values in that column.
*/
-- CreateEnum
CREATE TYPE "public"."ImportStatus" AS ENUM ('pending', 'processing', 'completed', 'failed');
-- AlterTable
ALTER TABLE "public"."imports" DROP COLUMN "status",
ADD COLUMN "status" "public"."ImportStatus" NOT NULL,
ALTER COLUMN "currentStep" SET NOT NULL;

View File

@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "public"."imports" ALTER COLUMN "currentStep" DROP NOT NULL;

View File

@@ -0,0 +1,4 @@
-- AlterTable
ALTER TABLE "public"."imports" ALTER COLUMN "currentBatch" DROP NOT NULL,
ALTER COLUMN "currentBatch" DROP DEFAULT,
ALTER COLUMN "currentBatch" SET DATA TYPE TEXT;

View File

@@ -194,6 +194,7 @@ model Project {
notificationRules NotificationRule[]
notifications Notification[]
imports Import[]
// When deleteAt > now(), the project will be deleted
deleteAt DateTime?
@@ -467,3 +468,31 @@ model ResetPassword {
@@map("reset_password")
}
// Lifecycle of an import job as tracked at the application level.
enum ImportStatus {
  pending
  processing
  completed
  failed
}

// A single data-import run for a project. Progress/status fields are
// updated by the import worker as batches complete.
model Import {
  id            String       @id @default(dbgenerated("gen_random_uuid()")) @db.Uuid
  projectId     String
  project       Project      @relation(fields: [projectId], references: [id], onDelete: Cascade)
  jobId         String? // BullMQ job ID
  status        ImportStatus
  statusMessage String? // Human-readable current step like "Importing events (Feb 2025)", "Generating session IDs"
  errorMessage  String?
  /// [IPrismaImportConfig]
  config          Json
  totalEvents     Int       @default(0)
  processedEvents Int       @default(0)
  currentStep     String? // Machine-readable step id (see UpdateImportStatusOptions in code)
  currentBatch    String? // String date 2020-01-01
  createdAt       DateTime  @default(now())
  completedAt     DateTime?
  updatedAt       DateTime  @default(now()) @updatedAt

  @@map("imports")
}

View File

@@ -1,4 +1,4 @@
import { generateSecureId } from '@openpanel/common/server/id';
import { generateSecureId } from '@openpanel/common/server';
import { type ILogger, createLogger } from '@openpanel/logger';
import { getRedisCache, runEvery } from '@openpanel/redis';

View File

@@ -10,6 +10,18 @@ import {
} from 'vitest';
import { ch } from '../clickhouse/client';
const clickhouseSettings = {
async_insert: 1,
http_headers_progress_interval_ms: '50000',
input_format_parallel_parsing: 1,
max_execution_time: 300,
max_http_get_redirects: '0',
max_insert_block_size: '500000',
send_progress_in_http_headers: 1,
wait_end_of_query: 1,
wait_for_async_insert: 1,
};
// Mock transformEvent to avoid circular dependency with buffers -> services -> buffers
vi.mock('../services/event.service', () => ({
transformEvent: (event: any) => ({
@@ -127,6 +139,7 @@ describe('EventBuffer with real Redis', () => {
duration: 1000,
},
],
clickhouse_settings: clickhouseSettings,
});
const sessionKey = `event_buffer:session:${first.session_id}`;
@@ -171,6 +184,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [first, end],
clickhouse_settings: clickhouseSettings,
});
const sessionKey = `event_buffer:session:${first.session_id}`;
const storedEvents = await redis.lrange(sessionKey, 0, -1);
@@ -502,6 +516,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [end],
clickhouse_settings: clickhouseSettings,
});
const sessionKey = `event_buffer:session:${s}`;
@@ -552,6 +567,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [view1, view2, view3, end],
clickhouse_settings: clickhouseSettings,
});
// Session should be completely empty and removed
@@ -596,6 +612,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [{ ...view1, duration: 1000 }],
clickhouse_settings: clickhouseSettings,
});
// Session should be REMOVED from ready_sessions (only 1 event left)
@@ -620,6 +637,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [{ ...view2, duration: 1000 }],
clickhouse_settings: clickhouseSettings,
});
// Session should be REMOVED again (only 1 event left)
@@ -667,6 +685,7 @@ describe('EventBuffer with real Redis', () => {
format: 'JSONEachRow',
table: 'events',
values: [view, end],
clickhouse_settings: clickhouseSettings,
});
// NOW it should be removed from ready_sessions (because it's empty)

View File

@@ -1,3 +1,4 @@
import { Readable } from 'node:stream';
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
import sqlstring from 'sqlstring';
@@ -23,13 +24,10 @@ type WarnLogParams = LogParams & { err?: Error };
class CustomLogger implements Logger {
trace({ message, args }: LogParams) {
logger.debug(message, args);
logger.info(message, args);
}
debug({ message, args }: LogParams) {
if (message.includes('Query:') && args?.response_status === 200) {
return;
}
logger.debug(message, args);
logger.info(message, args);
}
info({ message, args }: LogParams) {
logger.info(message, args);
@@ -56,14 +54,15 @@ export const TABLE_NAMES = {
event_property_values_mv: 'event_property_values_mv',
cohort_events_mv: 'cohort_events_mv',
sessions: 'sessions',
events_imports: 'events_imports',
};
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
max_open_connections: 30,
request_timeout: 60000,
request_timeout: 300000,
keep_alive: {
enabled: true,
idle_socket_ttl: 8000,
idle_socket_ttl: 60000,
},
compression: {
request: true,
@@ -87,7 +86,7 @@ const cleanQuery = (query?: string) =>
? query.replace(/\n/g, '').replace(/\s+/g, ' ').trim()
: undefined;
async function withRetry<T>(
export async function withRetry<T>(
operation: () => Promise<T>,
maxRetries = 3,
baseDelay = 500,
@@ -132,7 +131,34 @@ export const ch = new Proxy(originalCh, {
const value = Reflect.get(target, property, receiver);
if (property === 'insert') {
return (...args: any[]) => withRetry(() => value.apply(target, args));
return (...args: any[]) =>
withRetry(() => {
args[0].clickhouse_settings = {
// Allow bigger HTTP payloads/time to stream rows
async_insert: 1,
wait_for_async_insert: 1,
// Increase insert timeouts and buffer sizes for large batches
max_execution_time: 300,
max_insert_block_size: '500000',
max_http_get_redirects: '0',
// Ensure JSONEachRow stays efficient
input_format_parallel_parsing: 1,
// Keep long-running inserts/queries from idling out at proxies by sending progress headers
send_progress_in_http_headers: 1,
http_headers_progress_interval_ms: '50000',
// Ensure server holds the connection until the query is finished
wait_end_of_query: 1,
...args[0].clickhouse_settings,
};
return value.apply(target, args);
});
}
if (property === 'command') {
return (...args: any[]) =>
withRetry(() => {
return value.apply(target, args);
});
}
return value;
@@ -177,6 +203,34 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
return response;
}
/**
 * Insert pre-escaped CSV rows into a ClickHouse table.
 *
 * Rows must already be valid CSV lines in the target table's column order;
 * they are joined with newlines and streamed in binary mode (similar to
 * EventBuffer) rather than buffered as one large string.
 *
 * @param tableName - Destination table.
 * @param rows - CSV-formatted lines, one per event.
 * @throws Re-throws any insert error after logging it.
 */
export async function chInsertCSV(tableName: string, rows: string[]) {
  try {
    const startedAt = performance.now();
    // Binary (non-object) mode stream so the client sends raw CSV bytes.
    const payload = Readable.from(rows.join('\n'), {
      objectMode: false,
    });
    await ch.insert({
      table: tableName,
      format: 'CSV',
      values: payload,
      clickhouse_settings: {
        // Our escaper uses double-quoted fields; single quotes are Map syntax.
        format_csv_allow_double_quotes: 1,
        format_csv_allow_single_quotes: 0,
      },
    });
    logger.info('CSV Insert successful', {
      elapsed: performance.now() - startedAt,
      rows: rows.length,
    });
  } catch (error) {
    logger.error('CSV Insert failed:', error);
    throw error;
  }
}
export async function chQuery<T extends Record<string, any>>(
query: string,
clickhouseSettings?: ClickHouseSettings,

View File

@@ -0,0 +1,53 @@
// ClickHouse's CSV representation of Map(String, String) uses single-quoted
// keys and values: {'key1':'value1','key2':'value2'}. Single quotes inside
// values must therefore be backslash-escaped, and control characters are
// escaped too so they cannot break CSV row parsing.
const escapeMapValue = (str: string) => {
  const rules: Array<[RegExp, string]> = [
    [/\\/g, '\\\\'], // backslashes first so later escapes are not doubled
    [/'/g, "\\'"],
    [/\n/g, '\\n'],
    [/\r/g, '\\r'],
    [/\t/g, '\\t'],
    [/\0/g, '\\0'],
  ];
  return rules.reduce((acc, [pattern, repl]) => acc.replace(pattern, repl), str);
};

/**
 * Serialize an object as a ClickHouse Map(String, String) literal suitable
 * for a CSV field. null/undefined becomes an empty field; an empty object
 * becomes the empty Map literal `{}`.
 */
export const csvEscapeJson = (
  value: Record<string, unknown> | null | undefined,
): string => {
  if (value == null) return '';

  const entries = Object.entries(value);
  // Empty object → empty Map literal (no CSV quoting needed for "{}").
  if (entries.length === 0) return '{}';

  const body = entries
    .map(([key, raw]) => {
      // Column is Map(String, String): coerce every value to a string,
      // with null/undefined normalized to ''.
      const str = raw == null ? '' : String(raw);
      return `'${escapeMapValue(String(key))}':'${escapeMapValue(str)}'`;
    })
    .join(',');

  // Delegate CSV-level quoting of the whole field to csvEscapeField so it
  // can wrap/escape the literal if it contains commas, quotes or newlines.
  return csvEscapeField(`{${body}}`);
};

/**
 * Escape a single CSV field per the CSV standard: wrap in double quotes when
 * it contains commas, quotes or newlines, doubling embedded double quotes.
 */
export const csvEscapeField = (value: string | number): string => {
  const str = String(value);
  if (!/[,"\n\r]/.test(str)) {
    return str;
  }
  return `"${str.replace(/"/g, '""')}"`;
};

View File

@@ -115,6 +115,22 @@ ENGINE = Distributed('{cluster}', currentDatabase(), ${replicated(tableName)}, $
];
}
/**
 * Build an ALTER TABLE … MODIFY TTL statement for a table.
 * On clustered deployments the statement targets the replicated table and
 * runs ON CLUSTER so every node is updated.
 */
export const modifyTTL = ({
  tableName,
  isClustered,
  ttl,
}: {
  tableName: string;
  isClustered: boolean;
  ttl: string;
}) => {
  const target = isClustered
    ? `${replicated(tableName)} ON CLUSTER '{cluster}'`
    : tableName;
  return `ALTER TABLE ${target} MODIFY TTL ${ttl}`;
};
/**
* Generates ALTER TABLE statements for adding columns
*/

View File

@@ -141,6 +141,10 @@ export type IServiceCreateEventPayload = Omit<
IServiceEvent,
'id' | 'importedAt' | 'profile' | 'meta'
>;
export type IServiceImportedEventPayload = Omit<
IServiceEvent,
'profile' | 'meta'
>;
export interface IServiceEvent {
id: string;

View File

@@ -0,0 +1,784 @@
import type { ILogger } from '@openpanel/logger';
import sqlstring from 'sqlstring';
import {
TABLE_NAMES,
ch,
chInsertCSV,
convertClickhouseDateToJs,
formatClickhouseDate,
} from '../clickhouse/client';
import { csvEscapeField, csvEscapeJson } from '../clickhouse/csv';
import { type Prisma, db } from '../prisma-client';
import type { IClickhouseEvent } from './event.service';
// Result of staging one batch of events into the imports table.
export interface ImportStageResult {
  importId: string;
  // Number of events in the staged batch.
  totalEvents: number;
  // Rows actually written; equals totalEvents when the insert succeeds.
  insertedEvents: number;
}

// Aggregated progress of an import, derived from the staging table rows.
export interface ImportProgress {
  importId: string;
  totalEvents: number;
  insertedEvents: number;
  // Staging-level status: 'processed' (not 'completed') marks finished rows.
  status: 'pending' | 'processing' | 'processed' | 'failed';
}
/**
 * Insert a batch of events into the imports staging table.
 *
 * Events are serialized to CSV rows whose field order MUST match the column
 * order of the `events_imports` table — do not reorder the `fields` array
 * without also changing the table definition.
 *
 * @param events - Events already transformed to ClickHouse shape.
 * @param importId - Import this batch belongs to (written to every row).
 * @returns Counts of staged rows (zero for an empty batch; no insert is made).
 */
export async function insertImportBatch(
  events: IClickhouseEvent[],
  importId: string,
): Promise<ImportStageResult> {
  if (events.length === 0) {
    return { importId, totalEvents: 0, insertedEvents: 0 };
  }

  // Important to have same order as events_imports table
  // CSV format: properly quotes fields that need it
  const csvRows = events.map((event) => {
    // Properties need to be converted to JSON for Map(String, String)
    // All fields must be CSV-escaped when joining with commas
    const fields = [
      csvEscapeField(event.id || ''),
      csvEscapeField(event.name),
      csvEscapeField(event.sdk_name || ''),
      csvEscapeField(event.sdk_version || ''),
      csvEscapeField(event.device_id || ''),
      csvEscapeField(event.profile_id || ''),
      csvEscapeField(event.project_id || ''),
      csvEscapeField(event.session_id || ''),
      csvEscapeField(event.path),
      csvEscapeField(event.origin || ''),
      csvEscapeField(event.referrer || ''),
      csvEscapeField(event.referrer_name || ''),
      csvEscapeField(event.referrer_type || ''),
      csvEscapeField(event.duration ?? 0),
      csvEscapeJson(event.properties),
      csvEscapeField(event.created_at),
      csvEscapeField(event.country || ''),
      csvEscapeField(event.city || ''),
      csvEscapeField(event.region || ''),
      // \N is ClickHouse's CSV marker for NULL in Nullable columns.
      csvEscapeField(event.longitude != null ? event.longitude : '\\N'),
      csvEscapeField(event.latitude != null ? event.latitude : '\\N'),
      csvEscapeField(event.os || ''),
      csvEscapeField(event.os_version || ''),
      csvEscapeField(event.browser || ''),
      csvEscapeField(event.browser_version || ''),
      csvEscapeField(event.device || ''),
      csvEscapeField(event.brand || ''),
      csvEscapeField(event.model || ''),
      csvEscapeField('\\N'), // imported_at (Nullable)
      csvEscapeField(importId),
      csvEscapeField('pending'), // import_status
      csvEscapeField(formatClickhouseDate(new Date())), // imported_at_meta (DateTime, not DateTime64, so no milliseconds)
    ];
    return fields.join(',');
  });

  await chInsertCSV(TABLE_NAMES.events_imports, csvRows);

  return {
    importId,
    totalEvents: events.length,
    insertedEvents: events.length,
  };
}
/**
 * Generate deterministic session IDs for events that don't have them.
 * Uses 30-minute time windows to create consistent session IDs across imports.
 * Only processes events where device != 'server' and session_id = ''.
 *
 * @param importId - Import whose pending rows are updated.
 * @param from - Day (YYYY-MM-DD) to restrict the mutation to; an empty
 *   string processes all days of the import in one mutation.
 */
export async function generateSessionIds(
  importId: string,
  from: string,
): Promise<void> {
  const rangeWhere = [
    'import_id = {importId:String}',
    "import_status = 'pending'",
    "device != 'server'",
    "session_id = ''",
    from ? 'toDate(created_at) = {from:String}' : '',
  ]
    .filter(Boolean)
    .join(' AND ');

  // Use SQL to generate deterministic session IDs based on device_id + 30-min time windows
  // This ensures same events always get same session IDs regardless of import order
  const updateQuery = `
    ALTER TABLE ${TABLE_NAMES.events_imports}
    UPDATE session_id = lower(hex(MD5(concat(
      device_id,
      '-',
      toString(toInt64(toUnixTimestamp(created_at) / 1800))
    ))))
    WHERE ${rangeWhere}
  `;

  await ch.command({
    query: updateQuery,
    query_params: { importId, from },
    clickhouse_settings: {
      wait_end_of_query: 1,
      mutations_sync: '2', // Wait for mutation to complete on all replicas (critical!)
      // Progress headers keep proxies from idling out long mutations.
      send_progress_in_http_headers: 1,
      http_headers_progress_interval_ms: '50000',
    },
  });
}
/**
 * Reconstruct sessions using SQL-based logic.
 * This identifies session boundaries and creates session_start/session_end events.
 * session_start inherits all properties from the first event in the session.
 * session_end inherits all properties from the last event in the session and calculates duration.
 *
 * @param importId - Import whose pending rows are processed.
 * @param from - Day (YYYY-MM-DD) whose sessions are reconstructed.
 */
export async function createSessionsStartEndEvents(
  importId: string,
  from: string,
): Promise<void> {
  // First, let's identify session boundaries and get first/last events for each session
  const rangeWhere = [
    'import_id = {importId:String}',
    "import_status = 'pending'",
    "session_id != ''", // Only process events that have session IDs
    'toDate(created_at) = {from:String}',
  ]
    .filter(Boolean)
    .join(' AND ');

  // Use window functions to efficiently get first event (all fields) and last event (only changing fields)
  // session_end only needs: properties, path, origin, created_at - the rest can be inherited from session_start
  const sessionEventsQuery = `
    SELECT
      device_id,
      session_id,
      project_id,
      profile_id,
      argMin((path, origin, referrer, referrer_name, referrer_type, properties, created_at, country, city, region, longitude, latitude, os, os_version, browser, browser_version, device, brand, model), created_at) AS first_event,
      argMax((path, origin, properties, created_at), created_at) AS last_event_fields,
      min(created_at) AS first_timestamp,
      max(created_at) AS last_timestamp
    FROM ${TABLE_NAMES.events_imports}
    WHERE ${rangeWhere}
      AND name NOT IN ('session_start', 'session_end')
    GROUP BY session_id, device_id, project_id, profile_id
  `;

  const sessionEventsResult = await ch.query({
    query: sessionEventsQuery,
    query_params: { importId, from },
    format: 'JSONEachRow',
  });

  // Tuple element order below must mirror the argMin/argMax tuples in the
  // query above exactly — keep the commented-out entries as documentation of
  // fields that were intentionally excluded.
  const sessionData = (await sessionEventsResult.json()) as Array<{
    device_id: string;
    session_id: string;
    project_id: string;
    profile_id: string;
    first_event: [
      // string, // id
      // string, // name
      string, // path
      string, // origin
      string, // referrer
      string, // referrer_name
      string, // referrer_type
      // number, // duration
      Record<string, unknown>, // properties
      string, // created_at
      string, // country
      string, // city
      string, // region
      number | null, // longitude
      number | null, // latitude
      string, // os
      string, // os_version
      string, // browser
      string, // browser_version
      string, // device
      string, // brand
      string, // model
      // string, // sdk_name
      // string, // sdk_version
      // string, // imported_at
    ];
    last_event_fields: [
      string, // path
      string, // origin
      Record<string, unknown>, // properties
      string, // created_at
    ];
    first_timestamp: string;
    last_timestamp: string;
  }>;

  // Create session_start and session_end events
  const sessionEvents: IClickhouseEvent[] = [];

  for (const session of sessionData) {
    // Destructure first event tuple (all fields)
    const [
      // firstId,
      // firstName,
      firstPath,
      firstOrigin,
      firstReferrer,
      firstReferrerName,
      firstReferrerType,
      // firstDuration,
      firstProperties,
      firstCreatedAt,
      firstCountry,
      firstCity,
      firstRegion,
      firstLongitude,
      firstLatitude,
      firstOs,
      firstOsVersion,
      firstBrowser,
      firstBrowserVersion,
      firstDevice,
      firstBrand,
      firstModel,
      // firstSdkName,
      // firstSdkVersion,
      // firstImportedAt,
    ] = session.first_event;

    // Destructure last event fields (only the changing ones)
    const [lastPath, lastOrigin, lastProperties, lastCreatedAt] =
      session.last_event_fields;

    // Calculate duration in milliseconds
    // Parse timestamps as Date objects to calculate duration
    const firstTime = new Date(session.first_timestamp).getTime();
    const lastTime = new Date(session.last_timestamp).getTime();
    const durationMs = lastTime - firstTime;

    // Helper function to adjust timestamp by milliseconds without timezone conversion
    const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
      // Parse the timestamp, adjust it, and format back to ClickHouse format
      const date = convertClickhouseDateToJs(timestamp);
      date.setTime(date.getTime() + offsetMs);
      return formatClickhouseDate(date);
    };

    // Create session_start event - inherit everything from first event but change name
    // Set created_at to 1 second before the first event
    sessionEvents.push({
      id: crypto.randomUUID(),
      name: 'session_start',
      device_id: session.device_id,
      profile_id: session.profile_id,
      project_id: session.project_id,
      session_id: session.session_id,
      path: firstPath,
      origin: firstOrigin,
      referrer: firstReferrer,
      referrer_name: firstReferrerName,
      referrer_type: firstReferrerType,
      duration: 0, // session_start always has 0 duration
      properties: firstProperties as Record<
        string,
        string | number | boolean | null | undefined
      >,
      created_at: adjustTimestamp(session.first_timestamp, -1000), // 1 second before first event
      country: firstCountry,
      city: firstCity,
      region: firstRegion,
      longitude: firstLongitude,
      latitude: firstLatitude,
      os: firstOs,
      os_version: firstOsVersion,
      browser: firstBrowser,
      browser_version: firstBrowserVersion,
      device: firstDevice,
      brand: firstBrand,
      model: firstModel,
      imported_at: new Date().toISOString(),
      sdk_name: 'import-session-reconstruction',
      sdk_version: '1.0.0',
    });

    // Create session_end event - inherit most from session_start, but use last event's path, origin, properties
    // Set created_at slightly after the last event (500ms — see NOTE below)
    sessionEvents.push({
      id: crypto.randomUUID(),
      name: 'session_end',
      device_id: session.device_id,
      profile_id: session.profile_id,
      project_id: session.project_id,
      session_id: session.session_id,
      path: lastPath, // From last event
      origin: lastOrigin, // From last event
      referrer: firstReferrer, // Same as session_start
      referrer_name: firstReferrerName, // Same as session_start
      referrer_type: firstReferrerType, // Same as session_start
      duration: durationMs,
      properties: lastProperties as Record<
        string,
        string | number | boolean | null | undefined
      >, // From last event
      created_at: adjustTimestamp(session.last_timestamp, 500), // NOTE(review): original comment said "1 second after" but offset is 500ms — confirm which is intended
      country: firstCountry, // Same as session_start
      city: firstCity, // Same as session_start
      region: firstRegion, // Same as session_start
      longitude: firstLongitude, // Same as session_start
      latitude: firstLatitude, // Same as session_start
      os: firstOs, // Same as session_start
      os_version: firstOsVersion, // Same as session_start
      browser: firstBrowser, // Same as session_start
      browser_version: firstBrowserVersion, // Same as session_start
      device: firstDevice, // Same as session_start
      brand: firstBrand, // Same as session_start
      model: firstModel, // Same as session_start
      imported_at: new Date().toISOString(),
      sdk_name: 'import-session-reconstruction',
      sdk_version: '1.0.0',
    });
  }

  // Insert session events into imports table
  if (sessionEvents.length > 0) {
    await insertImportBatch(sessionEvents, importId);
  }
}
/**
 * Migrate all events from imports table to production events table.
 * This includes both original events and generated session events.
 *
 * The INSERT/SELECT column lists below must stay in lock-step with each
 * other; they enumerate the shared columns of events_imports and events.
 *
 * @param importId - Import whose rows are migrated.
 * @param from - Optional day (YYYY-MM-DD) to migrate; empty migrates all.
 */
export async function moveImportsToProduction(
  importId: string,
  from: string,
): Promise<void> {
  // Build the WHERE clause for migration
  // For session events (session_start/session_end), we don't filter by their created_at
  // because they're created with adjusted timestamps (±1 second) that might fall outside
  // the date range. Instead, we include them if their session_id has events in this range.
  let whereClause = 'import_id = {importId:String}';

  if (from) {
    whereClause += ` AND (
      (toDate(created_at) = {from:String}) OR
      (
        name IN ('session_start', 'session_end') AND
        session_id IN (
          SELECT DISTINCT session_id
          FROM ${TABLE_NAMES.events_imports}
          WHERE import_id = {importId:String}
            AND toDate(created_at) = {from:String}
            AND name NOT IN ('session_start', 'session_end')
        )
      )
    )`;
  }

  const migrationQuery = `
    INSERT INTO ${TABLE_NAMES.events} (
      id,
      name,
      sdk_name,
      sdk_version,
      device_id,
      profile_id,
      project_id,
      session_id,
      path,
      origin,
      referrer,
      referrer_name,
      referrer_type,
      duration,
      properties,
      created_at,
      country,
      city,
      region,
      longitude,
      latitude,
      os,
      os_version,
      browser,
      browser_version,
      device,
      brand,
      model,
      imported_at
    )
    SELECT
      id,
      name,
      sdk_name,
      sdk_version,
      device_id,
      profile_id,
      project_id,
      session_id,
      path,
      origin,
      referrer,
      referrer_name,
      referrer_type,
      duration,
      properties,
      created_at,
      country,
      city,
      region,
      longitude,
      latitude,
      os,
      os_version,
      browser,
      browser_version,
      device,
      brand,
      model,
      imported_at
    FROM ${TABLE_NAMES.events_imports}
    WHERE ${whereClause}
    ORDER BY created_at ASC
  `;

  await ch.command({
    query: migrationQuery,
    query_params: { importId, from },
    clickhouse_settings: {
      wait_end_of_query: 1,
      // Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
      send_progress_in_http_headers: 1,
      // Interval between progress headers; kept under typical 60s proxy idle timeouts.
      http_headers_progress_interval_ms: '50000',
    },
  });
}
/**
 * Backfill the sessions table for one day of an import.
 *
 * After migrating events, populates the sessions table by aggregating the
 * staged import events per session_id: entry/exit pages come from the
 * session_start / last screen_view|session_end events, counts and duration
 * from the events in between.
 *
 * NOTE(review): this query interpolates values with sqlstring.escape while
 * the sibling functions use ClickHouse query_params — consider unifying.
 *
 * @param importId - Import whose sessions are aggregated.
 * @param from - Day (YYYY-MM-DD) to aggregate.
 */
export async function backfillSessionsToProduction(
  importId: string,
  from: string,
): Promise<void> {
  // After migrating events, populate the sessions table based on the migrated sessions
  // We detect all session_ids involved in this import from the imports table,
  // then aggregate over the production events to construct session rows.
  const sessionsInsertQuery = `
    INSERT INTO ${TABLE_NAMES.sessions} (
      id,
      project_id,
      profile_id,
      device_id,
      created_at,
      ended_at,
      is_bounce,
      entry_origin,
      entry_path,
      exit_origin,
      exit_path,
      screen_view_count,
      revenue,
      event_count,
      duration,
      country,
      region,
      city,
      longitude,
      latitude,
      device,
      brand,
      model,
      browser,
      browser_version,
      os,
      os_version,
      sign,
      version,
      properties,
      utm_medium,
      utm_source,
      utm_campaign,
      utm_content,
      utm_term,
      referrer,
      referrer_name,
      referrer_type
    )
    SELECT
      any(e.session_id) as id,
      any(e.project_id) as project_id,
      if(any(nullIf(e.profile_id, e.device_id)) IS NULL, any(e.profile_id), any(nullIf(e.profile_id, e.device_id))) as profile_id,
      any(e.device_id) as device_id,
      argMin(e.created_at, e.created_at) as created_at,
      argMax(e.created_at, e.created_at) as ended_at,
      if(
        argMaxIf(e.properties['__bounce'], e.created_at, e.name = 'session_end') = '',
        if(countIf(e.name = 'screen_view') > 1, true, false),
        argMaxIf(e.properties['__bounce'], e.created_at, e.name = 'session_end') = 'true'
      ) as is_bounce,
      argMinIf(e.origin, e.created_at, e.name = 'session_start') as entry_origin,
      argMinIf(e.path, e.created_at, e.name = 'session_start') as entry_path,
      argMaxIf(e.origin, e.created_at, e.name = 'session_end' OR e.name = 'screen_view') as exit_origin,
      argMaxIf(e.path, e.created_at, e.name = 'session_end' OR e.name = 'screen_view') as exit_path,
      countIf(e.name = 'screen_view') as screen_view_count,
      0 as revenue,
      countIf(e.name != 'screen_view' AND e.name != 'session_start' AND e.name != 'session_end') as event_count,
      sumIf(e.duration, name = 'session_end') AS duration,
      argMinIf(e.country, e.created_at, e.name = 'session_start') as country,
      argMinIf(e.region, e.created_at, e.name = 'session_start') as region,
      argMinIf(e.city, e.created_at, e.name = 'session_start') as city,
      argMinIf(e.longitude, e.created_at, e.name = 'session_start') as longitude,
      argMinIf(e.latitude, e.created_at, e.name = 'session_start') as latitude,
      argMinIf(e.device, e.created_at, e.name = 'session_start') as device,
      argMinIf(e.brand, e.created_at, e.name = 'session_start') as brand,
      argMinIf(e.model, e.created_at, e.name = 'session_start') as model,
      argMinIf(e.browser, e.created_at, e.name = 'session_start') as browser,
      argMinIf(e.browser_version, e.created_at, e.name = 'session_start') as browser_version,
      argMinIf(e.os, e.created_at, e.name = 'session_start') as os,
      argMinIf(e.os_version, e.created_at, e.name = 'session_start') as os_version,
      1 as sign,
      1 as version,
      argMinIf(e.properties, e.created_at, e.name = 'session_start') as properties,
      argMinIf(e.properties['__query.utm_medium'], e.created_at, e.name = 'session_start') as utm_medium,
      argMinIf(e.properties['__query.utm_source'], e.created_at, e.name = 'session_start') as utm_source,
      argMinIf(e.properties['__query.utm_campaign'], e.created_at, e.name = 'session_start') as utm_campaign,
      argMinIf(e.properties['__query.utm_content'], e.created_at, e.name = 'session_start') as utm_content,
      argMinIf(e.properties['__query.utm_term'], e.created_at, e.name = 'session_start') as utm_term,
      argMinIf(e.referrer, e.created_at, e.name = 'session_start') as referrer,
      argMinIf(e.referrer_name, e.created_at, e.name = 'session_start') as referrer_name,
      argMinIf(e.referrer_type, e.created_at, e.name = 'session_start') as referrer_type
    FROM ${TABLE_NAMES.events_imports} e
    WHERE
      e.import_id = ${sqlstring.escape(importId)}
      AND toDate(e.created_at) = ${sqlstring.escape(from)}
      AND e.session_id != ''
    GROUP BY e.session_id
  `;

  await ch.command({
    query: sessionsInsertQuery,
    clickhouse_settings: {
      wait_end_of_query: 1,
      // Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
      send_progress_in_http_headers: 1,
      // Interval between progress headers; kept under typical 60s proxy idle timeouts.
      http_headers_progress_interval_ms: '50000',
    },
  });
}
/**
 * Mark every staged row of an import as processed.
 * Issues a synchronous ClickHouse mutation over the events_imports table.
 *
 * @param importId - Import whose rows are flagged.
 */
export async function markImportComplete(importId: string): Promise<void> {
  await ch.command({
    query: `
    ALTER TABLE ${TABLE_NAMES.events_imports}
    UPDATE import_status = 'processed'
    WHERE import_id = {importId:String}
  `,
    query_params: { importId },
    clickhouse_settings: {
      wait_end_of_query: 1,
      // Wait for the mutation to complete on all replicas.
      mutations_sync: '2',
      // Progress headers keep intermediaries from idling out the connection;
      // the interval stays under typical 60s proxy timeouts.
      send_progress_in_http_headers: 1,
      http_headers_progress_interval_ms: '50000',
    },
  });
}
/**
 * Get import progress and status for an import.
 *
 * Aggregates the staging table, ignoring the synthetic
 * session_start/session_end events so progress reflects source events only.
 *
 * @param importId - Import to inspect.
 * @returns Progress counts; a pending zero-progress result when no rows exist.
 */
export async function getImportProgress(
  importId: string,
): Promise<ImportProgress> {
  const progressQuery = `
    SELECT
      import_id,
      COUNT(*) as total_events,
      COUNTIf(import_status = 'pending') as pending_events,
      COUNTIf(import_status = 'processed') as processed_events,
      any(import_status) as status
    FROM ${TABLE_NAMES.events_imports}
    WHERE import_id = {importId:String}
      AND name NOT IN ('session_start', 'session_end')
    GROUP BY import_id
  `;

  const result = await ch.query({
    query: progressQuery,
    query_params: { importId },
    format: 'JSONEachRow',
  });

  const data = (await result.json()) as Array<{
    import_id: string;
    total_events: number;
    pending_events: number;
    processed_events: number;
    status: string;
  }>;

  // Single guard replaces the previous duplicated length/undefined checks:
  // no matching rows means the import has not staged anything yet.
  const row = data[0];
  if (!row) {
    return {
      importId,
      totalEvents: 0,
      insertedEvents: 0,
      status: 'pending',
    };
  }

  return {
    importId,
    totalEvents: row.total_events,
    insertedEvents: row.processed_events,
    status: row.status as 'pending' | 'processing' | 'processed' | 'failed',
  };
}
/**
 * Utility: get min/max created_at for an import.
 *
 * @param importId - Import to inspect.
 * @param fromCreatedAt - Optional lower bound applied to the query.
 * @returns min/max timestamps, or nulls when the query returns no rows.
 */
export async function getImportDateBounds(
  importId: string,
  fromCreatedAt?: string,
): Promise<{ min: string | null; max: string | null }> {
  const res = await ch.query({
    query: `
      SELECT min(created_at) AS min, max(created_at) AS max
      FROM ${TABLE_NAMES.events_imports}
      WHERE import_id = {importId:String}
      ${fromCreatedAt ? 'AND created_at >= {fromCreatedAt:String}' : ''}
    `,
    query_params: { importId, fromCreatedAt },
    format: 'JSONEachRow',
  });
  const rows = (await res.json()) as Array<{
    min: string | null;
    max: string | null;
  }>;
  return rows.length > 0
    ? {
        // NOTE(review): when fromCreatedAt is given it overrides the queried
        // min even if the earliest matching row is later — confirm intended.
        min: fromCreatedAt ?? rows[0]?.min ?? null,
        max: rows[0]?.max ?? null,
      }
    : { min: null, max: null };
}
/**
 * Unified method to update all import status information.
 * Combines step, batch, progress, and status message updates.
 *
 * Discriminated on `step`; each variant carries only the fields that step
 * may update ('loading' is the only one reporting event counts; 'failed'
 * the only one carrying an error message).
 */
export type UpdateImportStatusOptions =
  | {
      step: 'loading';
      batch?: string; // Day being loaded, e.g. "2020-01-01"
      totalEvents?: number;
      processedEvents?: number;
    }
  | {
      step: 'generating_session_ids';
      batch?: string;
    }
  | {
      step: 'creating_sessions';
      batch?: string;
    }
  | {
      step: 'moving';
      batch?: string;
    }
  | {
      step: 'backfilling_sessions';
      batch?: string;
    }
  | {
      step: 'completed';
    }
  | {
      step: 'failed';
      errorMessage?: string;
    };

// Union of all step identifiers, stored in Import.currentStep.
export type ImportSteps = UpdateImportStatusOptions['step'];
/**
 * Persist an import's current step/progress to the BullMQ job and the
 * Import row in Postgres, deriving a human-readable statusMessage.
 *
 * @param jobLogger - Logger scoped to the running job.
 * @param job - Object exposing BullMQ's updateProgress.
 * @param importId - Import row to update.
 * @param options - Step-specific update payload (see UpdateImportStatusOptions).
 */
export async function updateImportStatus(
  jobLogger: ILogger,
  job: {
    updateProgress: (progress: Record<string, any>) => void;
  },
  importId: string,
  options: UpdateImportStatusOptions,
): Promise<void> {
  const data: Prisma.ImportUpdateInput = {};

  switch (options.step) {
    case 'loading':
      // Only the loading step moves status to 'processing' and reports counts;
      // undefined fields are skipped by Prisma's update semantics.
      data.status = 'processing';
      data.currentStep = 'loading';
      data.currentBatch = options.batch;
      data.statusMessage = options.batch
        ? `Importing events from ${options.batch}`
        : 'Initializing...';
      data.totalEvents = options.totalEvents;
      data.processedEvents = options.processedEvents;
      break;
    case 'generating_session_ids':
      data.currentStep = 'generating_session_ids';
      data.currentBatch = options.batch;
      data.statusMessage = options.batch
        ? `Generating session IDs for ${options.batch}`
        : 'Generating session IDs...';
      break;
    case 'creating_sessions':
      data.currentStep = 'creating_sessions';
      data.currentBatch = options.batch;
      data.statusMessage = `Creating sessions for ${options.batch}`;
      break;
    case 'moving':
      data.currentStep = 'moving';
      data.currentBatch = options.batch;
      data.statusMessage = `Moving imports to production for ${options.batch}`;
      break;
    case 'backfilling_sessions':
      data.currentStep = 'backfilling_sessions';
      data.currentBatch = options.batch;
      data.statusMessage = `Aggregating sessions for ${options.batch}`;
      break;
    case 'completed':
      data.status = 'completed';
      data.currentStep = 'completed';
      data.statusMessage = 'Import completed';
      data.completedAt = new Date();
      break;
    case 'failed':
      data.status = 'failed';
      data.statusMessage = 'Import failed';
      data.errorMessage = options.errorMessage;
      break;
  }

  jobLogger.info('Import status update', data);
  // NOTE(review): updateProgress is typed void here but BullMQ's returns a
  // Promise; the await is harmless either way.
  await job.updateProgress(data);
  await db.import.update({
    where: { id: importId },
    data,
  });
}

View File

@@ -196,7 +196,7 @@ export async function getSessionList({
organization?.subscriptionPeriodEventsLimit &&
organization?.subscriptionPeriodEventsLimit > 1_000_000
? 1
: 7;
: 360;
if (cursor) {
const cAt = sqlstring.escape(cursor.createdAt);

View File

@@ -1,4 +1,5 @@
import type {
IImportConfig,
IIntegrationConfig,
INotificationRuleConfig,
IProjectFilters,
@@ -12,6 +13,7 @@ import type { IClickhouseProfile } from './services/profile.service';
declare global {
namespace PrismaJson {
type IPrismaImportConfig = IImportConfig;
type IPrismaNotificationRuleConfig = INotificationRuleConfig;
type IPrismaIntegrationConfig = IIntegrationConfig;
type IPrismaNotificationPayload = INotificationPayload;

View File

@@ -0,0 +1,35 @@
{
"name": "@openpanel/importer",
"version": "0.0.0",
"type": "module",
"main": "src/index.ts",
"scripts": {
"build": "tsc",
"dev": "tsc --watch",
"test": "vitest",
"test:run": "vitest run"
},
"exports": {
".": "./src/index.ts",
"./providers": "./src/providers/metadata.ts"
},
"dependencies": {
"@openpanel/common": "workspace:*",
"@openpanel/db": "workspace:*",
"@openpanel/queue": "workspace:*",
"@openpanel/validation": "workspace:*",
"csv-parse": "^6.1.0",
"ramda": "^0.29.1",
"uuid": "^9.0.1",
"zod": "catalog:"
},
"devDependencies": {
"@openpanel/logger": "workspace:*",
"@types/node": "^20.0.0",
"@types/ramda": "^0.31.1",
"@types/uuid": "^9.0.7",
"bullmq": "^5.8.7",
"typescript": "^5.0.0",
"vitest": "^1.0.0"
}
}

View File

@@ -0,0 +1,121 @@
import type { IClickhouseEvent } from '@openpanel/db';
import type { BaseRawEvent, ErrorContext, ImportJobMetadata } from './types';
/**
 * Contract shared by every import provider (Mixpanel, Umami, ...).
 *
 * A provider knows how to stream raw events out of its source, validate them,
 * and map them onto OpenPanel's ClickHouse event shape. The import job drives
 * these hooks; the optional members let a provider opt into extra behavior.
 */
export abstract class BaseImportProvider<
  TRawEvent extends BaseRawEvent = BaseRawEvent,
> {
  /** Provider identifier, e.g. 'mixpanel'. */
  abstract provider: string;
  /** Transformer version stamped onto imported events. */
  abstract version: string;
  /**
   * Stream-read and parse the source (file/API), yielding raw events.
   * Implemented as an async generator so arbitrarily large exports can be
   * processed without buffering everything in memory.
   */
  abstract parseSource(
    overrideFrom?: string,
  ): AsyncGenerator<TRawEvent, void, unknown>;
  /** Convert one provider-specific raw event into an IClickhouseEvent. */
  abstract transformEvent(rawEvent: TRawEvent): IClickhouseEvent;
  /** Cheap structural validation of a single raw event. */
  abstract validate(rawEvent: TRawEvent): boolean;
  /** Number of events that will be imported. */
  abstract getTotalEventsCount(): Promise<number>;
  /** Optional hook: pre-process a batch before transformation. */
  async beforeBatch?(events: TRawEvent[]): Promise<TRawEvent[]> {
    return events;
  }
  /** Optional hook: metadata used to track the import job. */
  getImportMetadata?(): ImportJobMetadata;
  /** Optional hook: custom error handling. The default simply re-throws. */
  async onError?(error: Error, context?: ErrorContext): Promise<void> {
    throw error;
  }
  /** Optional estimate of the total event count, for progress tracking. */
  async getEstimatedTotal?(): Promise<number> {
    return 0;
  }
  /**
   * Whether deterministic session IDs should be generated in SQL after the
   * import finishes (based on device_id and timestamp window functions).
   * Defaults to false, i.e. the provider is assumed to supply session IDs
   * itself while streaming.
   */
  shouldGenerateSessionIds(): boolean {
    return false;
  }
  /**
   * Utility: split an inclusive [from, to] date range (YYYY-MM-DD strings)
   * into consecutive chunks so large imports can be fetched piecewise and
   * avoid timeouts.
   *
   * @param from - Start date in YYYY-MM-DD format
   * @param to - End date in YYYY-MM-DD format
   * @param options.chunkSizeDays - Days per chunk (default: 1)
   * @returns Array of [chunkStart, chunkEnd] pairs in YYYY-MM-DD format
   */
  public getDateChunks(
    from: string,
    to: string,
    options?: {
      chunkSizeDays?: number;
    },
  ): Array<[string, string]> {
    const step = options?.chunkSizeDays ?? 1;
    const rangeStart = new Date(from);
    const rangeEnd = new Date(to);
    // A zero-length range collapses to a single chunk.
    if (rangeStart.getTime() === rangeEnd.getTime()) {
      return [[from, to]];
    }
    const out: Array<[string, string]> = [];
    const pointer = new Date(rangeStart);
    while (pointer <= rangeEnd) {
      // Last day covered by this chunk, clamped so we never step past `to`.
      const lastOfChunk = new Date(pointer);
      lastOfChunk.setDate(lastOfChunk.getDate() + (step - 1));
      const clamped = lastOfChunk > rangeEnd ? rangeEnd : lastOfChunk;
      out.push([
        pointer.toISOString().slice(0, 10),
        clamped.toISOString().slice(0, 10),
      ]);
      pointer.setDate(pointer.getDate() + step);
    }
    return out;
  }
}

View File

@@ -0,0 +1,13 @@
// Public entry point for the importer package: the concrete providers plus
// the shared types consumers need to drive an import.
export { UmamiProvider } from './providers/umami';
export { MixpanelProvider } from './providers/mixpanel';
// Type-only re-exports (erased at compile time).
export type {
  ImportConfig,
  ImportProgress,
  ImportResult,
  BatchResult,
  BaseRawEvent,
  ErrorContext,
  EventProperties,
  ImportJobMetadata,
  ImportStageResult,
} from './types';

View File

@@ -0,0 +1,30 @@
export type ImportProviderId = 'umami' | 'mixpanel';
export type ImportProviderType = 'file' | 'api';
export interface ImportProviderMeta {
  id: ImportProviderId;
  name: string;
  description: string;
  logo: string;
  backgroundColor: string;
  types: ImportProviderType[];
}
/**
 * Registry of the import providers that can be offered to users, including
 * the display metadata (name, description, logo) and whether each one imports
 * from an uploaded file or a remote API.
 */
export const IMPORT_PROVIDERS: ImportProviderMeta[] = [
  // Umami: file-based import.
  {
    name: 'Umami',
    id: 'umami',
    types: ['file'],
    description: 'Import your analytics data from Umami',
    backgroundColor: '#fff',
    logo: 'https://cdn.brandfetch.io/id_3VEohOm/w/180/h/180/theme/dark/logo.png?c=1dxbfHSJFAPEGdCLU4o5B',
  },
  // Mixpanel: API-based import.
  {
    name: 'Mixpanel',
    id: 'mixpanel',
    types: ['api'],
    description: 'Import your analytics data from Mixpanel API',
    backgroundColor: '#fff',
    logo: 'https://cdn.brandfetch.io/idr_rhI2FS/theme/dark/idMJ8uODLv.svg?c=1dxbfHSJFAPEGdCLU4o5B',
  },
];

View File

@@ -0,0 +1,319 @@
import { omit } from 'ramda';
import { describe, expect, it } from 'vitest';
import { MixpanelProvider } from './mixpanel';
// Unit tests for MixpanelProvider: date chunking plus transformEvent mapping
// (web page view renaming, stringified-JSON property flattening, and a
// react-native-style fixture that stashes the user agent in `osVersion`).
// NOTE: several expectations deliberately pin current (quirky) behavior —
// see the inline notes below before "fixing" the provider.
describe('mixpanel', () => {
  it('should chunk date range into day chunks', async () => {
    const provider = new MixpanelProvider('pid', {
      from: '2025-01-01',
      to: '2025-01-04',
      serviceAccount: 'sa',
      serviceSecret: 'ss',
      projectId: '123',
      provider: 'mixpanel',
      type: 'api',
      mapScreenViewProperty: undefined,
    });
    // Default chunk size is one day, so each day maps to its own [from, to] pair.
    const chunks = provider.getDateChunks('2025-01-01', '2025-01-04');
    expect(chunks).toEqual([
      ['2025-01-01', '2025-01-01'],
      ['2025-01-02', '2025-01-02'],
      ['2025-01-03', '2025-01-03'],
      ['2025-01-04', '2025-01-04'],
    ]);
  });
  it('should transform event', async () => {
    const provider = new MixpanelProvider('pid', {
      from: '2025-01-01',
      to: '2025-01-02',
      serviceAccount: 'sa',
      serviceSecret: 'ss',
      projectId: '123',
      provider: 'mixpanel',
      type: 'api',
      mapScreenViewProperty: undefined,
    });
    const rawEvent = {
      event: '$mp_web_page_view',
      properties: {
        // Mixpanel `time` is in seconds (asserted as 2025-05-01T11:12:50.000Z below).
        time: 1746097970,
        distinct_id: '$device:123',
        $browser: 'Chrome',
        $browser_version: 135,
        $city: 'Mumbai',
        $current_url:
          'https://domain.com/state/maharashtra?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_sour',
        $device: 'Android',
        $device_id: '123',
        $initial_referrer: 'https://referrer.com/',
        $initial_referring_domain: 'referrer.com',
        $insert_id: 'source_id',
        $lib_version: '2.60.0',
        $mp_api_endpoint: 'api-js.mixpanel.com',
        $mp_api_timestamp_ms: 1746078175363,
        $mp_autocapture: true,
        $os: 'Android',
        $referrer: 'https://google.com/',
        $referring_domain: 'referrer.com',
        $region: 'Maharashtra',
        $screen_height: 854,
        $screen_width: 384,
        current_domain: 'domain.com',
        current_page_title:
          'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
        current_url_path: '/state/maharashtra',
        current_url_protocol: 'https:',
        current_url_search:
          '?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_source=5&gclid=EAIaIQobChMI6MnvhciBjQMVlS-DAx',
        gclid: 'oqneoqow',
        mp_country_code: 'IN',
        mp_lib: 'web',
        mp_processing_time_ms: 1746078175546,
        mp_sent_by_lib_version: '2.60.0',
        utm_medium: 'cpc',
        utm_source: 'google',
      },
    };
    const res = provider.transformEvent(rawEvent);
    expect(res).toMatchObject({
      id: expect.any(String),
      // $mp_web_page_view is renamed to screen_view.
      name: 'screen_view',
      device_id: '123',
      // The '$device:' prefix is stripped from distinct_id.
      profile_id: '123',
      project_id: 'pid',
      session_id: '',
      properties: {
        __source_insert_id: 'source_id',
        __screen: '384x854',
        __lib_version: '2.60.0',
        '__query.utm_source': 'google',
        '__query.utm_medium': 'cpc',
        '__query.utm_campaignid': '890',
        '__query.utm_device': 'm',
        '__query.utm_network': 'x',
        '__query.utm_location': '123',
        '__query.gclid': 'oqneoqow',
        __title:
          'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
      },
      created_at: '2025-05-01T11:12:50.000Z',
      country: 'IN',
      city: 'Mumbai',
      region: 'Maharashtra',
      longitude: null,
      latitude: null,
      os: 'Android',
      os_version: undefined,
      browser: 'Chrome',
      browser_version: '',
      device: 'mobile',
      brand: '',
      model: '',
      duration: 0,
      path: '/state/maharashtra',
      origin: 'https://domain.com',
      referrer: 'https://referrer.com',
      referrer_name: 'Google',
      referrer_type: 'search',
      imported_at: expect.any(String),
      sdk_name: 'mixpanel (web)',
      sdk_version: '1.0.0',
    });
  });
  it('should parse stringified JSON in properties and flatten them', async () => {
    const provider = new MixpanelProvider('pid', {
      from: '2025-01-01',
      to: '2025-01-02',
      serviceAccount: 'sa',
      serviceSecret: 'ss',
      projectId: '123',
      provider: 'mixpanel',
      type: 'api',
      mapScreenViewProperty: undefined,
    });
    const rawEvent = {
      event: 'custom_event',
      properties: {
        time: 1746097970,
        distinct_id: '$device:123',
        $device_id: '123',
        $user_id: 'user123',
        mp_lib: 'web',
        // Stringified JSON object - should be parsed and flattened
        area: '{"displayText":"Malab, Nuh, Mewat","id":1189005}',
        // Stringified JSON array - should be parsed and flattened
        tags: '["tag1","tag2","tag3"]',
        // Regular string - should remain as is
        regularString: 'just a string',
        // Number - should be converted to string
        count: 42,
        // Object - should be flattened
        nested: { level1: { level2: 'value' } },
      },
    };
    const res = provider.transformEvent(rawEvent);
    expect(res.properties).toMatchObject({
      // Parsed JSON object should be flattened with dot notation
      'area.displayText': 'Malab, Nuh, Mewat',
      'area.id': '1189005',
      // Parsed JSON array should be flattened with numeric indices
      'tags.0': 'tag1',
      'tags.1': 'tag2',
      'tags.2': 'tag3',
      // Regular values
      regularString: 'just a string',
      count: '42',
      // Nested object flattened
      'nested.level1.level2': 'value',
    });
  });
  it('should handle react-native referrer', async () => {
    const provider = new MixpanelProvider('pid', {
      from: '2025-01-01',
      to: '2025-01-02',
      serviceAccount: 'sa',
      serviceSecret: 'ss',
      projectId: '123',
      provider: 'mixpanel',
      type: 'api',
      mapScreenViewProperty: undefined,
    });
    const rawEvent = {
      event: 'ec_search_error',
      properties: {
        time: 1759947367,
        distinct_id: '3385916',
        $browser: 'Mobile Safari',
        $browser_version: null,
        $city: 'Bengaluru',
        $current_url:
          'https://web.landeed.com/karnataka/ec-encumbrance-certificate',
        $device: 'iPhone',
        $device_id:
          '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
        $initial_referrer: 'https://www.google.com/',
        $initial_referring_domain: 'www.google.com',
        $insert_id: 'bclkaepeqcfuzt4v',
        $lib_version: '2.60.0',
        $mp_api_endpoint: 'api-js.mixpanel.com',
        $mp_api_timestamp_ms: 1759927570699,
        $os: 'iOS',
        $region: 'Karnataka',
        $screen_height: 852,
        $screen_width: 393,
        $search_engine: 'google',
        $user_id: '3385916',
        binaryReadableVersion: 'NA',
        binaryVersion: 'NA',
        component: '/karnataka/ec-encumbrance-certificate',
        errMsg: 'Request failed with status code 500',
        errType: 'SERVER_ERROR',
        isSilentSearch: false,
        isTimeout: false,
        jsVersion: '0.42.0',
        language: 'english',
        mp_country_code: 'IN',
        mp_lib: 'web',
        mp_processing_time_ms: 1759927592421,
        mp_sent_by_lib_version: '2.60.0',
        os: 'web',
        // This client stores the full user agent in `osVersion`; the provider
        // reads it from there (asserted via __userAgent / brand / model below).
        osVersion:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        phoneBrand: 'NA',
        phoneManufacturer: 'NA',
        phoneModel: 'NA',
        searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
        serverVersion: 'web2.0',
        state: 17,
        stateStr: '17',
        statusCode: 500,
        type: 'result_event',
        utm_medium: 'cpc',
        utm_source:
          'google%26utm_medium=cpc%26utm_campaignid=21380769590%26utm_adgroupid=%26utm_adid=%26utm_term=%26utm_device=m%26utm_network=%26utm_location=9062055%26gclid=%26gad_campaignid=21374496705%26gbraid=0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3%26wbraid=CmAKCQjwi4PHBhCUA',
      },
    };
    const res = provider.transformEvent(rawEvent);
    // id is a randomly generated UUID (36 chars).
    expect(res.id.length).toBeGreaterThan(30);
    expect(res.imported_at).toMatch(
      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
    );
    expect(omit(['id', 'imported_at'], res)).toEqual({
      brand: 'Apple',
      browser: 'GSA',
      // Pins current behavior: `$browser_version: null` is stringified to
      // 'null' by the provider's browser_version fallback.
      browser_version: 'null',
      city: 'Bengaluru',
      country: 'IN',
      created_at: '2025-10-08T18:16:07.000Z',
      device: 'mobile',
      device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
      duration: 0,
      latitude: null,
      longitude: null,
      model: 'iPhone',
      name: 'ec_search_error',
      origin: 'https://web.landeed.com',
      os: 'iOS',
      os_version: '18.7.0',
      path: '/karnataka/ec-encumbrance-certificate',
      profile_id: '3385916',
      project_id: 'pid',
      properties: {
        __lib_version: '2.60.0',
        '__query.gad_campaignid': '21374496705',
        '__query.gbraid': '0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3',
        '__query.utm_campaignid': '21380769590',
        '__query.utm_device': 'm',
        '__query.utm_location': '9062055',
        '__query.utm_medium': 'cpc',
        '__query.utm_source': 'google',
        '__query.wbraid': 'CmAKCQjwi4PHBhCUA',
        __screen: '393x852',
        __source_insert_id: 'bclkaepeqcfuzt4v',
        __userAgent:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        binaryReadableVersion: 'NA',
        binaryVersion: 'NA',
        component: '/karnataka/ec-encumbrance-certificate',
        errMsg: 'Request failed with status code 500',
        errType: 'SERVER_ERROR',
        isSilentSearch: 'false',
        isTimeout: 'false',
        jsVersion: '0.42.0',
        language: 'english',
        os: 'web',
        osVersion:
          'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
        phoneBrand: 'NA',
        phoneManufacturer: 'NA',
        phoneModel: 'NA',
        searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
        serverVersion: 'web2.0',
        state: '17',
        stateStr: '17',
        statusCode: '500',
        type: 'result_event',
      },
      referrer: 'https://www.google.com',
      referrer_name: 'Google',
      referrer_type: 'search',
      region: 'Karnataka',
      sdk_name: 'mixpanel (web)',
      sdk_version: '1.0.0',
      session_id: '',
    });
  });
});

View File

@@ -0,0 +1,452 @@
import { randomUUID } from 'node:crypto';
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IMixpanelImportConfig } from '@openpanel/validation';
import { z } from 'zod';
import { BaseImportProvider } from '../base-provider';
// Minimal shape accepted from Mixpanel's raw export: an event name plus an
// open-ended properties bag (further interpreted in transformEvent).
export const zMixpanelRawEvent = z.object({
  event: z.string(),
  properties: z.record(z.unknown()),
});
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
/**
 * Imports historical events from Mixpanel's raw export API
 * (data.mixpanel.com/api/2.0/export) and maps them onto OpenPanel's
 * ClickHouse event shape. The export is streamed as JSONL in per-day date
 * chunks; Mixpanel has no session concept, so session IDs are generated in
 * SQL after the import (see shouldGenerateSessionIds).
 */
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
  provider = 'mixpanel';
  version = '1.0.0';
  constructor(
    private readonly projectId: string,
    private readonly config: IMixpanelImportConfig,
    private readonly logger?: ILogger,
  ) {
    super();
  }
  async getTotalEventsCount(): Promise<number> {
    // Mixpanel provides no good way to count events within a period up front
    // (JQL would work but is inaccurate and will be deprecated end of 2025),
    // so report "unknown" as -1.
    return -1;
  }
  /**
   * Mixpanel doesn't provide session IDs, so we need to generate them in SQL
   * after all events are imported to ensure deterministic results
   */
  shouldGenerateSessionIds(): boolean {
    return true;
  }
  async *parseSource(
    overrideFrom?: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    yield* this.fetchEventsFromMixpanel(overrideFrom);
  }
  // Streams the configured date range, one chunk at a time.
  private async *fetchEventsFromMixpanel(
    overrideFrom?: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
    // Split the date range into chunks for reliability (the base class
    // default is one day per chunk) to avoid timeouts on large ranges.
    const dateChunks = this.getDateChunks(overrideFrom ?? from, to);
    for (const [chunkFrom, chunkTo] of dateChunks) {
      yield* this.fetchEventsForDateRange(
        serviceAccount,
        serviceSecret,
        projectId,
        chunkFrom,
        chunkTo,
      );
    }
  }
  // Fetches one chunk from /api/2.0/export and yields events line by line;
  // the endpoint streams newline-delimited JSON.
  private async *fetchEventsForDateRange(
    serviceAccount: string,
    serviceSecret: string,
    projectId: string,
    from: string,
    to: string,
  ): AsyncGenerator<MixpanelRawEvent, void, unknown> {
    const url = 'https://data.mixpanel.com/api/2.0/export';
    const params = new URLSearchParams({
      from_date: from,
      to_date: to,
      project_id: projectId,
    });
    this.logger?.info('Fetching events from Mixpanel', {
      url: `${url}?${params}`,
      from,
      to,
      projectId,
      serviceAccount,
    });
    // Service-account credentials are sent via HTTP Basic auth.
    const response = await fetch(`${url}?${params}`, {
      method: 'GET',
      headers: {
        Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
        Accept: 'application/json',
      },
    });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
      );
    }
    if (!response.body) {
      throw new Error('No response body from Mixpanel API');
    }
    // Stream the response line by line
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });
        // Process complete lines
        const lines = buffer.split('\n');
        buffer = lines.pop() || ''; // Keep the last incomplete line in buffer
        for (const line of lines) {
          if (line.trim()) {
            try {
              const event = JSON.parse(line);
              yield event;
            } catch (error) {
              // Malformed lines are skipped rather than failing the import.
              console.warn('Failed to parse Mixpanel event:', line);
            }
          }
        }
      }
      // Process any remaining line in buffer
      if (buffer.trim()) {
        try {
          const event = JSON.parse(buffer);
          yield event;
        } catch (error) {
          console.warn('Failed to parse final Mixpanel event:', buffer);
        }
      }
    } finally {
      reader.releaseLock();
    }
  }
  // Structural validation against the minimal raw-event schema.
  validate(rawEvent: MixpanelRawEvent): boolean {
    const res = zMixpanelRawEvent.safeParse(rawEvent);
    return res.success;
  }
  /**
   * Maps one raw Mixpanel event onto IClickhouseEvent: URL/referrer parsing,
   * device classification, property stripping/flattening, and event renaming
   * ($mp_web_page_view -> screen_view). Exact behavior — including some
   * quirks flagged below — is pinned by the unit tests.
   */
  transformEvent(_rawEvent: MixpanelRawEvent): IClickhouseEvent {
    const projectId = this.projectId;
    const rawEvent = zMixpanelRawEvent.parse(_rawEvent);
    const props = rawEvent.properties as Record<string, any>;
    const deviceId = props.$device_id;
    // Prefer $user_id, fall back to distinct_id; anonymous distinct_ids carry
    // a '$device:' prefix which is stripped.
    const profileId = String(props.$user_id || props.distinct_id).replace(
      /^\$device:/,
      '',
    );
    // Build full URL from current_url and current_url_search (web only)
    const fullUrl = props.$current_url;
    let path = '';
    let origin = '';
    let hash = '';
    let query: Record<string, string> = {};
    if (fullUrl) {
      const parsed = parsePath(fullUrl);
      path = parsed.path || '';
      origin = parsed.origin || '';
      hash = parsed.hash || '';
      query = parsed.query || {};
    } else if (this.config.mapScreenViewProperty) {
      // Non-web events can map a configured property to the screen path.
      path = props[this.config.mapScreenViewProperty] || '';
    }
    // Extract referrer information (web only)
    const referrerUrl = props.$initial_referrer || props.$referrer || '';
    const referrer =
      referrerUrl && !isSameDomain(referrerUrl, fullUrl)
        ? parseReferrer(referrerUrl)
        : null;
    // Check for UTM referrer in query params (web only)
    const utmReferrer = getReferrerWithQuery(query);
    // Extract location data
    const country = props.$country || props.mp_country_code || '';
    const city = props.$city || '';
    const region = props.$region || '';
    // For web events, use the standard user agent parsing.
    // NOTE(review): the UA string is read from `osVersion` — some clients
    // stash the full user agent there (see the react-native test fixture).
    const userAgent = props.osVersion || '';
    const uaInfo = this.isWebEvent(props.mp_lib)
      ? parseUserAgent(userAgent, props)
      : this.parseServerDeviceInfo(props);
    // Map event name - $mp_web_page_view should be screen_view
    let eventName = rawEvent.event;
    if (eventName === '$mp_web_page_view') {
      eventName = 'screen_view';
    }
    // Build properties object - strip Mixpanel-specific properties
    const properties = this.stripMixpanelProperties(props, query);
    if (props.$insert_id) {
      // Keep the original insert id for traceability/dedup against the source.
      properties.__source_insert_id = String(props.$insert_id);
    }
    // Add useful properties
    if (props.$screen_width && props.$screen_height) {
      properties.__screen = `${props.$screen_width}x${props.$screen_height}`;
    }
    if (props.$screen_dpi) {
      properties.__dpi = props.$screen_dpi;
    }
    if (props.$language) {
      properties.__language = props.$language;
    }
    if (props.$timezone) {
      properties.__timezone = props.$timezone;
    }
    if (props.$app_version) {
      properties.__version = props.$app_version;
    }
    if (props.$app_build_number) {
      properties.__buildNumber = props.$app_build_number;
    }
    if (props.$lib_version) {
      properties.__lib_version = props.$lib_version;
    }
    if (hash) {
      properties.__hash = hash;
    }
    if (Object.keys(query).length > 0) {
      properties.__query = query;
    }
    if (props.current_page_title) {
      properties.__title = props.current_page_title;
    }
    if (userAgent) {
      properties.__userAgent = userAgent;
    }
    // Always use UUID for id to match ClickHouse UUID column
    const event = {
      id: randomUUID(),
      name: eventName,
      device_id: deviceId,
      profile_id: profileId,
      project_id: projectId,
      session_id: '', // Will be generated in SQL after import
      properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
      created_at: new Date(props.time * 1000).toISOString(), // Mixpanel `time` is in seconds
      country,
      city,
      region,
      longitude: null,
      latitude: null,
      os: uaInfo.os || props.$os,
      // NOTE(review): `$osVersion` is presumably a typo for `$os_version` — confirm.
      os_version: uaInfo.osVersion || props.$osVersion,
      browser: uaInfo.browser || props.$browser,
      // NOTE(review): precedence makes this
      //   (uaInfo.browserVersion || props.$browserVersion) ? String(props.$browser_version) : ''
      // and `$browserVersion` looks like a typo for `$browser_version`. The
      // unit tests pin this exact behavior (including the literal string
      // 'null' when $browser_version is null) — confirm before changing.
      browser_version:
        uaInfo.browserVersion || props.$browserVersion
          ? String(props.$browser_version)
          : '',
      device: this.getDeviceType(props.mp_lib, uaInfo, props),
      brand: uaInfo.brand || '',
      model: uaInfo.model || '',
      duration: 0,
      path,
      origin,
      referrer: referrer?.url || '',
      referrer_name: utmReferrer?.name || referrer?.name || '',
      referrer_type: referrer?.type || utmReferrer?.type || '',
      imported_at: new Date().toISOString(),
      sdk_name: props.mp_lib
        ? `${this.provider} (${props.mp_lib})`
        : this.provider,
      sdk_version: this.version,
    };
    // TODO: Remove this
    // Temporary fix for a client
    const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
    if (isMightBeScreenView && event.name === 'Loaded a Screen') {
      event.name = 'screen_view';
      event.path = isMightBeScreenView;
    }
    // TODO: Remove this
    // This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
    if (props.utm_source && !properties.__query?.utm_source) {
      const split = decodeURIComponent(props.utm_source).split('&');
      const query = Object.fromEntries(split.map((item) => item.split('=')));
      for (const [key, value] of Object.entries(query)) {
        if (key && value) {
          event.properties[`__query.${key}`] = String(value);
        } else if (
          value === undefined &&
          key &&
          props.utm_source.startsWith(key)
        ) {
          event.properties['__query.utm_source'] = String(key);
        }
      }
    }
    return event;
  }
  /**
   * Classifies the event's device as 'mobile' | 'tablet' | 'desktop' |
   * 'server' from the SDK library name plus OS/browser hints.
   */
  private getDeviceType(
    mp_lib: string,
    uaInfo: UserAgentInfo,
    props: Record<string, any>,
  ) {
    // Normalize lib/os/browser data
    const lib = (mp_lib || '').toLowerCase();
    const os = String(props.$os || uaInfo.os || '').toLowerCase();
    const browser = String(
      props.$browser || uaInfo.browser || '',
    ).toLowerCase();
    const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
    // Strong hint from SDK library
    if (['android', 'iphone', 'react-native', 'swift', 'unity'].includes(lib)) {
      return isTabletOs ? 'tablet' : 'mobile';
    }
    // Web or unknown SDKs: infer from OS/Browser
    const isMobileSignal =
      os === 'ios' ||
      os === 'android' ||
      browser.includes('mobile safari') ||
      browser.includes('chrome ios') ||
      browser.includes('android mobile') ||
      browser.includes('samsung internet') ||
      browser.includes('mobile');
    if (isMobileSignal) {
      return 'mobile';
    }
    const isTabletSignal =
      isTabletOs ||
      browser.includes('tablet') ||
      // iPad often reports as Mac OS X with Mobile Safari
      (browser.includes('mobile safari') &&
        (os === 'mac os x' || os === 'macos'));
    if (isTabletSignal) {
      return 'tablet';
    }
    // Default to desktop
    return this.isServerEvent(mp_lib) ? 'server' : 'desktop';
  }
  // NOTE(review): despite the name this matches all first-party client SDKs
  // ('web' plus the mobile/game libs below); anything else is treated as a
  // server event and handled by parseServerDeviceInfo.
  private isWebEvent(mp_lib: string) {
    return [
      'web',
      'android',
      'iphone',
      'swift',
      'unity',
      'react-native',
    ].includes(mp_lib);
  }
  private isServerEvent(mp_lib: string) {
    return !this.isWebEvent(mp_lib);
  }
  // Heuristic: an ALL_CAPS property key is taken as a screen name (used by
  // the "Loaded a Screen" client fix above).
  // NOTE(review): the character class omits '0' — [A-Z1-9_] is likely meant
  // to be [A-Z0-9_].
  private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
    const props = rawEvent.properties as Record<string, any>;
    return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
  }
  // For non-web (server) events, build device info straight from Mixpanel
  // properties instead of parsing a user agent string.
  private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
    const os = props.$os || props.os || '';
    const osVersion = props.$os_version || props.osVersion || '';
    const brand = props.$brand || props.phoneBrand || '';
    const model = props.$model || props.phoneModel || '';
    const device = os.toLowerCase();
    return {
      isServer: true,
      os: os,
      osVersion: osVersion,
      browser: '',
      browserVersion: '',
      device: device,
      brand: brand,
      model: model,
    };
  }
  /**
   * Drops Mixpanel-internal keys ($-, mp_-, utm_- prefixed), URL-derived
   * keys, and any key already captured as a query parameter. Stringified
   * JSON values are parsed back into objects/arrays so toDots() can flatten
   * them.
   */
  private stripMixpanelProperties(
    properties: Record<string, any>,
    searchParams: Record<string, string>,
  ): Record<string, any> {
    const strip = [
      'time',
      'distinct_id',
      'current_page_title',
      'current_url_path',
      'current_url_protocol',
      'current_url_search',
      'current_domain',
      ...Object.keys(searchParams),
    ];
    const filtered = Object.fromEntries(
      Object.entries(properties).filter(
        ([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key),
      ),
    );
    // Parse JSON strings back to objects/arrays so toDots() can flatten them
    const parsed: Record<string, any> = {};
    for (const [key, value] of Object.entries(filtered)) {
      if (
        typeof value === 'string' &&
        (value.startsWith('{') || value.startsWith('['))
      ) {
        try {
          parsed[key] = JSON.parse(value);
        } catch {
          parsed[key] = value; // Keep as string if parsing fails
        }
      } else {
        parsed[key] = value;
      }
    }
    return parsed;
  }
}

View File

@@ -0,0 +1,382 @@
import { randomUUID } from 'node:crypto';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import { createBrotliDecompress, createGunzip } from 'node:zlib';
import { isSameDomain, parsePath } from '@openpanel/common';
import { generateDeviceId } from '@openpanel/common/server';
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
import type { IClickhouseEvent } from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { IUmamiImportConfig } from '@openpanel/validation';
import { parse } from 'csv-parse';
import { assocPath } from 'ramda';
import { z } from 'zod';
import { BaseImportProvider } from '../base-provider';
// Row shape of Umami's CSV export. Coercions (`z.coerce`) handle CSV cells,
// which always arrive as strings even for numeric/date columns.
export const zUmamiRawEvent = z.object({
  // Required fields
  event_type: z.coerce.number(),
  event_name: z.string(),
  created_at: z.coerce.date(),
  event_id: z.string().min(1),
  session_id: z.string().min(1),
  website_id: z.string().min(1),
  // Optional fields that might be empty
  visit_id: z.string().optional(),
  distinct_id: z.string().optional(),
  url_path: z.string().optional(),
  hostname: z.string().optional(),
  referrer_domain: z.string().optional(),
  referrer_path: z.string().optional(),
  referrer_query: z.string().optional(),
  referrer_name: z.string().optional(),
  referrer_type: z.string().optional(),
  country: z.string().optional(),
  city: z.string().optional(),
  region: z.string().optional(),
  browser: z.string().optional(),
  os: z.string().optional(),
  device: z.string().optional(),
  screen: z.string().optional(),
  language: z.string().optional(),
  utm_source: z.string().optional(),
  utm_medium: z.string().optional(),
  utm_campaign: z.string().optional(),
  utm_content: z.string().optional(),
  utm_term: z.string().optional(),
  page_title: z.string().optional(),
  // Ad-click identifiers (Google, Facebook, Microsoft, TikTok, LinkedIn, X)
  gclid: z.string().optional(),
  fbclid: z.string().optional(),
  msclkid: z.string().optional(),
  ttclid: z.string().optional(),
  li_fat_id: z.string().optional(),
  twclid: z.string().optional(),
  url_query: z.string().optional(),
});
export type UmamiRawEvent = z.infer<typeof zUmamiRawEvent>;
export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
provider = 'umami';
version = '1.0.0';
constructor(
private readonly projectId: string,
private readonly config: IUmamiImportConfig,
private readonly logger?: ILogger,
) {
super();
}
async getTotalEventsCount(): Promise<number> {
return -1;
}
async *parseSource(): AsyncGenerator<UmamiRawEvent, void, unknown> {
yield* this.parseRemoteFile(this.config.fileUrl);
}
private async *parseRemoteFile(
url: string,
opts: {
signal?: AbortSignal;
maxBytes?: number;
maxRows?: number;
} = {},
): AsyncGenerator<UmamiRawEvent, void, unknown> {
const { signal, maxBytes, maxRows } = opts;
const controller = new AbortController();
// Link to caller's signal for cancellation
if (signal) {
signal.addEventListener('abort', () => controller.abort(), {
once: true,
});
}
const res = await fetch(url, { signal: controller.signal });
if (!res.ok || !res.body) {
throw new Error(
`Failed to fetch remote file: ${res.status} ${res.statusText}`,
);
}
const contentType = res.headers.get('content-type') || '';
const contentEnc = res.headers.get('content-encoding') || '';
const contentLen = Number(res.headers.get('content-length') ?? 0);
if (
contentType &&
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
contentType,
)
) {
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
}
if (maxBytes && contentLen && contentLen > maxBytes) {
throw new Error(
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
);
}
const looksGzip =
/\.gz($|\?)/i.test(url) ||
/gzip/i.test(contentEnc) ||
/application\/gzip/i.test(contentType);
const looksBr = /br/i.test(contentEnc) || /\.br($|\?)/i.test(url);
// WHATWG -> Node stream
const body = Readable.fromWeb(res.body as any);
// Optional size guard during stream
let seenBytes = 0;
if (maxBytes) {
body.on('data', (chunk: Buffer) => {
seenBytes += chunk.length;
if (seenBytes > maxBytes) {
controller.abort();
body.destroy(
new Error(
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
),
);
}
});
}
// Build decode chain (gzip/brotli -> CSV parser)
const decompress = looksGzip
? createGunzip()
: looksBr
? createBrotliDecompress()
: null;
const parser = parse({
columns: true, // objects per row
bom: true, // handle UTF-8 BOM
relax_column_count: true,
skip_empty_lines: true,
});
// Wire the pipeline for proper backpressure & error propagation
(async () => {
try {
if (decompress) {
await pipeline(body, decompress, parser, {
signal: controller.signal,
});
} else {
await pipeline(body, parser, { signal: controller.signal });
}
} catch (e) {
parser.destroy(e as Error);
}
})().catch(() => {
/* handled by iterator */
});
let rows = 0;
try {
for await (const record of parser) {
rows++;
if (maxRows && rows > maxRows) {
controller.abort();
throw new Error(`Row limit exceeded (${rows} > ${maxRows})`);
}
yield record as UmamiRawEvent;
}
} catch (err) {
throw new Error(
`Failed to parse remote file from ${url}: ${
err instanceof Error ? err.message : String(err)
}`,
);
} finally {
controller.abort(); // ensure fetch stream is torn down
}
}
validate(rawEvent: UmamiRawEvent): boolean {
const res = zUmamiRawEvent.safeParse(rawEvent);
return res.success;
}
/**
 * Convert one raw Umami export row into an OpenPanel ClickHouse event.
 *
 * Throws (via `zUmamiRawEvent.parse`) when the row does not match the schema.
 */
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
  // Resolve the destination project: an explicit website_id -> project
  // mapping wins; otherwise fall back to this import's target project.
  const projectId =
    this.config.projectMapper.find(
      (mapper) => mapper.from === _rawEvent.website_id,
    )?.to || this.projectId;
  const rawEvent = zUmamiRawEvent.parse(_rawEvent);

  // Device identity: prefer Umami's visit_id. When it is missing, derive a
  // stable id from the identifiers we do have. (Previously this branch hashed
  // `visit_id!`, but it only runs when visit_id is falsy, so every such event
  // collapsed onto a single device id.)
  const deviceId =
    rawEvent.visit_id ||
    generateDeviceId({
      ip: rawEvent.session_id || rawEvent.distinct_id || '',
      ua: rawEvent.hostname || '',
      origin: projectId,
      salt: 'xxx',
    });
  const profileId = rawEvent.distinct_id || deviceId;

  // Rebuild a full URL so the same parsing path as real-time events applies.
  // NOTE(review): url_query is joined without a '?' separator — presumably the
  // export already includes it; confirm against a real Umami CSV.
  const url = rawEvent.url_path
    ? `https://${[rawEvent.hostname, rawEvent.url_path, rawEvent.url_query]
        .filter(Boolean)
        .join('')}`
    : '';
  const { path, query, origin } = parsePath(url);

  // Referrer handling mirrors real-time ingestion: same-domain referrers are
  // dropped, and UTM params in the query can override the parsed referrer.
  const referrerUrl = rawEvent.referrer_domain
    ? `https://${rawEvent.referrer_domain}${rawEvent.referrer_path || ''}`
    : '';
  const referrer = isSameDomain(referrerUrl, url)
    ? null
    : parseReferrer(referrerUrl);
  const utmReferrer = getReferrerWithQuery(query);

  // Geo data straight from the export.
  const country = rawEvent.country || '';
  const city = rawEvent.city || '';
  const region = rawEvent.region || '';

  // Browser/device info. Umami's CSV does not carry versions or brand/model.
  const browser = rawEvent.browser || '';
  const browserVersion = '';
  const os = rawEvent.os || '';
  const osVersion = '';
  const device = rawEvent.device || '';
  const brand = '';
  const model = '';

  let properties: Record<string, any> = {};
  if (query) {
    properties.__query = query;
  }
  // Carry over the Umami metadata OpenPanel understands.
  if (rawEvent.page_title) properties.__title = rawEvent.page_title;
  if (rawEvent.screen) properties.__screen = rawEvent.screen;
  if (rawEvent.language) properties.__language = rawEvent.language;
  // Explicit UTM columns win over anything already parsed into __query.
  if (rawEvent.utm_source)
    properties = assocPath(
      ['__query', 'utm_source'],
      rawEvent.utm_source,
      properties,
    );
  if (rawEvent.utm_medium)
    properties = assocPath(
      ['__query', 'utm_medium'],
      rawEvent.utm_medium,
      properties,
    );
  if (rawEvent.utm_campaign)
    properties = assocPath(
      ['__query', 'utm_campaign'],
      rawEvent.utm_campaign,
      properties,
    );
  if (rawEvent.utm_content)
    properties = assocPath(
      ['__query', 'utm_content'],
      rawEvent.utm_content,
      properties,
    );
  if (rawEvent.utm_term)
    properties = assocPath(
      ['__query', 'utm_term'],
      rawEvent.utm_term,
      properties,
    );

  return {
    id: rawEvent.event_id || randomUUID(),
    // Umami event_type 1 is a pageview; everything else is a custom event.
    name: rawEvent.event_type === 1 ? 'screen_view' : rawEvent.event_name,
    device_id: deviceId,
    profile_id: profileId,
    project_id: projectId,
    session_id: rawEvent.session_id || '',
    properties,
    created_at: rawEvent.created_at.toISOString(),
    country,
    city,
    region: this.mapRegion(region),
    longitude: null,
    latitude: null,
    os,
    os_version: osVersion,
    browser: this.mapBrowser(browser),
    browser_version: browserVersion,
    device: this.mapDevice(device),
    brand,
    model,
    duration: 0,
    path,
    origin,
    referrer: utmReferrer?.url || referrer?.url || '',
    referrer_name: utmReferrer?.name || referrer?.name || '',
    referrer_type: utmReferrer?.type || referrer?.type || '',
    imported_at: new Date().toISOString(),
    sdk_name: this.provider,
    sdk_version: this.version,
  };
}
mapRegion(region: string): string {
  // Umami stores regions as ISO 3166-2 codes ("US-CA"); keep only the
  // subdivision part after the two-letter country prefix.
  const isoCountryPrefix = /^[A-Z]{2}-/;
  return region.replace(isoCountryPrefix, '');
}
/**
 * Normalize an Umami device string to an OpenPanel device category.
 * Unrecognized (or empty) values default to 'desktop'.
 */
mapDevice(device: string): string {
  // A switch avoids the inherited-key hole of a plain-object lookup, where
  // keys like 'toString' resolve to Object.prototype members (functions)
  // instead of falling through to the default.
  switch (device) {
    case 'desktop':
    case 'laptop':
    case 'Unknown': // Umami emits a capitalized 'Unknown'
      return 'desktop';
    case 'mobile':
      return 'mobile';
    case 'tablet':
      return 'tablet';
    case 'smarttv':
      return 'smarttv';
    default:
      return 'desktop';
  }
}
/**
 * Map an Umami/detect-browser identifier to a display name.
 * Unknown identifiers pass through unchanged; empty input yields 'Unknown'.
 */
mapBrowser(browser: string): string {
  const mapping: Record<string, string> = {
    android: 'Android',
    aol: 'AOL',
    bb10: 'BlackBerry 10',
    beaker: 'Beaker',
    chrome: 'Chrome',
    'chromium-webview': 'Chrome (webview)',
    crios: 'Chrome (iOS)',
    curl: 'Curl',
    edge: 'Edge',
    'edge-chromium': 'Edge (Chromium)',
    'edge-ios': 'Edge (iOS)',
    facebook: 'Facebook',
    firefox: 'Firefox',
    fxios: 'Firefox (iOS)',
    ie: 'IE',
    instagram: 'Instagram',
    ios: 'iOS',
    'ios-webview': 'iOS (webview)',
    kakaotalk: 'KakaoTalk',
    miui: 'MIUI',
    opera: 'Opera',
    'opera-mini': 'Opera Mini',
    phantomjs: 'PhantomJS',
    safari: 'Safari',
    samsung: 'Samsung',
    searchbot: 'Searchbot',
    silk: 'Silk',
    yandexbrowser: 'Yandex',
  };
  // Guard the lookup: a bare `mapping[browser]` resolves inherited
  // Object.prototype keys ('toString', 'constructor', …) to functions,
  // which the `||` chain would then return as if they were strings.
  if (Object.prototype.hasOwnProperty.call(mapping, browser)) {
    return mapping[browser] ?? 'Unknown';
  }
  return browser || 'Unknown';
}
}

View File

@@ -0,0 +1,80 @@
import type {
IImportedEvent,
IServiceCreateEventPayload,
IServiceImportedEventPayload,
} from '@openpanel/db';
/** User-supplied settings describing where an import pulls its data from. */
export interface ImportConfig {
  projectId: string;
  provider: string;
  /** Whether events are read from a file or fetched from a remote API. */
  sourceType: 'file' | 'api';
  /** File location or API endpoint, depending on `sourceType`. */
  sourceLocation: string;
}

/** State kept per in-flight session while stitching imported events. */
export interface SessionInfo {
  id: string;
  // Timestamp of the most recent event in the session — presumably epoch
  // millis used for session-timeout decisions; TODO confirm units.
  lastTimestamp: number;
  lastEvent: IServiceImportedEventPayload;
}

/** Progress counters reported while an import is running. */
export interface ImportProgress {
  totalEvents: number;
  processedEvents: number;
  currentBatch: number;
  totalBatches: number;
}

/** Final outcome of an import run. */
export interface ImportResult {
  success: boolean;
  totalEvents: number;
  processedEvents: number;
  /** Present only when `success` is false. */
  error?: string;
}

/** Output of processing one batch: regular events plus session markers. */
export interface BatchResult {
  events: IServiceImportedEventPayload[];
  sessionEvents: IServiceImportedEventPayload[];
}

// Generic types for raw events from different providers
export interface BaseRawEvent {
  [key: string]: unknown;
}

// Error context for better error handling: attach whatever position
// information is available when a batch or single event fails.
export interface ErrorContext {
  batchNumber?: number;
  batchSize?: number;
  eventIndex?: number;
  rawEvent?: BaseRawEvent;
  provider?: string;
}

// Properties type for events - more specific than Record<string, any>.
// Double-underscore keys are reserved OpenPanel meta-properties.
export interface EventProperties {
  [key: string]:
    | string
    | number
    | boolean
    | null
    | undefined
    | Record<string, unknown>;
  /** Parsed query-string parameters of the page URL. */
  __query?: Record<string, unknown>;
  __title?: string;
  __screen?: string;
  __language?: string;
}

// Import job metadata for tracking import progress
export interface ImportJobMetadata {
  importId: string;
  importStatus: 'pending' | 'processing' | 'processed' | 'failed';
  importedAt: Date;
}

// Result of import staging operations
export interface ImportStageResult {
  importId: string;
  totalEvents: number;
  /** May be lower than `totalEvents` if some rows were rejected. */
  insertedEvents: number;
}

View File

@@ -0,0 +1,9 @@
{
"extends": "../../tooling/typescript/base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "./src"
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}

View File

@@ -0,0 +1,3 @@
import { getSharedVitestConfig } from '../../vitest.shared';

// Reuse the repo-wide vitest settings, rooted at this package's directory.
export default getSharedVitestConfig({ __dirname });

View File

@@ -110,7 +110,6 @@ export const eventsGroupQueue = new GroupQueue<
>({
logger: queueLogger,
namespace: 'group_events',
// @ts-expect-error - TODO: Fix this in groupmq
redis: getRedisGroupQueue(),
orderingMethod: 'in-memory',
orderingWindowMs,
@@ -166,6 +165,21 @@ export const notificationQueue = new Queue<NotificationQueuePayload>(
},
);
/** Job payload handed to the background import worker. */
export type ImportQueuePayload = {
  type: 'import';
  payload: {
    /** Id of the Import record describing what to import. */
    importId: string;
  };
};

// Dedicated queue for import jobs (presumably BullMQ — confirm). Keeps a
// short history of completed jobs and a longer one of failures for debugging.
export const importQueue = new Queue<ImportQueuePayload>('import', {
  connection: getRedisQueue(),
  defaultJobOptions: {
    removeOnComplete: 10,
    removeOnFail: 50,
  },
});
export function addTrialEndingSoonJob(organizationId: string, delay: number) {
return miscQueue.add(
'misc',

View File

@@ -8,7 +8,7 @@
},
"dependencies": {
"@openpanel/json": "workspace:*",
"ioredis": "^5.7.0"
"ioredis": "5.8.2"
},
"devDependencies": {
"@openpanel/db": "workspace:*",

View File

@@ -16,6 +16,7 @@
"@openpanel/payments": "workspace:^",
"@openpanel/redis": "workspace:*",
"@openpanel/validation": "workspace:*",
"@openpanel/queue": "workspace:*",
"@trpc-limiter/redis": "^0.0.2",
"@trpc/client": "^11.6.0",
"@trpc/server": "^11.6.0",

View File

@@ -4,6 +4,7 @@ import { chatRouter } from './routers/chat';
import { clientRouter } from './routers/client';
import { dashboardRouter } from './routers/dashboard';
import { eventRouter } from './routers/event';
import { importRouter } from './routers/import';
import { integrationRouter } from './routers/integration';
import { notificationRouter } from './routers/notification';
import { onboardingRouter } from './routers/onboarding';
@@ -40,6 +41,7 @@ export const appRouter = createTRPCRouter({
reference: referenceRouter,
notification: notificationRouter,
integration: integrationRouter,
import: importRouter,
auth: authRouter,
subscription: subscriptionRouter,
overview: overviewRouter,

View File

@@ -12,7 +12,7 @@ import {
validateSessionToken,
verifyPasswordHash,
} from '@openpanel/auth';
import { generateSecureId } from '@openpanel/common/server/id';
import { generateSecureId } from '@openpanel/common/server';
import {
connectUserToOrganization,
db,

View File

@@ -0,0 +1,178 @@
import { z } from 'zod';
import { db } from '@openpanel/db';
import { importQueue } from '@openpanel/queue';
import { zCreateImport } from '@openpanel/validation';
import { getProjectAccess } from '../access';
import { TRPCAccessError } from '../errors';
import { createTRPCRouter, protectedProcedure } from '../trpc';
/**
 * tRPC router for managing data imports. Every procedure checks project
 * access; mutations additionally require more than read-level access.
 */
export const importRouter = createTRPCRouter({
  // List a project's imports, newest first.
  list: protectedProcedure
    .input(z.object({ projectId: z.string() }))
    .query(async ({ input, ctx }) => {
      const access = await getProjectAccess({
        projectId: input.projectId,
        userId: ctx.session.userId,
      });
      if (!access) {
        throw TRPCAccessError('You do not have access to this project');
      }
      return db.import.findMany({
        where: {
          projectId: input.projectId,
        },
        orderBy: {
          createdAt: 'desc',
        },
      });
    }),
  // Fetch a single import (with its project). Access is checked against the
  // project the import belongs to, after the record is loaded.
  get: protectedProcedure
    .input(z.object({ id: z.string() }))
    .query(async ({ input, ctx }) => {
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
        include: {
          project: true,
        },
      });
      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });
      if (!access) {
        throw TRPCAccessError('You do not have access to this import');
      }
      return importRecord;
    }),
  // Create an import record and enqueue the background job that runs it.
  create: protectedProcedure
    .input(zCreateImport)
    .mutation(async ({ input, ctx }) => {
      const access = await getProjectAccess({
        projectId: input.projectId,
        userId: ctx.session.userId,
      });
      // Read-only members may not start imports.
      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to create imports for this project',
        );
      }
      // Create import record
      const importRecord = await db.import.create({
        data: {
          projectId: input.projectId,
          config: input.config,
          status: 'pending',
        },
      });
      // Add job to queue
      const job = await importQueue.add('import', {
        type: 'import',
        payload: {
          importId: importRecord.id,
        },
      });
      // Update import record with job ID so it can be cancelled/retried later.
      await db.import.update({
        where: { id: importRecord.id },
        data: { jobId: job.id },
      });
      return {
        ...importRecord,
        jobId: job.id,
      };
    }),
  // Delete an import and remove its queued job, if one is still present.
  delete: protectedProcedure
    .input(z.object({ id: z.string() }))
    .mutation(async ({ input, ctx }) => {
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
      });
      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });
      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to delete imports for this project',
        );
      }
      // Best effort: the job may already have completed and been removed.
      if (importRecord.jobId) {
        const job = await importQueue.getJob(importRecord.jobId);
        if (job) {
          await job.remove();
        }
      }
      return db.import.delete({
        where: {
          id: input.id,
        },
      });
    }),
  // Re-enqueue a failed import and reset its status/error.
  retry: protectedProcedure
    .input(z.object({ id: z.string() }))
    .mutation(async ({ input, ctx }) => {
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
      });
      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });
      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to retry imports for this project',
        );
      }
      // Only allow retry for failed imports
      if (importRecord.status !== 'failed') {
        throw new Error('Only failed imports can be retried');
      }
      // Add new job to queue
      const job = await importQueue.add('import', {
        type: 'import',
        payload: {
          importId: importRecord.id,
        },
      });
      // Update import record with the fresh job and a clean slate.
      return db.import.update({
        where: { id: importRecord.id },
        data: {
          jobId: job.id,
          status: 'pending',
          errorMessage: null,
        },
      });
    }),
});

View File

@@ -11,7 +11,7 @@ import {
} from '@openpanel/db';
import { zEditOrganization, zInviteUser } from '@openpanel/validation';
import { generateSecureId } from '@openpanel/common/server/id';
import { generateSecureId } from '@openpanel/common/server';
import { sendEmail } from '@openpanel/email';
import { addDays } from 'date-fns';
import { getOrganizationAccess } from '../access';

View File

@@ -435,3 +435,54 @@ export const zEditOrganization = z.object({
name: z.string().min(2),
timezone: z.string().min(1),
});
// Maps a source project/website id (`from`) to an OpenPanel project id (`to`).
const zProjectMapper = z.object({
  from: z.string().min(1),
  to: z.string().min(1),
});
// Builds the common schema for file-based import configs; the generic keeps
// `provider` as a literal type so configs stay discriminable by provider.
const createFileImportConfig = <T extends string>(provider: T) =>
  z.object({
    provider: z.literal(provider),
    type: z.literal('file'),
    fileUrl: z.string().url(),
  });
// Import configs

// Umami: file-based, with an optional per-website project mapping.
export const zUmamiImportConfig = createFileImportConfig('umami').extend({
  projectMapper: z.array(zProjectMapper),
});
export type IUmamiImportConfig = z.infer<typeof zUmamiImportConfig>;

// Plausible: plain file-based import, no extra options.
export const zPlausibleImportConfig = createFileImportConfig('plausible');
export type IPlausibleImportConfig = z.infer<typeof zPlausibleImportConfig>;

// Mixpanel: API-based import using service-account credentials and a
// from/to range (presumably date strings — confirm expected format).
export const zMixpanelImportConfig = z.object({
  provider: z.literal('mixpanel'),
  type: z.literal('api'),
  serviceAccount: z.string().min(1),
  serviceSecret: z.string().min(1),
  projectId: z.string().min(1),
  from: z.string().min(1),
  to: z.string().min(1),
  /** Event property to treat as the screen name for screen_view mapping. */
  mapScreenViewProperty: z.string().optional(),
});
export type IMixpanelImportConfig = z.infer<typeof zMixpanelImportConfig>;

/** Union of every supported import configuration. */
export type IImportConfig =
  | IUmamiImportConfig
  | IPlausibleImportConfig
  | IMixpanelImportConfig;
/**
 * Input schema for creating an import. The top-level `provider` must agree
 * with `config.provider`; previously the two could silently disagree (e.g.
 * provider 'umami' with a mixpanel config), so the refinement rejects that.
 */
export const zCreateImport = z
  .object({
    projectId: z.string().min(1),
    provider: z.enum(['umami', 'plausible', 'mixpanel']),
    config: z.union([
      zUmamiImportConfig,
      zPlausibleImportConfig,
      zMixpanelImportConfig,
    ]),
  })
  .refine((value) => value.config.provider === value.provider, {
    message: 'config.provider must match the top-level provider',
    path: ['config', 'provider'],
  });
export type ICreateImport = z.infer<typeof zCreateImport>;