feat: new importer (#214)
This commit is contained in:
committed by
GitHub
parent
b51bc8f3f6
commit
212254d31a
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"name": "@openpanel/cli",
|
||||
"version": "0.0.1-beta",
|
||||
"type": "module",
|
||||
"module": "index.ts",
|
||||
"bin": {
|
||||
"openpanel": "dist/bin/cli.js"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "rm -rf dist && tsup",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"@openpanel/common": "workspace:*",
|
||||
"arg": "^5.0.2",
|
||||
"glob": "^10.4.3",
|
||||
"inquirer": "^9.3.5",
|
||||
"p-limit": "^6.1.0",
|
||||
"progress": "^2.0.3",
|
||||
"ramda": "^0.29.1",
|
||||
"zod": "catalog:"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@openpanel/db": "workspace:^",
|
||||
"@openpanel/sdk": "workspace:*",
|
||||
"@openpanel/tsconfig": "workspace:*",
|
||||
"@types/node": "catalog:",
|
||||
"@types/progress": "^2.0.7",
|
||||
"@types/ramda": "^0.30.1",
|
||||
"tsup": "^7.2.0",
|
||||
"typescript": "catalog:"
|
||||
}
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
import arg from 'arg';
|
||||
|
||||
import importer from './importer';
|
||||
|
||||
function cli() {
|
||||
const args = arg(
|
||||
{
|
||||
'--help': Boolean,
|
||||
},
|
||||
{
|
||||
permissive: true,
|
||||
},
|
||||
);
|
||||
|
||||
const [command] = args._;
|
||||
|
||||
switch (command) {
|
||||
case 'import': {
|
||||
return importer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cli();
|
||||
@@ -1,467 +0,0 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import fs from 'node:fs';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import readline from 'node:readline';
|
||||
import zlib from 'node:zlib';
|
||||
import Progress from 'progress';
|
||||
import { assocPath, prop, uniqBy } from 'ramda';
|
||||
|
||||
import { isSameDomain, parsePath } from '@openpanel/common';
|
||||
import type { IImportedEvent } from '@openpanel/db';
|
||||
|
||||
const BATCH_SIZE = 30_000;
|
||||
const SLEEP_TIME = 20;
|
||||
const MAX_CONCURRENT_REQUESTS = 8;
|
||||
|
||||
type IMixpanelEvent = {
|
||||
event: string;
|
||||
properties: {
|
||||
[key: string]: unknown;
|
||||
time: number;
|
||||
$current_url?: string;
|
||||
distinct_id?: string;
|
||||
$device_id?: string;
|
||||
country_code?: string;
|
||||
$region?: string;
|
||||
$city?: string;
|
||||
$os?: string;
|
||||
$browser?: string;
|
||||
$browser_version?: string;
|
||||
$initial_referrer?: string;
|
||||
$search_engine?: string;
|
||||
};
|
||||
};
|
||||
|
||||
function stripMixpanelProperties(obj: Record<string, unknown>) {
|
||||
return Object.fromEntries(
|
||||
Object.entries(obj).filter(
|
||||
([key]) =>
|
||||
!key.match(/^(\$|mp_)/) && !['time', 'distinct_id'].includes(key),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Streams a file line-by-line and yields each top-level JSON value found.
 *
 * Object boundaries are detected by counting `{` / `}` on each line: when
 * the running count returns to zero the accumulated buffer is parsed and
 * yielded. Unparseable buffers are logged and skipped (best-effort import).
 *
 * NOTE(review): the brace counter matches every `{`/`}` on the line,
 * including braces inside JSON string values — a string value containing
 * an unbalanced brace would desync the counter and send the buffer down
 * the warning path. Assumes Mixpanel export data does not contain such
 * values — TODO confirm.
 */
async function* parseJsonStream(
  fileStream: fs.ReadStream,
): AsyncGenerator<any, void, unknown> {
  const rl = readline.createInterface({
    input: fileStream,
    // Treat \r\n as a single line break.
    crlfDelay: Number.POSITIVE_INFINITY,
  });

  let buffer = '';
  let bracketCount = 0;

  for await (const line of rl) {
    buffer += line;
    // Net brace depth change contributed by this line.
    bracketCount +=
      (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;

    // Depth back at zero: the buffer should now hold one complete value.
    if (bracketCount === 0 && buffer.trim()) {
      try {
        const json = JSON.parse(buffer);
        yield json;
      } catch (error) {
        // Skip malformed chunks instead of aborting the whole import.
        console.log('Warning: Failed to parse JSON');
        console.log('Buffer:', buffer);
      }
      buffer = '';
    }
  }

  // Flush whatever is left after the last line (file may lack a trailing
  // newline, or the counter may have desynced — see NOTE above).
  if (buffer.trim()) {
    try {
      const json = JSON.parse(buffer);
      yield json;
    } catch (error) {
      console.log('Warning: Failed to parse remaining JSON');
      console.log('Buffer:', buffer);
    }
  }
}
|
||||
|
||||
interface Session {
|
||||
start: number;
|
||||
end: number;
|
||||
profileId?: string;
|
||||
deviceId?: string;
|
||||
sessionId: string;
|
||||
firstEvent?: IImportedEvent;
|
||||
lastEvent?: IImportedEvent;
|
||||
events: IImportedEvent[];
|
||||
}
|
||||
|
||||
/**
 * Groups imported events into sessions using a 30-minute inactivity window.
 *
 * Sessions are tracked independently per device_id and per profile_id
 * (skipping the case where they are equal, so an event is not counted
 * twice). When one event belongs to both an open device-session and an
 * open profile-session, the two are merged into a single unified session.
 *
 * Events are sorted by created_at in place before processing — note this
 * mutates the caller's array order.
 */
function generateSessionEvents(events: IImportedEvent[]): Session[] {
  let sessionList: Session[] = [];
  // Most recent (possibly still open) session per device / per profile.
  const lastSessionByDevice: Record<string, Session> = {};
  const lastSessionByProfile: Record<string, Session> = {};
  // Inactivity gap that closes a session.
  const thirtyMinutes = 30 * 60 * 1000;

  events.sort(
    (a, b) =>
      new Date(a.created_at).getTime() - new Date(b.created_at).getTime(),
  );

  for (const event of events) {
    const eventTime = new Date(event.created_at).getTime();
    let deviceSession = event.device_id
      ? lastSessionByDevice[event.device_id]
      : undefined;
    let profileSession = event.profile_id
      ? lastSessionByProfile[event.profile_id]
      : undefined;

    // Start a new device-session when there is none, or the gap since the
    // last event on this device exceeds the window.
    if (
      event.device_id &&
      event.device_id !== event.profile_id &&
      (!deviceSession || eventTime > deviceSession.end + thirtyMinutes)
    ) {
      deviceSession = {
        start: eventTime,
        end: eventTime,
        deviceId: event.device_id,
        sessionId: randomUUID(),
        firstEvent: event,
        events: [event],
      };
      lastSessionByDevice[event.device_id] = deviceSession;
      sessionList.push(deviceSession);
    } else if (deviceSession) {
      // Extend the open device-session with this event.
      deviceSession.end = eventTime;
      deviceSession.lastEvent = event;
      deviceSession.events.push(event);
    }

    // Same logic keyed by profile_id.
    if (
      event.profile_id &&
      event.device_id !== event.profile_id &&
      (!profileSession || eventTime > profileSession.end + thirtyMinutes)
    ) {
      profileSession = {
        start: eventTime,
        end: eventTime,
        profileId: event.profile_id,
        sessionId: randomUUID(),
        firstEvent: event,
        events: [event],
      };
      lastSessionByProfile[event.profile_id] = profileSession;
      sessionList.push(profileSession);
    } else if (profileSession) {
      profileSession.end = eventTime;
      profileSession.lastEvent = event;
      profileSession.events.push(event);
    }

    // The event links a device-session and a profile-session that are
    // distinct objects: merge them into one unified session.
    if (
      deviceSession &&
      profileSession &&
      deviceSession.sessionId !== profileSession.sessionId
    ) {
      const unifiedSession = {
        // Spread order: profile-session fields win over device-session
        // fields for anything both define (e.g. firstEvent/lastEvent).
        ...deviceSession,
        ...profileSession,
        events: [...deviceSession.events, ...profileSession.events],
        start: Math.min(deviceSession.start, profileSession.start),
        end: Math.max(deviceSession.end, profileSession.end),
        // Keep the device-session's id as the canonical session id.
        sessionId: deviceSession.sessionId,
      };
      lastSessionByDevice[event.device_id] = unifiedSession;
      lastSessionByProfile[event.profile_id] = unifiedSession;
      // Replace the two source sessions with the merged one.
      sessionList = sessionList.filter(
        (session) =>
          session.sessionId !== deviceSession?.sessionId &&
          session.sessionId !== profileSession?.sessionId,
      );
      sessionList.push(unifiedSession);
    }
  }

  return sessionList;
}
|
||||
|
||||
/**
 * Maps one raw Mixpanel export record onto OpenPanel's import-event shape.
 *
 * Fields Mixpanel does not provide (session_id, project_id, sdk_*, …) are
 * filled with empty defaults — presumably completed later in the import
 * pipeline; verify against the server-side handler.
 */
function createEventObject(event: IMixpanelEvent): IImportedEvent {
  // Normalize the referrer: '$direct' (Mixpanel's direct-traffic marker)
  // and same-site navigation both count as "no referrer".
  const getReferrer = (referrer: string | undefined) => {
    if (!referrer) {
      return '';
    }

    if (referrer === '$direct') {
      return '';
    }

    if (isSameDomain(referrer, event.properties.$current_url)) {
      return '';
    }

    return referrer;
  };
  const url = parsePath(event.properties.$current_url);
  return {
    // Mixpanel prefixes anonymous ids with '$device:'; strip it so the
    // profile id matches the bare device id.
    profile_id: event.properties.distinct_id
      ? String(event.properties.distinct_id).replace(/^\$device:/, '')
      : (event.properties.$device_id ?? ''),
    name: event.event,
    // Mixpanel's `time` is unix seconds; convert to ISO-8601.
    created_at: new Date(event.properties.time * 1000).toISOString(),
    properties: {
      ...stripMixpanelProperties(event.properties),
      // Preserve query string and hash separately when a URL is present.
      ...(event.properties.$current_url
        ? {
            __query: url.query,
            __hash: url.hash,
          }
        : {}),
    },
    country: event.properties.country_code ?? '',
    region: event.properties.$region ?? '',
    city: event.properties.$city ?? '',
    os: event.properties.$os ?? '',
    browser: event.properties.$browser ?? '',
    browser_version: event.properties.$browser_version
      ? String(event.properties.$browser_version)
      : '',
    referrer: getReferrer(event.properties.$initial_referrer),
    referrer_type: event.properties.$search_engine ? 'search' : '',
    referrer_name: event.properties.$search_engine ?? '',
    device_id: event.properties.$device_id ?? '',
    session_id: '',
    project_id: '',
    path: url.path,
    origin: url.origin,
    os_version: '',
    model: '',
    longitude: null,
    latitude: null,
    id: randomUUID(),
    duration: 0,
    // No $current_url → not a browser event; assume server-side.
    device: event.properties.$current_url ? '' : 'server',
    brand: '',
    sdk_name: '',
    sdk_version: '',
  };
}
|
||||
|
||||
function isMixpanelEvent(event: any): event is IMixpanelEvent {
|
||||
return (
|
||||
typeof event === 'object' &&
|
||||
event !== null &&
|
||||
typeof event?.event === 'string' &&
|
||||
typeof event?.properties === 'object' &&
|
||||
event?.properties !== null &&
|
||||
typeof event?.properties.time === 'number'
|
||||
);
|
||||
}
|
||||
|
||||
async function processFile(file: string): Promise<IImportedEvent[]> {
|
||||
const fileStream = fs.createReadStream(file);
|
||||
const events: IImportedEvent[] = [];
|
||||
for await (const event of parseJsonStream(fileStream)) {
|
||||
if (Array.isArray(event)) {
|
||||
for (const item of event) {
|
||||
if (isMixpanelEvent(item)) {
|
||||
events.push(createEventObject(item));
|
||||
} else {
|
||||
console.log('Not a Mixpanel event', item);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (isMixpanelEvent(event)) {
|
||||
events.push(createEventObject(event));
|
||||
} else {
|
||||
console.log('Not a Mixpanel event', event);
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
/**
 * Turns a flat list of imported events into session-annotated events:
 * for each derived session, emits a synthetic `session_start` one second
 * before its first event and a `session_end` one second after its last,
 * and stamps every member event with the session id. Events with neither
 * profile_id nor device_id belong to no session and are appended as-is
 * so they are not dropped.
 */
function processEvents(events: IImportedEvent[]): IImportedEvent[] {
  const sessions = generateSessionEvents(events);
  const processedEvents = sessions.flatMap((session) =>
    [
      // Synthetic start marker, backdated 1s so it sorts first.
      session.firstEvent && {
        ...session.firstEvent,
        id: randomUUID(),
        created_at: new Date(
          new Date(session.firstEvent.created_at).getTime() - 1000,
        ).toISOString(),
        session_id: session.sessionId,
        name: 'session_start',
      },
      // Member events with session_id applied; merged sessions can hold
      // the same event twice, so dedupe by event id.
      ...uniqBy(
        prop('id'),
        session.events.map((event) =>
          assocPath(['session_id'], session.sessionId, event),
        ),
      ),
      // Synthetic end marker, postdated 1s so it sorts last.
      session.lastEvent && {
        ...session.lastEvent,
        id: randomUUID(),
        created_at: new Date(
          new Date(session.lastEvent.created_at).getTime() + 1000,
        ).toISOString(),
        session_id: session.sessionId,
        name: 'session_end',
      },
    ].filter((item): item is IImportedEvent => !!item),
  );

  return [
    ...processedEvents,
    // Sessionless events (no profile and no device id) pass through.
    ...events.filter((event) => {
      return !event.profile_id && !event.device_id;
    }),
  ];
}
|
||||
|
||||
async function sendBatchToAPI(
|
||||
batch: IImportedEvent[],
|
||||
{
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
}: {
|
||||
apiUrl: string;
|
||||
clientId: string;
|
||||
clientSecret: string;
|
||||
},
|
||||
) {
|
||||
async function request() {
|
||||
const res = await fetch(`${apiUrl}/import/events`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Encoding': 'gzip',
|
||||
'Content-Type': 'application/json',
|
||||
'openpanel-client-id': clientId,
|
||||
'openpanel-client-secret': clientSecret,
|
||||
},
|
||||
body: Buffer.from(zlib.gzipSync(JSON.stringify(batch))),
|
||||
});
|
||||
if (!res.ok) {
|
||||
throw new Error(`Failed to send batch: ${await res.text()}`);
|
||||
}
|
||||
await new Promise((resolve) => setTimeout(resolve, SLEEP_TIME));
|
||||
}
|
||||
|
||||
try {
|
||||
await request();
|
||||
} catch (e) {
|
||||
console.log('Error sending batch, retrying...');
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000));
|
||||
try {
|
||||
await request();
|
||||
} catch (e) {
|
||||
console.log('Error sending batch, skipping...');
|
||||
fs.writeFileSync(
|
||||
path.join(
|
||||
os.tmpdir(),
|
||||
`openpanel/failed-import-batch-${batch[0]?.created_at ? new Date(batch[0]?.created_at).toISOString() : Date.now()}.json`,
|
||||
),
|
||||
JSON.stringify(batch, null, 2),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function processFiles({
|
||||
files,
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
}: {
|
||||
files: string[];
|
||||
apiUrl: string;
|
||||
clientId: string;
|
||||
clientSecret: string;
|
||||
}) {
|
||||
const progress = new Progress(
|
||||
'Processing (:current/:total) :file [:bar] :percent | :savedEvents saved events | :status',
|
||||
{
|
||||
total: files.length,
|
||||
width: 20,
|
||||
},
|
||||
);
|
||||
let savedEvents = 0;
|
||||
let currentBatch: IImportedEvent[] = [];
|
||||
let apiBatching = [];
|
||||
|
||||
for (const file of files) {
|
||||
progress.tick({
|
||||
file,
|
||||
savedEvents,
|
||||
status: 'reading file',
|
||||
});
|
||||
const events = await processFile(file);
|
||||
progress.render({
|
||||
file,
|
||||
savedEvents,
|
||||
status: 'processing events',
|
||||
});
|
||||
const processedEvents = processEvents(events);
|
||||
for (const event of processedEvents) {
|
||||
currentBatch.push(event);
|
||||
if (currentBatch.length >= BATCH_SIZE) {
|
||||
apiBatching.push(currentBatch);
|
||||
savedEvents += currentBatch.length;
|
||||
progress.render({ file, savedEvents, status: 'saving events' });
|
||||
currentBatch = [];
|
||||
}
|
||||
|
||||
if (apiBatching.length >= MAX_CONCURRENT_REQUESTS) {
|
||||
await Promise.all(
|
||||
apiBatching.map((batch) =>
|
||||
sendBatchToAPI(batch, {
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
}),
|
||||
),
|
||||
);
|
||||
apiBatching = [];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (currentBatch.length > 0) {
|
||||
await sendBatchToAPI(currentBatch, {
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
});
|
||||
savedEvents += currentBatch.length;
|
||||
progress.render({ file: 'Complete', savedEvents, status: 'Complete' });
|
||||
}
|
||||
}
|
||||
|
||||
export async function importFiles({
|
||||
files,
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
}: {
|
||||
files: string[];
|
||||
apiUrl: string;
|
||||
clientId: string;
|
||||
clientSecret: string;
|
||||
}) {
|
||||
if (files.length === 0) {
|
||||
console.log('No files found');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`Found ${files.length} files to process`);
|
||||
|
||||
const startTime = Date.now();
|
||||
await processFiles({
|
||||
files,
|
||||
apiUrl,
|
||||
clientId,
|
||||
clientSecret,
|
||||
});
|
||||
const endTime = Date.now();
|
||||
|
||||
console.log(
|
||||
`\nProcessing completed in ${(endTime - startTime) / 1000} seconds`,
|
||||
);
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
import path from 'node:path';
|
||||
import arg from 'arg';
|
||||
import { glob } from 'glob';
|
||||
|
||||
import { importFiles } from './importer';
|
||||
|
||||
export default async function importer() {
|
||||
const args = arg(
|
||||
{
|
||||
'--glob': String,
|
||||
'--api-url': String,
|
||||
'--client-id': String,
|
||||
'--client-secret': String,
|
||||
'--dry-run': Boolean,
|
||||
'--from': Number,
|
||||
'--to': Number,
|
||||
},
|
||||
{
|
||||
permissive: true,
|
||||
},
|
||||
);
|
||||
|
||||
if (!args['--glob']) {
|
||||
throw new Error('Missing --glob argument');
|
||||
}
|
||||
|
||||
if (!args['--client-id']) {
|
||||
throw new Error('Missing --client-id argument');
|
||||
}
|
||||
|
||||
if (!args['--client-secret']) {
|
||||
throw new Error('Missing --client-secret argument');
|
||||
}
|
||||
|
||||
const cwd = process.cwd();
|
||||
|
||||
const fileMatcher = path.resolve(cwd, args['--glob']);
|
||||
const allFiles = await glob([fileMatcher], { root: '/' });
|
||||
allFiles.sort((a, b) => a.localeCompare(b));
|
||||
|
||||
const files = allFiles.slice(
|
||||
args['--from'] ?? 0,
|
||||
args['--to'] ?? Number.MAX_SAFE_INTEGER,
|
||||
);
|
||||
|
||||
if (args['--dry-run']) {
|
||||
files.forEach((file, index) => {
|
||||
console.log(`Would import (index: ${index}): ${file}`);
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
return importFiles({
|
||||
files,
|
||||
clientId: args['--client-id'],
|
||||
clientSecret: args['--client-secret'],
|
||||
apiUrl: args['--api-url'] ?? 'https://api.openpanel.dev',
|
||||
});
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"extends": "@openpanel/tsconfig/base.json",
|
||||
"compilerOptions": {
|
||||
"incremental": false,
|
||||
"outDir": "dist"
|
||||
},
|
||||
"exclude": ["dist"]
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
import { defineConfig } from 'tsup';
|
||||
|
||||
export default defineConfig({
|
||||
entry: ['src/cli.ts'],
|
||||
format: ['cjs', 'esm'],
|
||||
dts: true,
|
||||
splitting: false,
|
||||
sourcemap: false,
|
||||
clean: true,
|
||||
minify: true,
|
||||
});
|
||||
@@ -3,9 +3,14 @@
|
||||
"version": "0.0.1",
|
||||
"type": "module",
|
||||
"main": "index.ts",
|
||||
"exports": {
|
||||
".": "./index.ts",
|
||||
"./server": "./server/index.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "vitest",
|
||||
"typecheck": "tsc --noEmit"
|
||||
"typecheck": "tsc --noEmit",
|
||||
"gen:referrers": "jiti scripts/get-referrers.ts && biome format --write ./server/referrers/index.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@openpanel/constants": "workspace:*",
|
||||
|
||||
96
packages/common/scripts/get-referrers.ts
Normal file
96
packages/common/scripts/get-referrers.ts
Normal file
@@ -0,0 +1,96 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
// extras
|
||||
const extraReferrers = {
|
||||
'zoom.us': { type: 'social', name: 'Zoom' },
|
||||
'apple.com': { type: 'tech', name: 'Apple' },
|
||||
'adobe.com': { type: 'tech', name: 'Adobe' },
|
||||
'figma.com': { type: 'tech', name: 'Figma' },
|
||||
'wix.com': { type: 'commerce', name: 'Wix' },
|
||||
'gmail.com': { type: 'email', name: 'Gmail' },
|
||||
'notion.so': { type: 'tech', name: 'Notion' },
|
||||
'ebay.com': { type: 'commerce', name: 'eBay' },
|
||||
'github.com': { type: 'tech', name: 'GitHub' },
|
||||
'gitlab.com': { type: 'tech', name: 'GitLab' },
|
||||
'slack.com': { type: 'social', name: 'Slack' },
|
||||
'etsy.com': { type: 'commerce', name: 'Etsy' },
|
||||
'bsky.app': { type: 'social', name: 'Bluesky' },
|
||||
'twitch.tv': { type: 'content', name: 'Twitch' },
|
||||
'dropbox.com': { type: 'tech', name: 'Dropbox' },
|
||||
'outlook.com': { type: 'email', name: 'Outlook' },
|
||||
'medium.com': { type: 'content', name: 'Medium' },
|
||||
'paypal.com': { type: 'commerce', name: 'PayPal' },
|
||||
'discord.com': { type: 'social', name: 'Discord' },
|
||||
'stripe.com': { type: 'commerce', name: 'Stripe' },
|
||||
'spotify.com': { type: 'content', name: 'Spotify' },
|
||||
'netflix.com': { type: 'content', name: 'Netflix' },
|
||||
'whatsapp.com': { type: 'social', name: 'WhatsApp' },
|
||||
'shopify.com': { type: 'commerce', name: 'Shopify' },
|
||||
'microsoft.com': { type: 'tech', name: 'Microsoft' },
|
||||
'alibaba.com': { type: 'commerce', name: 'Alibaba' },
|
||||
'telegram.org': { type: 'social', name: 'Telegram' },
|
||||
'substack.com': { type: 'content', name: 'Substack' },
|
||||
'salesforce.com': { type: 'tech', name: 'Salesforce' },
|
||||
'instagram.com': { type: 'social', name: 'Instagram' },
|
||||
'wikipedia.org': { type: 'content', name: 'Wikipedia' },
|
||||
'mastodon.social': { type: 'social', name: 'Mastodon' },
|
||||
'office.com': { type: 'tech', name: 'Microsoft Office' },
|
||||
'squarespace.com': { type: 'commerce', name: 'Squarespace' },
|
||||
'stackoverflow.com': { type: 'tech', name: 'Stack Overflow' },
|
||||
'teams.microsoft.com': { type: 'social', name: 'Microsoft Teams' },
|
||||
};
|
||||
|
||||
function transform(data: any) {
|
||||
const obj: Record<string, unknown> = {};
|
||||
for (const type in data) {
|
||||
for (const name in data[type]) {
|
||||
const domains = data[type][name].domains ?? [];
|
||||
for (const domain of domains) {
|
||||
obj[domain] = {
|
||||
type,
|
||||
name,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return obj;
|
||||
}
|
||||
|
||||
/**
 * Fetches the latest snowplow referer database and regenerates the
 * referrers lookup module, merging in the local `extraReferrers` additions
 * (extras win on domain collisions since they are spread last).
 *
 * NOTE(review): the output path points at `../../worker/src/referrers/`,
 * while the readme next to the generated file describes
 * `packages/common/server/referrers/` — confirm which location is current.
 */
async function main() {
  // Get document, or throw exception on error
  try {
    const data = await fetch(
      'https://s3-eu-west-1.amazonaws.com/snowplow-hosted-assets/third-party/referer-parser/referers-latest.json',
    ).then((res) => res.json());

    fs.writeFileSync(
      path.resolve(__dirname, '../../worker/src/referrers/index.ts'),
      [
        '// This file is generated by the script get-referrers.ts',
        '',
        '// The data is fetch from snowplow-referer-parser https://github.com/snowplow-referer-parser/referer-parser',
        `// The orginal referers.yml is based on Piwik's SearchEngines.php and Socials.php, copyright 2012 Matthieu Aubry and available under the GNU General Public License v3.`,
        '',
        `const referrers: Record<string, { type: string, name: string }> = ${JSON.stringify(
          {
            ...transform(data),
            ...extraReferrers,
          },
        )} as const;`,
        'export default referrers;',
      ].join('\n'),
      'utf-8',
    );
  } catch (e) {
    // Best-effort script: log and exit quietly on network/FS failure.
    console.log(e);
  }
}

main();
|
||||
@@ -1,3 +1,5 @@
|
||||
export * from './crypto';
|
||||
export * from './profileId';
|
||||
export * from './parser-user-agent';
|
||||
export * from './parse-referrer';
|
||||
export * from './id';
|
||||
|
||||
117
packages/common/server/parse-referrer.test.ts
Normal file
117
packages/common/server/parse-referrer.test.ts
Normal file
@@ -0,0 +1,117 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { getReferrerWithQuery, parseReferrer } from './parse-referrer';
|
||||
|
||||
describe('parseReferrer', () => {
|
||||
it('should handle undefined or empty URLs', () => {
|
||||
expect(parseReferrer(undefined)).toEqual({
|
||||
name: '',
|
||||
type: '',
|
||||
url: '',
|
||||
});
|
||||
|
||||
expect(parseReferrer('')).toEqual({
|
||||
name: '',
|
||||
type: '',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse valid referrer URLs', () => {
|
||||
expect(parseReferrer('https://google.com/search?q=test')).toEqual({
|
||||
name: 'Google',
|
||||
type: 'search',
|
||||
url: 'https://google.com/search?q=test',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle www prefix in hostnames', () => {
|
||||
expect(parseReferrer('https://www.twitter.com/user')).toEqual({
|
||||
name: 'Twitter',
|
||||
type: 'social',
|
||||
url: 'https://www.twitter.com/user',
|
||||
});
|
||||
|
||||
expect(parseReferrer('https://twitter.com/user')).toEqual({
|
||||
name: 'Twitter',
|
||||
type: 'social',
|
||||
url: 'https://twitter.com/user',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle unknown referrers', () => {
|
||||
expect(parseReferrer('https://unknown-site.com')).toEqual({
|
||||
name: '',
|
||||
type: '',
|
||||
url: 'https://unknown-site.com',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle invalid URLs', () => {
|
||||
expect(parseReferrer('not-a-url')).toEqual({
|
||||
name: '',
|
||||
type: '',
|
||||
url: 'not-a-url',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('getReferrerWithQuery', () => {
|
||||
it('should handle undefined or empty query', () => {
|
||||
expect(getReferrerWithQuery(undefined)).toBeNull();
|
||||
expect(getReferrerWithQuery({})).toBeNull();
|
||||
});
|
||||
|
||||
it('should parse utm_source parameter', () => {
|
||||
expect(getReferrerWithQuery({ utm_source: 'google' })).toEqual({
|
||||
name: 'Google',
|
||||
type: 'search',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse ref parameter', () => {
|
||||
expect(getReferrerWithQuery({ ref: 'facebook' })).toEqual({
|
||||
name: 'Facebook',
|
||||
type: 'social',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse utm_referrer parameter', () => {
|
||||
expect(getReferrerWithQuery({ utm_referrer: 'twitter' })).toEqual({
|
||||
name: 'Twitter',
|
||||
type: 'social',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle case-insensitive matching', () => {
|
||||
expect(getReferrerWithQuery({ utm_source: 'GoOgLe' })).toEqual({
|
||||
name: 'Google',
|
||||
type: 'search',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle unknown sources', () => {
|
||||
expect(getReferrerWithQuery({ utm_source: 'unknown-source' })).toEqual({
|
||||
name: 'unknown-source',
|
||||
type: '',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
|
||||
it('should prioritize utm_source over ref and utm_referrer', () => {
|
||||
expect(
|
||||
getReferrerWithQuery({
|
||||
utm_source: 'google',
|
||||
ref: 'facebook',
|
||||
utm_referrer: 'twitter',
|
||||
}),
|
||||
).toEqual({
|
||||
name: 'Google',
|
||||
type: 'search',
|
||||
url: '',
|
||||
});
|
||||
});
|
||||
});
|
||||
66
packages/common/server/parse-referrer.ts
Normal file
66
packages/common/server/parse-referrer.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import { stripTrailingSlash } from '../src/string';
|
||||
|
||||
import referrers from './referrers';
|
||||
|
||||
function getHostname(url: string | undefined) {
|
||||
if (!url) {
|
||||
return '';
|
||||
}
|
||||
|
||||
try {
|
||||
return new URL(url).hostname;
|
||||
} catch (e) {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
export function parseReferrer(url: string | undefined) {
|
||||
const hostname = getHostname(url);
|
||||
const match = referrers[hostname] ?? referrers[hostname.replace('www.', '')];
|
||||
|
||||
return {
|
||||
name: match?.name ?? '',
|
||||
type: match?.type ?? '',
|
||||
url: stripTrailingSlash(url ?? ''),
|
||||
};
|
||||
}
|
||||
|
||||
export function getReferrerWithQuery(
|
||||
query: Record<string, string> | undefined,
|
||||
) {
|
||||
if (!query) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const source = (
|
||||
query.utm_source ??
|
||||
query.ref ??
|
||||
query.utm_referrer ??
|
||||
''
|
||||
).toLowerCase();
|
||||
|
||||
if (source === '') {
|
||||
return null;
|
||||
}
|
||||
|
||||
const match =
|
||||
referrers[source] ||
|
||||
referrers[`${source}.com`] ||
|
||||
Object.values(referrers).find(
|
||||
(referrer) => referrer.name.toLowerCase() === source,
|
||||
);
|
||||
|
||||
if (match) {
|
||||
return {
|
||||
name: match.name,
|
||||
type: match.type,
|
||||
url: '',
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
name: source,
|
||||
type: '',
|
||||
url: '',
|
||||
};
|
||||
}
|
||||
@@ -68,6 +68,7 @@ const parse = (ua: string): UAParser.IResult => {
|
||||
return res;
|
||||
};
|
||||
|
||||
export type UserAgentInfo = ReturnType<typeof parseUserAgent>;
|
||||
export function parseUserAgent(
|
||||
ua?: string | null,
|
||||
overrides?: Record<string, unknown>,
|
||||
@@ -80,13 +81,35 @@ export function parseUserAgent(
|
||||
}
|
||||
|
||||
return {
|
||||
os: overrides?.__os || res.os.name,
|
||||
osVersion: overrides?.__osVersion || res.os.version,
|
||||
browser: overrides?.__browser || res.browser.name,
|
||||
browserVersion: overrides?.__browserVersion || res.browser.version,
|
||||
device: overrides?.__device || res.device.type || getDevice(ua),
|
||||
brand: overrides?.__brand || res.device.vendor,
|
||||
model: overrides?.__model || res.device.model,
|
||||
os:
|
||||
typeof overrides?.__os === 'string' && overrides?.__os
|
||||
? overrides?.__os
|
||||
: res.os.name,
|
||||
osVersion:
|
||||
typeof overrides?.__osVersion === 'string' && overrides?.__osVersion
|
||||
? overrides?.__osVersion
|
||||
: res.os.version,
|
||||
browser:
|
||||
typeof overrides?.__browser === 'string' && overrides?.__browser
|
||||
? overrides?.__browser
|
||||
: res.browser.name,
|
||||
browserVersion:
|
||||
typeof overrides?.__browserVersion === 'string' &&
|
||||
overrides?.__browserVersion
|
||||
? overrides?.__browserVersion
|
||||
: res.browser.version,
|
||||
device:
|
||||
typeof overrides?.__device === 'string' && overrides?.__device
|
||||
? overrides?.__device
|
||||
: res.device.type || getDevice(ua),
|
||||
brand:
|
||||
typeof overrides?.__brand === 'string' && overrides?.__brand
|
||||
? overrides?.__brand
|
||||
: res.device.vendor,
|
||||
model:
|
||||
typeof overrides?.__model === 'string' && overrides?.__model
|
||||
? overrides?.__model
|
||||
: res.device.model,
|
||||
isServer: false,
|
||||
} as const;
|
||||
}
|
||||
|
||||
2785
packages/common/server/referrers/index.ts
Normal file
2785
packages/common/server/referrers/index.ts
Normal file
File diff suppressed because it is too large
Load Diff
5
packages/common/server/referrers/referrers.readme.md
Normal file
5
packages/common/server/referrers/referrers.readme.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Snowplow Referer Parser
|
||||
|
||||
The file index.ts in this dir is generated from Snowplow's referer database [Snowplow Referer Parser](https://github.com/snowplow-referer-parser/referer-parser).
|
||||
|
||||
The original [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml) is based on Piwik's SearchEngines.php and Socials.php, copyright 2012 Matthieu Aubry and available under the GNU General Public License v3.
|
||||
48
packages/common/src/object.test.ts
Normal file
48
packages/common/src/object.test.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { toDots } from './object';
|
||||
|
||||
describe('toDots', () => {
|
||||
it('should convert an object to a dot object', () => {
|
||||
const obj = {
|
||||
a: 1,
|
||||
b: 2,
|
||||
array: ['1', '2', '3'],
|
||||
arrayWithObjects: [{ a: 1 }, { b: 2 }, { c: 3 }],
|
||||
objectWithArrays: { a: [1, 2, 3] },
|
||||
null: null,
|
||||
undefined: undefined,
|
||||
empty: '',
|
||||
jsonString: '{"a": 1, "b": 2}',
|
||||
};
|
||||
expect(toDots(obj)).toEqual({
|
||||
a: '1',
|
||||
b: '2',
|
||||
'array.0': '1',
|
||||
'array.1': '2',
|
||||
'array.2': '3',
|
||||
'arrayWithObjects.0.a': '1',
|
||||
'arrayWithObjects.1.b': '2',
|
||||
'arrayWithObjects.2.c': '3',
|
||||
'objectWithArrays.a.0': '1',
|
||||
'objectWithArrays.a.1': '2',
|
||||
'objectWithArrays.a.2': '3',
|
||||
'jsonString.a': '1',
|
||||
'jsonString.b': '2',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle malformed JSON strings gracefully', () => {
|
||||
const obj = {
|
||||
validJson: '{"key":"value"}',
|
||||
malformedJson: '{"key":"unterminated string',
|
||||
startsWithBrace: '{not json at all',
|
||||
startsWithBracket: '[also not json',
|
||||
regularString: 'normal string',
|
||||
};
|
||||
|
||||
expect(toDots(obj)).toEqual({
|
||||
'validJson.key': 'value',
|
||||
regularString: 'normal string',
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1,5 +1,18 @@
|
||||
import { anyPass, assocPath, isEmpty, isNil, reject } from 'ramda';
|
||||
|
||||
function isValidJsonString(value: string): boolean {
|
||||
return (
|
||||
(value.startsWith('{') && value.endsWith('}')) ||
|
||||
(value.startsWith('[') && value.endsWith(']'))
|
||||
);
|
||||
}
|
||||
function isMalformedJsonString(value: string): boolean {
|
||||
return (
|
||||
(value.startsWith('{') && !value.endsWith('}')) ||
|
||||
(value.startsWith('[') && !value.endsWith(']'))
|
||||
);
|
||||
}
|
||||
|
||||
export function toDots(
|
||||
obj: Record<string, unknown>,
|
||||
path = '',
|
||||
@@ -19,10 +32,28 @@ export function toDots(
|
||||
};
|
||||
}
|
||||
|
||||
if (value === undefined || value === null) {
|
||||
if (value === undefined || value === null || value === '') {
|
||||
return acc;
|
||||
}
|
||||
|
||||
if (typeof value === 'string' && isMalformedJsonString(value)) {
|
||||
// Skip it
|
||||
return acc;
|
||||
}
|
||||
|
||||
// Fix nested json strings - but catch parse errors for malformed JSON
|
||||
if (typeof value === 'string' && isValidJsonString(value)) {
|
||||
try {
|
||||
return {
|
||||
...acc,
|
||||
...toDots(JSON.parse(value), `${path}${key}.`),
|
||||
};
|
||||
} catch {
|
||||
// Skip it
|
||||
return acc;
|
||||
}
|
||||
}
|
||||
|
||||
const cleanedValue =
|
||||
typeof value === 'string'
|
||||
? removeInvalidSurrogates(value).trim()
|
||||
|
||||
47
packages/db/code-migrations/5-add-imports-table.sql
Normal file
47
packages/db/code-migrations/5-add-imports-table.sql
Normal file
@@ -0,0 +1,47 @@
|
||||
CREATE TABLE IF NOT EXISTS events_imports_replicated ON CLUSTER '{cluster}' (
|
||||
`id` UUID DEFAULT generateUUIDv4(),
|
||||
`name` LowCardinality(String),
|
||||
`sdk_name` LowCardinality(String),
|
||||
`sdk_version` LowCardinality(String),
|
||||
`device_id` String CODEC(ZSTD(3)),
|
||||
`profile_id` String CODEC(ZSTD(3)),
|
||||
`project_id` String CODEC(ZSTD(3)),
|
||||
`session_id` String CODEC(LZ4),
|
||||
`path` String CODEC(ZSTD(3)),
|
||||
`origin` String CODEC(ZSTD(3)),
|
||||
`referrer` String CODEC(ZSTD(3)),
|
||||
`referrer_name` String CODEC(ZSTD(3)),
|
||||
`referrer_type` LowCardinality(String),
|
||||
`duration` UInt64 CODEC(Delta(4), LZ4),
|
||||
`properties` Map(String, String) CODEC(ZSTD(3)),
|
||||
`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3)),
|
||||
`country` LowCardinality(FixedString(2)),
|
||||
`city` String,
|
||||
`region` LowCardinality(String),
|
||||
`longitude` Nullable(Float32) CODEC(Gorilla, LZ4),
|
||||
`latitude` Nullable(Float32) CODEC(Gorilla, LZ4),
|
||||
`os` LowCardinality(String),
|
||||
`os_version` LowCardinality(String),
|
||||
`browser` LowCardinality(String),
|
||||
`browser_version` LowCardinality(String),
|
||||
`device` LowCardinality(String),
|
||||
`brand` LowCardinality(String),
|
||||
`model` LowCardinality(String),
|
||||
`imported_at` Nullable(DateTime) CODEC(Delta(4), LZ4),
|
||||
`import_id` String CODEC(ZSTD(3)),
|
||||
`import_status` LowCardinality(String) DEFAULT 'pending',
|
||||
`imported_at_meta` DateTime DEFAULT now()
|
||||
)
|
||||
ENGINE = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/openpanel/v1/{table}', '{replica}')
|
||||
PARTITION BY toYYYYMM(imported_at_meta)
|
||||
ORDER BY (import_id, created_at)
|
||||
SETTINGS index_granularity = 8192;
|
||||
|
||||
---
|
||||
|
||||
CREATE TABLE IF NOT EXISTS events_imports ON CLUSTER '{cluster}' AS events_imports_replicated
|
||||
ENGINE = Distributed('{cluster}', currentDatabase(), events_imports_replicated, cityHash64(import_id));
|
||||
|
||||
---
|
||||
|
||||
ALTER TABLE events_imports_replicated ON CLUSTER '{cluster}' MODIFY TTL imported_at_meta + INTERVAL 7 DAY;
|
||||
90
packages/db/code-migrations/5-add-imports-table.ts
Normal file
90
packages/db/code-migrations/5-add-imports-table.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { TABLE_NAMES } from '../src/clickhouse/client';
|
||||
import {
|
||||
createTable,
|
||||
modifyTTL,
|
||||
runClickhouseMigrationCommands,
|
||||
} from '../src/clickhouse/migration';
|
||||
import { getIsCluster } from './helpers';
|
||||
|
||||
export async function up() {
|
||||
const isClustered = getIsCluster();
|
||||
|
||||
const sqls: string[] = [
|
||||
...createTable({
|
||||
name: 'events_imports',
|
||||
columns: [
|
||||
// Same columns as events table
|
||||
'`id` UUID DEFAULT generateUUIDv4()',
|
||||
'`name` LowCardinality(String)',
|
||||
'`sdk_name` LowCardinality(String)',
|
||||
'`sdk_version` LowCardinality(String)',
|
||||
'`device_id` String CODEC(ZSTD(3))',
|
||||
'`profile_id` String CODEC(ZSTD(3))',
|
||||
'`project_id` String CODEC(ZSTD(3))',
|
||||
'`session_id` String CODEC(LZ4)',
|
||||
'`path` String CODEC(ZSTD(3))',
|
||||
'`origin` String CODEC(ZSTD(3))',
|
||||
'`referrer` String CODEC(ZSTD(3))',
|
||||
'`referrer_name` String CODEC(ZSTD(3))',
|
||||
'`referrer_type` LowCardinality(String)',
|
||||
'`duration` UInt64 CODEC(Delta(4), LZ4)',
|
||||
'`properties` Map(String, String) CODEC(ZSTD(3))',
|
||||
'`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3))',
|
||||
'`country` LowCardinality(FixedString(2))',
|
||||
'`city` String',
|
||||
'`region` LowCardinality(String)',
|
||||
'`longitude` Nullable(Float32) CODEC(Gorilla, LZ4)',
|
||||
'`latitude` Nullable(Float32) CODEC(Gorilla, LZ4)',
|
||||
'`os` LowCardinality(String)',
|
||||
'`os_version` LowCardinality(String)',
|
||||
'`browser` LowCardinality(String)',
|
||||
'`browser_version` LowCardinality(String)',
|
||||
'`device` LowCardinality(String)',
|
||||
'`brand` LowCardinality(String)',
|
||||
'`model` LowCardinality(String)',
|
||||
'`imported_at` Nullable(DateTime) CODEC(Delta(4), LZ4)',
|
||||
|
||||
// Additional metadata columns for import tracking
|
||||
'`import_id` String CODEC(ZSTD(3))',
|
||||
"`import_status` LowCardinality(String) DEFAULT 'pending'",
|
||||
'`imported_at_meta` DateTime DEFAULT now()',
|
||||
],
|
||||
orderBy: ['import_id', 'created_at'],
|
||||
partitionBy: 'toYYYYMM(imported_at_meta)',
|
||||
settings: {
|
||||
index_granularity: 8192,
|
||||
},
|
||||
distributionHash: 'cityHash64(import_id)',
|
||||
replicatedVersion: '1',
|
||||
isClustered,
|
||||
}),
|
||||
];
|
||||
|
||||
// Add TTL policy for auto-cleanup after 7 days
|
||||
sqls.push(
|
||||
modifyTTL({
|
||||
tableName: 'events_imports',
|
||||
isClustered,
|
||||
ttl: 'imported_at_meta + INTERVAL 7 DAY',
|
||||
}),
|
||||
);
|
||||
|
||||
fs.writeFileSync(
|
||||
path.join(__filename.replace('.ts', '.sql')),
|
||||
sqls
|
||||
.map((sql) =>
|
||||
sql
|
||||
.trim()
|
||||
.replace(/;$/, '')
|
||||
.replace(/\n{2,}/g, '\n')
|
||||
.concat(';'),
|
||||
)
|
||||
.join('\n\n---\n\n'),
|
||||
);
|
||||
|
||||
if (!process.argv.includes('--dry')) {
|
||||
await runClickhouseMigrationCommands(sqls);
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
export * from './src/prisma-client';
|
||||
export * from './src/clickhouse/client';
|
||||
export * from './src/clickhouse/csv';
|
||||
export * from './src/sql-builder';
|
||||
export * from './src/services/chart.service';
|
||||
export * from './src/services/clients.service';
|
||||
@@ -23,5 +24,6 @@ export * from './src/services/access.service';
|
||||
export * from './src/buffers';
|
||||
export * from './src/types';
|
||||
export * from './src/clickhouse/query-builder';
|
||||
export * from './src/services/import.service';
|
||||
export * from './src/services/overview.service';
|
||||
export * from './src/session-context';
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
"with-env": "dotenv -e ../../.env -c --"
|
||||
},
|
||||
"dependencies": {
|
||||
"@clickhouse/client": "^1.2.0",
|
||||
"@clickhouse/client": "^1.12.1",
|
||||
"@openpanel/common": "workspace:*",
|
||||
"@openpanel/constants": "workspace:*",
|
||||
"@openpanel/json": "workspace:*",
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
-- CreateTable
|
||||
CREATE TABLE "public"."imports" (
|
||||
"id" UUID NOT NULL DEFAULT gen_random_uuid(),
|
||||
"projectId" TEXT NOT NULL,
|
||||
"provider" TEXT NOT NULL,
|
||||
"sourceType" TEXT NOT NULL,
|
||||
"sourceLocation" TEXT NOT NULL,
|
||||
"jobId" TEXT,
|
||||
"status" TEXT NOT NULL,
|
||||
"config" JSONB NOT NULL DEFAULT '{}',
|
||||
"totalEvents" INTEGER NOT NULL DEFAULT 0,
|
||||
"processedEvents" INTEGER NOT NULL DEFAULT 0,
|
||||
"errorMessage" TEXT,
|
||||
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
"completedAt" TIMESTAMP(3),
|
||||
"updatedAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
CONSTRAINT "imports_pkey" PRIMARY KEY ("id")
|
||||
);
|
||||
|
||||
-- AddForeignKey
|
||||
ALTER TABLE "public"."imports" ADD CONSTRAINT "imports_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "public"."projects"("id") ON DELETE CASCADE ON UPDATE CASCADE;
|
||||
@@ -0,0 +1,13 @@
|
||||
/*
|
||||
Warnings:
|
||||
|
||||
- You are about to drop the column `provider` on the `imports` table. All the data in the column will be lost.
|
||||
- You are about to drop the column `sourceLocation` on the `imports` table. All the data in the column will be lost.
|
||||
- You are about to drop the column `sourceType` on the `imports` table. All the data in the column will be lost.
|
||||
|
||||
*/
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" DROP COLUMN "provider",
|
||||
DROP COLUMN "sourceLocation",
|
||||
DROP COLUMN "sourceType",
|
||||
ALTER COLUMN "config" DROP DEFAULT;
|
||||
@@ -0,0 +1,2 @@
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" ADD COLUMN "statusMessage" TEXT;
|
||||
@@ -0,0 +1,3 @@
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" ADD COLUMN "currentBatch" INTEGER NOT NULL DEFAULT 0,
|
||||
ADD COLUMN "currentStep" TEXT;
|
||||
@@ -0,0 +1,14 @@
|
||||
/*
|
||||
Warnings:
|
||||
|
||||
- Changed the type of `status` on the `imports` table. No cast exists, the column would be dropped and recreated, which cannot be done if there is data, since the column is required.
|
||||
- Made the column `currentStep` on table `imports` required. This step will fail if there are existing NULL values in that column.
|
||||
|
||||
*/
|
||||
-- CreateEnum
|
||||
CREATE TYPE "public"."ImportStatus" AS ENUM ('pending', 'processing', 'completed', 'failed');
|
||||
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" DROP COLUMN "status",
|
||||
ADD COLUMN "status" "public"."ImportStatus" NOT NULL,
|
||||
ALTER COLUMN "currentStep" SET NOT NULL;
|
||||
@@ -0,0 +1,2 @@
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" ALTER COLUMN "currentStep" DROP NOT NULL;
|
||||
@@ -0,0 +1,4 @@
|
||||
-- AlterTable
|
||||
ALTER TABLE "public"."imports" ALTER COLUMN "currentBatch" DROP NOT NULL,
|
||||
ALTER COLUMN "currentBatch" DROP DEFAULT,
|
||||
ALTER COLUMN "currentBatch" SET DATA TYPE TEXT;
|
||||
@@ -194,6 +194,7 @@ model Project {
|
||||
|
||||
notificationRules NotificationRule[]
|
||||
notifications Notification[]
|
||||
imports Import[]
|
||||
|
||||
// When deleteAt > now(), the project will be deleted
|
||||
deleteAt DateTime?
|
||||
@@ -467,3 +468,31 @@ model ResetPassword {
|
||||
|
||||
@@map("reset_password")
|
||||
}
|
||||
|
||||
enum ImportStatus {
|
||||
pending
|
||||
processing
|
||||
completed
|
||||
failed
|
||||
}
|
||||
|
||||
model Import {
|
||||
id String @id @default(dbgenerated("gen_random_uuid()")) @db.Uuid
|
||||
projectId String
|
||||
project Project @relation(fields: [projectId], references: [id], onDelete: Cascade)
|
||||
jobId String? // BullMQ job ID
|
||||
status ImportStatus
|
||||
statusMessage String? // Human-readable current step like "Importing events (Feb 2025)", "Generating session IDs"
|
||||
errorMessage String?
|
||||
/// [IPrismaImportConfig]
|
||||
config Json
|
||||
totalEvents Int @default(0)
|
||||
processedEvents Int @default(0)
|
||||
currentStep String?
|
||||
currentBatch String? // String date 2020-01-01
|
||||
createdAt DateTime @default(now())
|
||||
completedAt DateTime?
|
||||
updatedAt DateTime @default(now()) @updatedAt
|
||||
|
||||
@@map("imports")
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { generateSecureId } from '@openpanel/common/server/id';
|
||||
import { generateSecureId } from '@openpanel/common/server';
|
||||
import { type ILogger, createLogger } from '@openpanel/logger';
|
||||
import { getRedisCache, runEvery } from '@openpanel/redis';
|
||||
|
||||
|
||||
@@ -10,6 +10,18 @@ import {
|
||||
} from 'vitest';
|
||||
import { ch } from '../clickhouse/client';
|
||||
|
||||
const clickhouseSettings = {
|
||||
async_insert: 1,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
input_format_parallel_parsing: 1,
|
||||
max_execution_time: 300,
|
||||
max_http_get_redirects: '0',
|
||||
max_insert_block_size: '500000',
|
||||
send_progress_in_http_headers: 1,
|
||||
wait_end_of_query: 1,
|
||||
wait_for_async_insert: 1,
|
||||
};
|
||||
|
||||
// Mock transformEvent to avoid circular dependency with buffers -> services -> buffers
|
||||
vi.mock('../services/event.service', () => ({
|
||||
transformEvent: (event: any) => ({
|
||||
@@ -127,6 +139,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
duration: 1000,
|
||||
},
|
||||
],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
const sessionKey = `event_buffer:session:${first.session_id}`;
|
||||
@@ -171,6 +184,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [first, end],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
const sessionKey = `event_buffer:session:${first.session_id}`;
|
||||
const storedEvents = await redis.lrange(sessionKey, 0, -1);
|
||||
@@ -502,6 +516,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [end],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
const sessionKey = `event_buffer:session:${s}`;
|
||||
@@ -552,6 +567,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [view1, view2, view3, end],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
// Session should be completely empty and removed
|
||||
@@ -596,6 +612,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [{ ...view1, duration: 1000 }],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
// Session should be REMOVED from ready_sessions (only 1 event left)
|
||||
@@ -620,6 +637,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [{ ...view2, duration: 1000 }],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
// Session should be REMOVED again (only 1 event left)
|
||||
@@ -667,6 +685,7 @@ describe('EventBuffer with real Redis', () => {
|
||||
format: 'JSONEachRow',
|
||||
table: 'events',
|
||||
values: [view, end],
|
||||
clickhouse_settings: clickhouseSettings,
|
||||
});
|
||||
|
||||
// NOW it should be removed from ready_sessions (because it's empty)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { Readable } from 'node:stream';
|
||||
import type { ClickHouseSettings, ResponseJSON } from '@clickhouse/client';
|
||||
import { ClickHouseLogLevel, createClient } from '@clickhouse/client';
|
||||
import sqlstring from 'sqlstring';
|
||||
@@ -23,13 +24,10 @@ type WarnLogParams = LogParams & { err?: Error };
|
||||
|
||||
class CustomLogger implements Logger {
|
||||
trace({ message, args }: LogParams) {
|
||||
logger.debug(message, args);
|
||||
logger.info(message, args);
|
||||
}
|
||||
debug({ message, args }: LogParams) {
|
||||
if (message.includes('Query:') && args?.response_status === 200) {
|
||||
return;
|
||||
}
|
||||
logger.debug(message, args);
|
||||
logger.info(message, args);
|
||||
}
|
||||
info({ message, args }: LogParams) {
|
||||
logger.info(message, args);
|
||||
@@ -56,14 +54,15 @@ export const TABLE_NAMES = {
|
||||
event_property_values_mv: 'event_property_values_mv',
|
||||
cohort_events_mv: 'cohort_events_mv',
|
||||
sessions: 'sessions',
|
||||
events_imports: 'events_imports',
|
||||
};
|
||||
|
||||
export const CLICKHOUSE_OPTIONS: NodeClickHouseClientConfigOptions = {
|
||||
max_open_connections: 30,
|
||||
request_timeout: 60000,
|
||||
request_timeout: 300000,
|
||||
keep_alive: {
|
||||
enabled: true,
|
||||
idle_socket_ttl: 8000,
|
||||
idle_socket_ttl: 60000,
|
||||
},
|
||||
compression: {
|
||||
request: true,
|
||||
@@ -87,7 +86,7 @@ const cleanQuery = (query?: string) =>
|
||||
? query.replace(/\n/g, '').replace(/\s+/g, ' ').trim()
|
||||
: undefined;
|
||||
|
||||
async function withRetry<T>(
|
||||
export async function withRetry<T>(
|
||||
operation: () => Promise<T>,
|
||||
maxRetries = 3,
|
||||
baseDelay = 500,
|
||||
@@ -132,7 +131,34 @@ export const ch = new Proxy(originalCh, {
|
||||
const value = Reflect.get(target, property, receiver);
|
||||
|
||||
if (property === 'insert') {
|
||||
return (...args: any[]) => withRetry(() => value.apply(target, args));
|
||||
return (...args: any[]) =>
|
||||
withRetry(() => {
|
||||
args[0].clickhouse_settings = {
|
||||
// Allow bigger HTTP payloads/time to stream rows
|
||||
async_insert: 1,
|
||||
wait_for_async_insert: 1,
|
||||
// Increase insert timeouts and buffer sizes for large batches
|
||||
max_execution_time: 300,
|
||||
max_insert_block_size: '500000',
|
||||
max_http_get_redirects: '0',
|
||||
// Ensure JSONEachRow stays efficient
|
||||
input_format_parallel_parsing: 1,
|
||||
// Keep long-running inserts/queries from idling out at proxies by sending progress headers
|
||||
send_progress_in_http_headers: 1,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
// Ensure server holds the connection until the query is finished
|
||||
wait_end_of_query: 1,
|
||||
...args[0].clickhouse_settings,
|
||||
};
|
||||
return value.apply(target, args);
|
||||
});
|
||||
}
|
||||
|
||||
if (property === 'command') {
|
||||
return (...args: any[]) =>
|
||||
withRetry(() => {
|
||||
return value.apply(target, args);
|
||||
});
|
||||
}
|
||||
|
||||
return value;
|
||||
@@ -177,6 +203,34 @@ export async function chQueryWithMeta<T extends Record<string, any>>(
|
||||
return response;
|
||||
}
|
||||
|
||||
export async function chInsertCSV(tableName: string, rows: string[]) {
|
||||
try {
|
||||
const now = performance.now();
|
||||
// Create a readable stream in binary mode for CSV (similar to EventBuffer)
|
||||
const csvStream = Readable.from(rows.join('\n'), {
|
||||
objectMode: false,
|
||||
});
|
||||
|
||||
await ch.insert({
|
||||
table: tableName,
|
||||
values: csvStream,
|
||||
format: 'CSV',
|
||||
clickhouse_settings: {
|
||||
format_csv_allow_double_quotes: 1,
|
||||
format_csv_allow_single_quotes: 0,
|
||||
},
|
||||
});
|
||||
|
||||
logger.info('CSV Insert successful', {
|
||||
elapsed: performance.now() - now,
|
||||
rows: rows.length,
|
||||
});
|
||||
} catch (error) {
|
||||
logger.error('CSV Insert failed:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
export async function chQuery<T extends Record<string, any>>(
|
||||
query: string,
|
||||
clickhouseSettings?: ClickHouseSettings,
|
||||
|
||||
53
packages/db/src/clickhouse/csv.ts
Normal file
53
packages/db/src/clickhouse/csv.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
// ClickHouse Map(String, String) format in CSV uses single quotes, not JSON double quotes
|
||||
// Format: '{'key1':'value1','key2':'value2'}'
|
||||
// Single quotes inside values must be escaped with backslash: \'
|
||||
// We also need to escape newlines and control characters to prevent CSV parsing issues
|
||||
const escapeMapValue = (str: string) => {
|
||||
return str
|
||||
.replace(/\\/g, '\\\\') // Escape backslashes first
|
||||
.replace(/'/g, "\\'") // Escape single quotes
|
||||
.replace(/\n/g, '\\n') // Escape newlines
|
||||
.replace(/\r/g, '\\r') // Escape carriage returns
|
||||
.replace(/\t/g, '\\t') // Escape tabs
|
||||
.replace(/\0/g, '\\0'); // Escape null bytes
|
||||
};
|
||||
|
||||
export const csvEscapeJson = (
|
||||
value: Record<string, unknown> | null | undefined,
|
||||
): string => {
|
||||
if (value == null) return '';
|
||||
|
||||
// Normalize to strings if your column is Map(String,String)
|
||||
const normalized: Record<string, string> = Object.fromEntries(
|
||||
Object.entries(value).map(([k, v]) => [
|
||||
String(k),
|
||||
v == null ? '' : String(v),
|
||||
]),
|
||||
);
|
||||
|
||||
// Empty object should return empty Map (without quotes, csvEscapeField will handle if needed)
|
||||
if (Object.keys(normalized).length === 0) return '{}';
|
||||
|
||||
const pairs = Object.entries(normalized)
|
||||
.map(([k, v]) => `'${escapeMapValue(k)}':'${escapeMapValue(v)}'`)
|
||||
.join(',');
|
||||
|
||||
// Return Map format without outer quotes - csvEscapeField will handle CSV escaping
|
||||
// This allows csvEscapeField to properly wrap/escape the entire field if it contains newlines/quotes
|
||||
return csvEscapeField(`{${pairs}}`);
|
||||
};
|
||||
|
||||
// Escape a CSV field - wrap in double quotes if it contains commas, quotes, or newlines
|
||||
// Double quotes inside must be doubled (""), per CSV standard
|
||||
export const csvEscapeField = (value: string | number): string => {
|
||||
const str = String(value);
|
||||
|
||||
// If field contains commas, quotes, or newlines, it must be quoted
|
||||
if (/[,"\n\r]/.test(str)) {
|
||||
// Escape double quotes by doubling them
|
||||
const escaped = str.replace(/"/g, '""');
|
||||
return `"${escaped}"`;
|
||||
}
|
||||
|
||||
return str;
|
||||
};
|
||||
@@ -115,6 +115,22 @@ ENGINE = Distributed('{cluster}', currentDatabase(), ${replicated(tableName)}, $
|
||||
];
|
||||
}
|
||||
|
||||
export const modifyTTL = ({
|
||||
tableName,
|
||||
isClustered,
|
||||
ttl,
|
||||
}: {
|
||||
tableName: string;
|
||||
isClustered: boolean;
|
||||
ttl: string;
|
||||
}) => {
|
||||
if (isClustered) {
|
||||
return `ALTER TABLE ${replicated(tableName)} ON CLUSTER '{cluster}' MODIFY TTL ${ttl}`;
|
||||
}
|
||||
|
||||
return `ALTER TABLE ${tableName} MODIFY TTL ${ttl}`;
|
||||
};
|
||||
|
||||
/**
|
||||
* Generates ALTER TABLE statements for adding columns
|
||||
*/
|
||||
|
||||
@@ -141,6 +141,10 @@ export type IServiceCreateEventPayload = Omit<
|
||||
IServiceEvent,
|
||||
'id' | 'importedAt' | 'profile' | 'meta'
|
||||
>;
|
||||
export type IServiceImportedEventPayload = Omit<
|
||||
IServiceEvent,
|
||||
'profile' | 'meta'
|
||||
>;
|
||||
|
||||
export interface IServiceEvent {
|
||||
id: string;
|
||||
|
||||
784
packages/db/src/services/import.service.ts
Normal file
784
packages/db/src/services/import.service.ts
Normal file
@@ -0,0 +1,784 @@
|
||||
import type { ILogger } from '@openpanel/logger';
|
||||
import sqlstring from 'sqlstring';
|
||||
import {
|
||||
TABLE_NAMES,
|
||||
ch,
|
||||
chInsertCSV,
|
||||
convertClickhouseDateToJs,
|
||||
formatClickhouseDate,
|
||||
} from '../clickhouse/client';
|
||||
import { csvEscapeField, csvEscapeJson } from '../clickhouse/csv';
|
||||
import { type Prisma, db } from '../prisma-client';
|
||||
import type { IClickhouseEvent } from './event.service';
|
||||
|
||||
export interface ImportStageResult {
|
||||
importId: string;
|
||||
totalEvents: number;
|
||||
insertedEvents: number;
|
||||
}
|
||||
|
||||
export interface ImportProgress {
|
||||
importId: string;
|
||||
totalEvents: number;
|
||||
insertedEvents: number;
|
||||
status: 'pending' | 'processing' | 'processed' | 'failed';
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a batch of events into the imports staging table
|
||||
*/
|
||||
export async function insertImportBatch(
|
||||
events: IClickhouseEvent[],
|
||||
importId: string,
|
||||
): Promise<ImportStageResult> {
|
||||
if (events.length === 0) {
|
||||
return { importId, totalEvents: 0, insertedEvents: 0 };
|
||||
}
|
||||
|
||||
// Important to have same order as events_imports table
|
||||
// CSV format: properly quotes fields that need it
|
||||
const csvRows = events.map((event) => {
|
||||
// Properties need to be converted to JSON for Map(String, String)
|
||||
// All fields must be CSV-escaped when joining with commas
|
||||
const fields = [
|
||||
csvEscapeField(event.id || ''),
|
||||
csvEscapeField(event.name),
|
||||
csvEscapeField(event.sdk_name || ''),
|
||||
csvEscapeField(event.sdk_version || ''),
|
||||
csvEscapeField(event.device_id || ''),
|
||||
csvEscapeField(event.profile_id || ''),
|
||||
csvEscapeField(event.project_id || ''),
|
||||
csvEscapeField(event.session_id || ''),
|
||||
csvEscapeField(event.path),
|
||||
csvEscapeField(event.origin || ''),
|
||||
csvEscapeField(event.referrer || ''),
|
||||
csvEscapeField(event.referrer_name || ''),
|
||||
csvEscapeField(event.referrer_type || ''),
|
||||
csvEscapeField(event.duration ?? 0),
|
||||
csvEscapeJson(event.properties),
|
||||
csvEscapeField(event.created_at),
|
||||
csvEscapeField(event.country || ''),
|
||||
csvEscapeField(event.city || ''),
|
||||
csvEscapeField(event.region || ''),
|
||||
csvEscapeField(event.longitude != null ? event.longitude : '\\N'),
|
||||
csvEscapeField(event.latitude != null ? event.latitude : '\\N'),
|
||||
csvEscapeField(event.os || ''),
|
||||
csvEscapeField(event.os_version || ''),
|
||||
csvEscapeField(event.browser || ''),
|
||||
csvEscapeField(event.browser_version || ''),
|
||||
csvEscapeField(event.device || ''),
|
||||
csvEscapeField(event.brand || ''),
|
||||
csvEscapeField(event.model || ''),
|
||||
csvEscapeField('\\N'), // imported_at (Nullable)
|
||||
csvEscapeField(importId),
|
||||
csvEscapeField('pending'), // import_status
|
||||
csvEscapeField(formatClickhouseDate(new Date())), // imported_at_meta (DateTime, not DateTime64, so no milliseconds)
|
||||
];
|
||||
return fields.join(',');
|
||||
});
|
||||
|
||||
await chInsertCSV(TABLE_NAMES.events_imports, csvRows);
|
||||
|
||||
return {
|
||||
importId,
|
||||
totalEvents: events.length,
|
||||
insertedEvents: events.length,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate deterministic session IDs for events that don't have them
|
||||
* Uses 30-minute time windows to create consistent session IDs across imports
|
||||
* Only processes events where device != 'server' and session_id = ''
|
||||
*/
|
||||
export async function generateSessionIds(
|
||||
importId: string,
|
||||
from: string,
|
||||
): Promise<void> {
|
||||
const rangeWhere = [
|
||||
'import_id = {importId:String}',
|
||||
"import_status = 'pending'",
|
||||
"device != 'server'",
|
||||
"session_id = ''",
|
||||
from ? 'toDate(created_at) = {from:String}' : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join(' AND ');
|
||||
|
||||
// Use SQL to generate deterministic session IDs based on device_id + 30-min time windows
|
||||
// This ensures same events always get same session IDs regardless of import order
|
||||
const updateQuery = `
|
||||
ALTER TABLE ${TABLE_NAMES.events_imports}
|
||||
UPDATE session_id = lower(hex(MD5(concat(
|
||||
device_id,
|
||||
'-',
|
||||
toString(toInt64(toUnixTimestamp(created_at) / 1800))
|
||||
))))
|
||||
WHERE ${rangeWhere}
|
||||
`;
|
||||
|
||||
await ch.command({
|
||||
query: updateQuery,
|
||||
query_params: { importId, from },
|
||||
clickhouse_settings: {
|
||||
wait_end_of_query: 1,
|
||||
mutations_sync: '2', // Wait for mutation to complete on all replicas (critical!)
|
||||
send_progress_in_http_headers: 1,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstruct sessions using SQL-based logic
|
||||
* This identifies session boundaries and creates session_start/session_end events
|
||||
* session_start inherits all properties from the first event in the session
|
||||
* session_end inherits all properties from the last event in the session and calculates duration
|
||||
*/
|
||||
export async function createSessionsStartEndEvents(
|
||||
importId: string,
|
||||
from: string,
|
||||
): Promise<void> {
|
||||
// First, let's identify session boundaries and get first/last events for each session
|
||||
const rangeWhere = [
|
||||
'import_id = {importId:String}',
|
||||
"import_status = 'pending'",
|
||||
"session_id != ''", // Only process events that have session IDs
|
||||
'toDate(created_at) = {from:String}',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join(' AND ');
|
||||
|
||||
// Use window functions to efficiently get first event (all fields) and last event (only changing fields)
|
||||
// session_end only needs: properties, path, origin, created_at - the rest can be inherited from session_start
|
||||
const sessionEventsQuery = `
|
||||
SELECT
|
||||
device_id,
|
||||
session_id,
|
||||
project_id,
|
||||
profile_id,
|
||||
argMin((path, origin, referrer, referrer_name, referrer_type, properties, created_at, country, city, region, longitude, latitude, os, os_version, browser, browser_version, device, brand, model), created_at) AS first_event,
|
||||
argMax((path, origin, properties, created_at), created_at) AS last_event_fields,
|
||||
min(created_at) AS first_timestamp,
|
||||
max(created_at) AS last_timestamp
|
||||
FROM ${TABLE_NAMES.events_imports}
|
||||
WHERE ${rangeWhere}
|
||||
AND name NOT IN ('session_start', 'session_end')
|
||||
GROUP BY session_id, device_id, project_id, profile_id
|
||||
`;
|
||||
|
||||
const sessionEventsResult = await ch.query({
|
||||
query: sessionEventsQuery,
|
||||
query_params: { importId, from },
|
||||
format: 'JSONEachRow',
|
||||
});
|
||||
|
||||
const sessionData = (await sessionEventsResult.json()) as Array<{
|
||||
device_id: string;
|
||||
session_id: string;
|
||||
project_id: string;
|
||||
profile_id: string;
|
||||
first_event: [
|
||||
// string, // id
|
||||
// string, // name
|
||||
string, // path
|
||||
string, // origin
|
||||
string, // referrer
|
||||
string, // referrer_name
|
||||
string, // referrer_type
|
||||
// number, // duration
|
||||
Record<string, unknown>, // properties
|
||||
string, // created_at
|
||||
string, // country
|
||||
string, // city
|
||||
string, // region
|
||||
number | null, // longitude
|
||||
number | null, // latitude
|
||||
string, // os
|
||||
string, // os_version
|
||||
string, // browser
|
||||
string, // browser_version
|
||||
string, // device
|
||||
string, // brand
|
||||
string, // model
|
||||
// string, // sdk_name
|
||||
// string, // sdk_version
|
||||
// string, // imported_at
|
||||
];
|
||||
last_event_fields: [
|
||||
string, // path
|
||||
string, // origin
|
||||
Record<string, unknown>, // properties
|
||||
string, // created_at
|
||||
];
|
||||
first_timestamp: string;
|
||||
last_timestamp: string;
|
||||
}>;
|
||||
|
||||
// Create session_start and session_end events
|
||||
const sessionEvents: IClickhouseEvent[] = [];
|
||||
|
||||
for (const session of sessionData) {
|
||||
// Destructure first event tuple (all fields)
|
||||
const [
|
||||
// firstId,
|
||||
// firstName,
|
||||
firstPath,
|
||||
firstOrigin,
|
||||
firstReferrer,
|
||||
firstReferrerName,
|
||||
firstReferrerType,
|
||||
// firstDuration,
|
||||
firstProperties,
|
||||
firstCreatedAt,
|
||||
firstCountry,
|
||||
firstCity,
|
||||
firstRegion,
|
||||
firstLongitude,
|
||||
firstLatitude,
|
||||
firstOs,
|
||||
firstOsVersion,
|
||||
firstBrowser,
|
||||
firstBrowserVersion,
|
||||
firstDevice,
|
||||
firstBrand,
|
||||
firstModel,
|
||||
// firstSdkName,
|
||||
// firstSdkVersion,
|
||||
// firstImportedAt,
|
||||
] = session.first_event;
|
||||
|
||||
// Destructure last event fields (only the changing ones)
|
||||
const [lastPath, lastOrigin, lastProperties, lastCreatedAt] =
|
||||
session.last_event_fields;
|
||||
|
||||
// Calculate duration in milliseconds
|
||||
// Parse timestamps as Date objects to calculate duration
|
||||
const firstTime = new Date(session.first_timestamp).getTime();
|
||||
const lastTime = new Date(session.last_timestamp).getTime();
|
||||
const durationMs = lastTime - firstTime;
|
||||
|
||||
// Helper function to adjust timestamp by milliseconds without timezone conversion
|
||||
const adjustTimestamp = (timestamp: string, offsetMs: number): string => {
|
||||
// Parse the timestamp, adjust it, and format back to ClickHouse format
|
||||
const date = convertClickhouseDateToJs(timestamp);
|
||||
date.setTime(date.getTime() + offsetMs);
|
||||
return formatClickhouseDate(date);
|
||||
};
|
||||
|
||||
// Create session_start event - inherit everything from first event but change name
|
||||
// Set created_at to 1 second before the first event
|
||||
sessionEvents.push({
|
||||
id: crypto.randomUUID(),
|
||||
name: 'session_start',
|
||||
device_id: session.device_id,
|
||||
profile_id: session.profile_id,
|
||||
project_id: session.project_id,
|
||||
session_id: session.session_id,
|
||||
path: firstPath,
|
||||
origin: firstOrigin,
|
||||
referrer: firstReferrer,
|
||||
referrer_name: firstReferrerName,
|
||||
referrer_type: firstReferrerType,
|
||||
duration: 0, // session_start always has 0 duration
|
||||
properties: firstProperties as Record<
|
||||
string,
|
||||
string | number | boolean | null | undefined
|
||||
>,
|
||||
created_at: adjustTimestamp(session.first_timestamp, -1000), // 1 second before first event
|
||||
country: firstCountry,
|
||||
city: firstCity,
|
||||
region: firstRegion,
|
||||
longitude: firstLongitude,
|
||||
latitude: firstLatitude,
|
||||
os: firstOs,
|
||||
os_version: firstOsVersion,
|
||||
browser: firstBrowser,
|
||||
browser_version: firstBrowserVersion,
|
||||
device: firstDevice,
|
||||
brand: firstBrand,
|
||||
model: firstModel,
|
||||
imported_at: new Date().toISOString(),
|
||||
sdk_name: 'import-session-reconstruction',
|
||||
sdk_version: '1.0.0',
|
||||
});
|
||||
|
||||
// Create session_end event - inherit most from session_start, but use last event's path, origin, properties
|
||||
// Set created_at to 1 second after the last event
|
||||
sessionEvents.push({
|
||||
id: crypto.randomUUID(),
|
||||
name: 'session_end',
|
||||
device_id: session.device_id,
|
||||
profile_id: session.profile_id,
|
||||
project_id: session.project_id,
|
||||
session_id: session.session_id,
|
||||
path: lastPath, // From last event
|
||||
origin: lastOrigin, // From last event
|
||||
referrer: firstReferrer, // Same as session_start
|
||||
referrer_name: firstReferrerName, // Same as session_start
|
||||
referrer_type: firstReferrerType, // Same as session_start
|
||||
duration: durationMs,
|
||||
properties: lastProperties as Record<
|
||||
string,
|
||||
string | number | boolean | null | undefined
|
||||
>, // From last event
|
||||
created_at: adjustTimestamp(session.last_timestamp, 500), // 1 second after last event
|
||||
country: firstCountry, // Same as session_start
|
||||
city: firstCity, // Same as session_start
|
||||
region: firstRegion, // Same as session_start
|
||||
longitude: firstLongitude, // Same as session_start
|
||||
latitude: firstLatitude, // Same as session_start
|
||||
os: firstOs, // Same as session_start
|
||||
os_version: firstOsVersion, // Same as session_start
|
||||
browser: firstBrowser, // Same as session_start
|
||||
browser_version: firstBrowserVersion, // Same as session_start
|
||||
device: firstDevice, // Same as session_start
|
||||
brand: firstBrand, // Same as session_start
|
||||
model: firstModel, // Same as session_start
|
||||
imported_at: new Date().toISOString(),
|
||||
sdk_name: 'import-session-reconstruction',
|
||||
sdk_version: '1.0.0',
|
||||
});
|
||||
}
|
||||
|
||||
// Insert session events into imports table
|
||||
if (sessionEvents.length > 0) {
|
||||
await insertImportBatch(sessionEvents, importId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Migrate all events from imports table to production events table
|
||||
* This includes both original events and generated session events
|
||||
*/
|
||||
export async function moveImportsToProduction(
|
||||
importId: string,
|
||||
from: string,
|
||||
): Promise<void> {
|
||||
// Build the WHERE clause for migration
|
||||
// For session events (session_start/session_end), we don't filter by their created_at
|
||||
// because they're created with adjusted timestamps (±1 second) that might fall outside
|
||||
// the date range. Instead, we include them if their session_id has events in this range.
|
||||
let whereClause = 'import_id = {importId:String}';
|
||||
|
||||
if (from) {
|
||||
whereClause += ` AND (
|
||||
(toDate(created_at) = {from:String}) OR
|
||||
(
|
||||
name IN ('session_start', 'session_end') AND
|
||||
session_id IN (
|
||||
SELECT DISTINCT session_id
|
||||
FROM ${TABLE_NAMES.events_imports}
|
||||
WHERE import_id = {importId:String}
|
||||
AND toDate(created_at) = {from:String}
|
||||
AND name NOT IN ('session_start', 'session_end')
|
||||
)
|
||||
)
|
||||
)`;
|
||||
}
|
||||
|
||||
const migrationQuery = `
|
||||
INSERT INTO ${TABLE_NAMES.events} (
|
||||
id,
|
||||
name,
|
||||
sdk_name,
|
||||
sdk_version,
|
||||
device_id,
|
||||
profile_id,
|
||||
project_id,
|
||||
session_id,
|
||||
path,
|
||||
origin,
|
||||
referrer,
|
||||
referrer_name,
|
||||
referrer_type,
|
||||
duration,
|
||||
properties,
|
||||
created_at,
|
||||
country,
|
||||
city,
|
||||
region,
|
||||
longitude,
|
||||
latitude,
|
||||
os,
|
||||
os_version,
|
||||
browser,
|
||||
browser_version,
|
||||
device,
|
||||
brand,
|
||||
model,
|
||||
imported_at
|
||||
)
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
sdk_name,
|
||||
sdk_version,
|
||||
device_id,
|
||||
profile_id,
|
||||
project_id,
|
||||
session_id,
|
||||
path,
|
||||
origin,
|
||||
referrer,
|
||||
referrer_name,
|
||||
referrer_type,
|
||||
duration,
|
||||
properties,
|
||||
created_at,
|
||||
country,
|
||||
city,
|
||||
region,
|
||||
longitude,
|
||||
latitude,
|
||||
os,
|
||||
os_version,
|
||||
browser,
|
||||
browser_version,
|
||||
device,
|
||||
brand,
|
||||
model,
|
||||
imported_at
|
||||
FROM ${TABLE_NAMES.events_imports}
|
||||
WHERE ${whereClause}
|
||||
ORDER BY created_at ASC
|
||||
`;
|
||||
|
||||
await ch.command({
|
||||
query: migrationQuery,
|
||||
query_params: { importId, from },
|
||||
clickhouse_settings: {
|
||||
wait_end_of_query: 1,
|
||||
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
|
||||
send_progress_in_http_headers: 1,
|
||||
// The interval of sending these progress headers. Here it is less than 60s,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
export async function backfillSessionsToProduction(
|
||||
importId: string,
|
||||
from: string,
|
||||
): Promise<void> {
|
||||
// After migrating events, populate the sessions table based on the migrated sessions
|
||||
// We detect all session_ids involved in this import from the imports table,
|
||||
// then aggregate over the production events to construct session rows.
|
||||
const sessionsInsertQuery = `
|
||||
INSERT INTO ${TABLE_NAMES.sessions} (
|
||||
id,
|
||||
project_id,
|
||||
profile_id,
|
||||
device_id,
|
||||
created_at,
|
||||
ended_at,
|
||||
is_bounce,
|
||||
entry_origin,
|
||||
entry_path,
|
||||
exit_origin,
|
||||
exit_path,
|
||||
screen_view_count,
|
||||
revenue,
|
||||
event_count,
|
||||
duration,
|
||||
country,
|
||||
region,
|
||||
city,
|
||||
longitude,
|
||||
latitude,
|
||||
device,
|
||||
brand,
|
||||
model,
|
||||
browser,
|
||||
browser_version,
|
||||
os,
|
||||
os_version,
|
||||
sign,
|
||||
version,
|
||||
properties,
|
||||
utm_medium,
|
||||
utm_source,
|
||||
utm_campaign,
|
||||
utm_content,
|
||||
utm_term,
|
||||
referrer,
|
||||
referrer_name,
|
||||
referrer_type
|
||||
)
|
||||
SELECT
|
||||
any(e.session_id) as id,
|
||||
any(e.project_id) as project_id,
|
||||
if(any(nullIf(e.profile_id, e.device_id)) IS NULL, any(e.profile_id), any(nullIf(e.profile_id, e.device_id))) as profile_id,
|
||||
any(e.device_id) as device_id,
|
||||
argMin(e.created_at, e.created_at) as created_at,
|
||||
argMax(e.created_at, e.created_at) as ended_at,
|
||||
if(
|
||||
argMaxIf(e.properties['__bounce'], e.created_at, e.name = 'session_end') = '',
|
||||
if(countIf(e.name = 'screen_view') > 1, true, false),
|
||||
argMaxIf(e.properties['__bounce'], e.created_at, e.name = 'session_end') = 'true'
|
||||
) as is_bounce,
|
||||
argMinIf(e.origin, e.created_at, e.name = 'session_start') as entry_origin,
|
||||
argMinIf(e.path, e.created_at, e.name = 'session_start') as entry_path,
|
||||
argMaxIf(e.origin, e.created_at, e.name = 'session_end' OR e.name = 'screen_view') as exit_origin,
|
||||
argMaxIf(e.path, e.created_at, e.name = 'session_end' OR e.name = 'screen_view') as exit_path,
|
||||
countIf(e.name = 'screen_view') as screen_view_count,
|
||||
0 as revenue,
|
||||
countIf(e.name != 'screen_view' AND e.name != 'session_start' AND e.name != 'session_end') as event_count,
|
||||
sumIf(e.duration, name = 'session_end') AS duration,
|
||||
argMinIf(e.country, e.created_at, e.name = 'session_start') as country,
|
||||
argMinIf(e.region, e.created_at, e.name = 'session_start') as region,
|
||||
argMinIf(e.city, e.created_at, e.name = 'session_start') as city,
|
||||
argMinIf(e.longitude, e.created_at, e.name = 'session_start') as longitude,
|
||||
argMinIf(e.latitude, e.created_at, e.name = 'session_start') as latitude,
|
||||
argMinIf(e.device, e.created_at, e.name = 'session_start') as device,
|
||||
argMinIf(e.brand, e.created_at, e.name = 'session_start') as brand,
|
||||
argMinIf(e.model, e.created_at, e.name = 'session_start') as model,
|
||||
argMinIf(e.browser, e.created_at, e.name = 'session_start') as browser,
|
||||
argMinIf(e.browser_version, e.created_at, e.name = 'session_start') as browser_version,
|
||||
argMinIf(e.os, e.created_at, e.name = 'session_start') as os,
|
||||
argMinIf(e.os_version, e.created_at, e.name = 'session_start') as os_version,
|
||||
1 as sign,
|
||||
1 as version,
|
||||
argMinIf(e.properties, e.created_at, e.name = 'session_start') as properties,
|
||||
argMinIf(e.properties['__query.utm_medium'], e.created_at, e.name = 'session_start') as utm_medium,
|
||||
argMinIf(e.properties['__query.utm_source'], e.created_at, e.name = 'session_start') as utm_source,
|
||||
argMinIf(e.properties['__query.utm_campaign'], e.created_at, e.name = 'session_start') as utm_campaign,
|
||||
argMinIf(e.properties['__query.utm_content'], e.created_at, e.name = 'session_start') as utm_content,
|
||||
argMinIf(e.properties['__query.utm_term'], e.created_at, e.name = 'session_start') as utm_term,
|
||||
argMinIf(e.referrer, e.created_at, e.name = 'session_start') as referrer,
|
||||
argMinIf(e.referrer_name, e.created_at, e.name = 'session_start') as referrer_name,
|
||||
argMinIf(e.referrer_type, e.created_at, e.name = 'session_start') as referrer_type
|
||||
FROM ${TABLE_NAMES.events_imports} e
|
||||
WHERE
|
||||
e.import_id = ${sqlstring.escape(importId)}
|
||||
AND toDate(e.created_at) = ${sqlstring.escape(from)}
|
||||
AND e.session_id != ''
|
||||
GROUP BY e.session_id
|
||||
`;
|
||||
|
||||
await ch.command({
|
||||
query: sessionsInsertQuery,
|
||||
clickhouse_settings: {
|
||||
wait_end_of_query: 1,
|
||||
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
|
||||
send_progress_in_http_headers: 1,
|
||||
// The interval of sending these progress headers. Here it is less than 60s,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark import as complete by updating status
|
||||
*/
|
||||
export async function markImportComplete(importId: string): Promise<void> {
|
||||
const updateQuery = `
|
||||
ALTER TABLE ${TABLE_NAMES.events_imports}
|
||||
UPDATE import_status = 'processed'
|
||||
WHERE import_id = {importId:String}
|
||||
`;
|
||||
|
||||
await ch.command({
|
||||
query: updateQuery,
|
||||
query_params: { importId },
|
||||
clickhouse_settings: {
|
||||
wait_end_of_query: 1,
|
||||
mutations_sync: '2', // Wait for mutation to complete
|
||||
// Ask ClickHouse to periodically send query execution progress in HTTP headers, creating some activity in the connection.
|
||||
send_progress_in_http_headers: 1,
|
||||
// The interval of sending these progress headers. Here it is less than 60s,
|
||||
http_headers_progress_interval_ms: '50000',
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get import progress and status
|
||||
*/
|
||||
export async function getImportProgress(
|
||||
importId: string,
|
||||
): Promise<ImportProgress> {
|
||||
const progressQuery = `
|
||||
SELECT
|
||||
import_id,
|
||||
COUNT(*) as total_events,
|
||||
COUNTIf(import_status = 'pending') as pending_events,
|
||||
COUNTIf(import_status = 'processed') as processed_events,
|
||||
any(import_status) as status
|
||||
FROM ${TABLE_NAMES.events_imports}
|
||||
WHERE import_id = {importId:String}
|
||||
AND name NOT IN ('session_start', 'session_end')
|
||||
GROUP BY import_id
|
||||
`;
|
||||
|
||||
const result = await ch.query({
|
||||
query: progressQuery,
|
||||
query_params: { importId },
|
||||
format: 'JSONEachRow',
|
||||
});
|
||||
|
||||
const data = (await result.json()) as Array<{
|
||||
import_id: string;
|
||||
total_events: number;
|
||||
pending_events: number;
|
||||
processed_events: number;
|
||||
status: string;
|
||||
}>;
|
||||
|
||||
if (data.length === 0) {
|
||||
return {
|
||||
importId,
|
||||
totalEvents: 0,
|
||||
insertedEvents: 0,
|
||||
status: 'pending',
|
||||
};
|
||||
}
|
||||
|
||||
const row = data[0];
|
||||
if (!row) {
|
||||
return {
|
||||
importId,
|
||||
totalEvents: 0,
|
||||
insertedEvents: 0,
|
||||
status: 'pending',
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
importId,
|
||||
totalEvents: row.total_events,
|
||||
insertedEvents: row.processed_events,
|
||||
status: row.status as 'pending' | 'processing' | 'processed' | 'failed',
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: get min/max created_at for an import
|
||||
*/
|
||||
export async function getImportDateBounds(
|
||||
importId: string,
|
||||
fromCreatedAt?: string,
|
||||
): Promise<{ min: string | null; max: string | null }> {
|
||||
const res = await ch.query({
|
||||
query: `
|
||||
SELECT min(created_at) AS min, max(created_at) AS max
|
||||
FROM ${TABLE_NAMES.events_imports}
|
||||
WHERE import_id = {importId:String}
|
||||
${fromCreatedAt ? 'AND created_at >= {fromCreatedAt:String}' : ''}
|
||||
`,
|
||||
query_params: { importId, fromCreatedAt },
|
||||
format: 'JSONEachRow',
|
||||
});
|
||||
const rows = (await res.json()) as Array<{
|
||||
min: string | null;
|
||||
max: string | null;
|
||||
}>;
|
||||
return rows.length > 0
|
||||
? {
|
||||
min: fromCreatedAt ?? rows[0]?.min ?? null,
|
||||
max: rows[0]?.max ?? null,
|
||||
}
|
||||
: { min: null, max: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* Unified method to update all import status information
|
||||
* Combines step, batch, progress, and status message updates
|
||||
*/
|
||||
export type UpdateImportStatusOptions =
|
||||
| {
|
||||
step: 'loading';
|
||||
batch?: string;
|
||||
totalEvents?: number;
|
||||
processedEvents?: number;
|
||||
}
|
||||
| {
|
||||
step: 'generating_session_ids';
|
||||
batch?: string;
|
||||
}
|
||||
| {
|
||||
step: 'creating_sessions';
|
||||
batch?: string;
|
||||
}
|
||||
| {
|
||||
step: 'moving';
|
||||
batch?: string;
|
||||
}
|
||||
| {
|
||||
step: 'backfilling_sessions';
|
||||
batch?: string;
|
||||
}
|
||||
| {
|
||||
step: 'completed';
|
||||
}
|
||||
| {
|
||||
step: 'failed';
|
||||
errorMessage?: string;
|
||||
};
|
||||
|
||||
export type ImportSteps = UpdateImportStatusOptions['step'];
|
||||
|
||||
export async function updateImportStatus(
|
||||
jobLogger: ILogger,
|
||||
job: {
|
||||
updateProgress: (progress: Record<string, any>) => void;
|
||||
},
|
||||
importId: string,
|
||||
options: UpdateImportStatusOptions,
|
||||
): Promise<void> {
|
||||
const data: Prisma.ImportUpdateInput = {};
|
||||
switch (options.step) {
|
||||
case 'loading':
|
||||
data.status = 'processing';
|
||||
data.currentStep = 'loading';
|
||||
data.currentBatch = options.batch;
|
||||
data.statusMessage = options.batch
|
||||
? `Importing events from ${options.batch}`
|
||||
: 'Initializing...';
|
||||
data.totalEvents = options.totalEvents;
|
||||
data.processedEvents = options.processedEvents;
|
||||
break;
|
||||
case 'generating_session_ids':
|
||||
data.currentStep = 'generating_session_ids';
|
||||
data.currentBatch = options.batch;
|
||||
data.statusMessage = options.batch
|
||||
? `Generating session IDs for ${options.batch}`
|
||||
: 'Generating session IDs...';
|
||||
break;
|
||||
case 'creating_sessions':
|
||||
data.currentStep = 'creating_sessions';
|
||||
data.currentBatch = options.batch;
|
||||
data.statusMessage = `Creating sessions for ${options.batch}`;
|
||||
break;
|
||||
case 'moving':
|
||||
data.currentStep = 'moving';
|
||||
data.currentBatch = options.batch;
|
||||
data.statusMessage = `Moving imports to production for ${options.batch}`;
|
||||
break;
|
||||
case 'backfilling_sessions':
|
||||
data.currentStep = 'backfilling_sessions';
|
||||
data.currentBatch = options.batch;
|
||||
data.statusMessage = `Aggregating sessions for ${options.batch}`;
|
||||
break;
|
||||
case 'completed':
|
||||
data.status = 'completed';
|
||||
data.currentStep = 'completed';
|
||||
data.statusMessage = 'Import completed';
|
||||
data.completedAt = new Date();
|
||||
break;
|
||||
case 'failed':
|
||||
data.status = 'failed';
|
||||
data.statusMessage = 'Import failed';
|
||||
data.errorMessage = options.errorMessage;
|
||||
break;
|
||||
}
|
||||
|
||||
jobLogger.info('Import status update', data);
|
||||
|
||||
await job.updateProgress(data);
|
||||
|
||||
await db.import.update({
|
||||
where: { id: importId },
|
||||
data,
|
||||
});
|
||||
}
|
||||
@@ -196,7 +196,7 @@ export async function getSessionList({
|
||||
organization?.subscriptionPeriodEventsLimit &&
|
||||
organization?.subscriptionPeriodEventsLimit > 1_000_000
|
||||
? 1
|
||||
: 7;
|
||||
: 360;
|
||||
|
||||
if (cursor) {
|
||||
const cAt = sqlstring.escape(cursor.createdAt);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import type {
|
||||
IImportConfig,
|
||||
IIntegrationConfig,
|
||||
INotificationRuleConfig,
|
||||
IProjectFilters,
|
||||
@@ -12,6 +13,7 @@ import type { IClickhouseProfile } from './services/profile.service';
|
||||
|
||||
declare global {
|
||||
namespace PrismaJson {
|
||||
type IPrismaImportConfig = IImportConfig;
|
||||
type IPrismaNotificationRuleConfig = INotificationRuleConfig;
|
||||
type IPrismaIntegrationConfig = IIntegrationConfig;
|
||||
type IPrismaNotificationPayload = INotificationPayload;
|
||||
|
||||
35
packages/importer/package.json
Normal file
35
packages/importer/package.json
Normal file
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"name": "@openpanel/importer",
|
||||
"version": "0.0.0",
|
||||
"type": "module",
|
||||
"main": "src/index.ts",
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"dev": "tsc --watch",
|
||||
"test": "vitest",
|
||||
"test:run": "vitest run"
|
||||
},
|
||||
"exports": {
|
||||
".": "./src/index.ts",
|
||||
"./providers": "./src/providers/metadata.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@openpanel/common": "workspace:*",
|
||||
"@openpanel/db": "workspace:*",
|
||||
"@openpanel/queue": "workspace:*",
|
||||
"@openpanel/validation": "workspace:*",
|
||||
"csv-parse": "^6.1.0",
|
||||
"ramda": "^0.29.1",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "catalog:"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@openpanel/logger": "workspace:*",
|
||||
"@types/node": "^20.0.0",
|
||||
"@types/ramda": "^0.31.1",
|
||||
"@types/uuid": "^9.0.7",
|
||||
"bullmq": "^5.8.7",
|
||||
"typescript": "^5.0.0",
|
||||
"vitest": "^1.0.0"
|
||||
}
|
||||
}
|
||||
121
packages/importer/src/base-provider.ts
Normal file
121
packages/importer/src/base-provider.ts
Normal file
@@ -0,0 +1,121 @@
|
||||
import type { IClickhouseEvent } from '@openpanel/db';
|
||||
import type { BaseRawEvent, ErrorContext, ImportJobMetadata } from './types';
|
||||
|
||||
export abstract class BaseImportProvider<
|
||||
TRawEvent extends BaseRawEvent = BaseRawEvent,
|
||||
> {
|
||||
abstract provider: string;
|
||||
abstract version: string;
|
||||
|
||||
/**
|
||||
* Stream-read and parse source (file/API) → yields raw events
|
||||
* This should be implemented as an async generator to handle large files efficiently
|
||||
*/
|
||||
abstract parseSource(
|
||||
overrideFrom?: string,
|
||||
): AsyncGenerator<TRawEvent, void, unknown>;
|
||||
|
||||
/**
|
||||
* Convert provider format → IClickhouseEvent
|
||||
*/
|
||||
abstract transformEvent(rawEvent: TRawEvent): IClickhouseEvent;
|
||||
|
||||
/**
|
||||
* Validate raw event structure
|
||||
*/
|
||||
abstract validate(rawEvent: TRawEvent): boolean;
|
||||
|
||||
/**
|
||||
* Returns how many events will be imported
|
||||
*/
|
||||
abstract getTotalEventsCount(): Promise<number>;
|
||||
|
||||
/**
|
||||
* Optional hook: Pre-process batch
|
||||
*/
|
||||
async beforeBatch?(events: TRawEvent[]): Promise<TRawEvent[]> {
|
||||
return events;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optional hook: Get import metadata for tracking
|
||||
*/
|
||||
getImportMetadata?(): ImportJobMetadata;
|
||||
|
||||
/**
|
||||
* Optional hook: Custom error handling
|
||||
*/
|
||||
async onError?(error: Error, context?: ErrorContext): Promise<void> {
|
||||
// Default: re-throw
|
||||
throw error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get estimated total events (optional, for progress tracking)
|
||||
*/
|
||||
async getEstimatedTotal?(): Promise<number> {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether session IDs should be generated in SQL after import
|
||||
* If true, the import job will generate deterministic session IDs based on
|
||||
* device_id and timestamp using SQL window functions
|
||||
* If false, assumes the provider already generates session IDs during streaming
|
||||
*/
|
||||
shouldGenerateSessionIds(): boolean {
|
||||
return false; // Default: assume provider handles it
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Split a date range into chunks to avoid timeout issues with large imports
|
||||
* Returns array of [from, to] date pairs in YYYY-MM-DD format
|
||||
*
|
||||
* @param from - Start date in YYYY-MM-DD format
|
||||
* @param to - End date in YYYY-MM-DD format
|
||||
* @param chunkSizeDays - Number of days per chunk (default: 1)
|
||||
*/
|
||||
public getDateChunks(
|
||||
from: string,
|
||||
to: string,
|
||||
options?: {
|
||||
chunkSizeDays?: number;
|
||||
},
|
||||
): Array<[string, string]> {
|
||||
const chunks: Array<[string, string]> = [];
|
||||
|
||||
const startDate = new Date(from);
|
||||
const endDate = new Date(to);
|
||||
const chunkSizeDays = options?.chunkSizeDays ?? 1;
|
||||
|
||||
// Handle case where from and to are the same date
|
||||
if (startDate.getTime() === endDate.getTime()) {
|
||||
return [[from, to]];
|
||||
}
|
||||
|
||||
const cursor = new Date(startDate);
|
||||
|
||||
while (cursor <= endDate) {
|
||||
const chunkStart = cursor.toISOString().split('T')[0]!;
|
||||
|
||||
// Calculate chunk end: move forward by (chunkSizeDays - 1) to get the last day of the chunk
|
||||
const chunkEndDate = new Date(cursor);
|
||||
chunkEndDate.setDate(chunkEndDate.getDate() + (chunkSizeDays - 1));
|
||||
|
||||
// Don't go past the end date
|
||||
const chunkEnd =
|
||||
chunkEndDate > endDate
|
||||
? endDate.toISOString().split('T')[0]!
|
||||
: chunkEndDate.toISOString().split('T')[0]!;
|
||||
|
||||
chunks.push([chunkStart, chunkEnd]);
|
||||
|
||||
// Move cursor to the next chunk start (after the current chunk)
|
||||
cursor.setDate(cursor.getDate() + chunkSizeDays);
|
||||
|
||||
if (cursor > endDate) break;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
}
|
||||
13
packages/importer/src/index.ts
Normal file
13
packages/importer/src/index.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
export { UmamiProvider } from './providers/umami';
|
||||
export { MixpanelProvider } from './providers/mixpanel';
|
||||
export type {
|
||||
ImportConfig,
|
||||
ImportProgress,
|
||||
ImportResult,
|
||||
BatchResult,
|
||||
BaseRawEvent,
|
||||
ErrorContext,
|
||||
EventProperties,
|
||||
ImportJobMetadata,
|
||||
ImportStageResult,
|
||||
} from './types';
|
||||
30
packages/importer/src/providers/metadata.ts
Normal file
30
packages/importer/src/providers/metadata.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
export type ImportProviderId = 'umami' | 'mixpanel';
|
||||
export type ImportProviderType = 'file' | 'api';
|
||||
|
||||
export interface ImportProviderMeta {
|
||||
id: ImportProviderId;
|
||||
name: string;
|
||||
description: string;
|
||||
logo: string;
|
||||
backgroundColor: string;
|
||||
types: ImportProviderType[];
|
||||
}
|
||||
|
||||
export const IMPORT_PROVIDERS: ImportProviderMeta[] = [
|
||||
{
|
||||
id: 'umami',
|
||||
name: 'Umami',
|
||||
description: 'Import your analytics data from Umami',
|
||||
logo: 'https://cdn.brandfetch.io/id_3VEohOm/w/180/h/180/theme/dark/logo.png?c=1dxbfHSJFAPEGdCLU4o5B',
|
||||
backgroundColor: '#fff',
|
||||
types: ['file'],
|
||||
},
|
||||
{
|
||||
id: 'mixpanel',
|
||||
name: 'Mixpanel',
|
||||
description: 'Import your analytics data from Mixpanel API',
|
||||
logo: 'https://cdn.brandfetch.io/idr_rhI2FS/theme/dark/idMJ8uODLv.svg?c=1dxbfHSJFAPEGdCLU4o5B',
|
||||
backgroundColor: '#fff',
|
||||
types: ['api'],
|
||||
},
|
||||
];
|
||||
319
packages/importer/src/providers/mixpanel.test.ts
Normal file
319
packages/importer/src/providers/mixpanel.test.ts
Normal file
@@ -0,0 +1,319 @@
|
||||
import { omit } from 'ramda';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { MixpanelProvider } from './mixpanel';
|
||||
|
||||
describe('mixpanel', () => {
|
||||
it('should chunk date range into day chunks', async () => {
|
||||
const provider = new MixpanelProvider('pid', {
|
||||
from: '2025-01-01',
|
||||
to: '2025-01-04',
|
||||
serviceAccount: 'sa',
|
||||
serviceSecret: 'ss',
|
||||
projectId: '123',
|
||||
provider: 'mixpanel',
|
||||
type: 'api',
|
||||
mapScreenViewProperty: undefined,
|
||||
});
|
||||
|
||||
const chunks = provider.getDateChunks('2025-01-01', '2025-01-04');
|
||||
expect(chunks).toEqual([
|
||||
['2025-01-01', '2025-01-01'],
|
||||
['2025-01-02', '2025-01-02'],
|
||||
['2025-01-03', '2025-01-03'],
|
||||
['2025-01-04', '2025-01-04'],
|
||||
]);
|
||||
});
|
||||
|
||||
it('should transform event', async () => {
|
||||
const provider = new MixpanelProvider('pid', {
|
||||
from: '2025-01-01',
|
||||
to: '2025-01-02',
|
||||
serviceAccount: 'sa',
|
||||
serviceSecret: 'ss',
|
||||
projectId: '123',
|
||||
provider: 'mixpanel',
|
||||
type: 'api',
|
||||
mapScreenViewProperty: undefined,
|
||||
});
|
||||
|
||||
const rawEvent = {
|
||||
event: '$mp_web_page_view',
|
||||
properties: {
|
||||
time: 1746097970,
|
||||
distinct_id: '$device:123',
|
||||
$browser: 'Chrome',
|
||||
$browser_version: 135,
|
||||
$city: 'Mumbai',
|
||||
$current_url:
|
||||
'https://domain.com/state/maharashtra?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_sour',
|
||||
$device: 'Android',
|
||||
$device_id: '123',
|
||||
$initial_referrer: 'https://referrer.com/',
|
||||
$initial_referring_domain: 'referrer.com',
|
||||
$insert_id: 'source_id',
|
||||
$lib_version: '2.60.0',
|
||||
$mp_api_endpoint: 'api-js.mixpanel.com',
|
||||
$mp_api_timestamp_ms: 1746078175363,
|
||||
$mp_autocapture: true,
|
||||
$os: 'Android',
|
||||
$referrer: 'https://google.com/',
|
||||
$referring_domain: 'referrer.com',
|
||||
$region: 'Maharashtra',
|
||||
$screen_height: 854,
|
||||
$screen_width: 384,
|
||||
current_domain: 'domain.com',
|
||||
current_page_title:
|
||||
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
|
||||
current_url_path: '/state/maharashtra',
|
||||
current_url_protocol: 'https:',
|
||||
current_url_search:
|
||||
'?utm_source=google&utm_medium=cpc&utm_campaignid=890&utm_adgroupid=&utm_adid=&utm_term=&utm_device=m&utm_network=x&utm_location=123&gclid=oqneoqow&gad_source=5&gclid=EAIaIQobChMI6MnvhciBjQMVlS-DAx',
|
||||
gclid: 'oqneoqow',
|
||||
mp_country_code: 'IN',
|
||||
mp_lib: 'web',
|
||||
mp_processing_time_ms: 1746078175546,
|
||||
mp_sent_by_lib_version: '2.60.0',
|
||||
utm_medium: 'cpc',
|
||||
utm_source: 'google',
|
||||
},
|
||||
};
|
||||
|
||||
const res = provider.transformEvent(rawEvent);
|
||||
|
||||
expect(res).toMatchObject({
|
||||
id: expect.any(String),
|
||||
name: 'screen_view',
|
||||
device_id: '123',
|
||||
profile_id: '123',
|
||||
project_id: 'pid',
|
||||
session_id: '',
|
||||
properties: {
|
||||
__source_insert_id: 'source_id',
|
||||
__screen: '384x854',
|
||||
__lib_version: '2.60.0',
|
||||
'__query.utm_source': 'google',
|
||||
'__query.utm_medium': 'cpc',
|
||||
'__query.utm_campaignid': '890',
|
||||
'__query.utm_device': 'm',
|
||||
'__query.utm_network': 'x',
|
||||
'__query.utm_location': '123',
|
||||
'__query.gclid': 'oqneoqow',
|
||||
__title:
|
||||
'Landeed: Satbara Utara, 7/12 Extract, Property Card & Index 2',
|
||||
},
|
||||
created_at: '2025-05-01T11:12:50.000Z',
|
||||
country: 'IN',
|
||||
city: 'Mumbai',
|
||||
region: 'Maharashtra',
|
||||
longitude: null,
|
||||
latitude: null,
|
||||
os: 'Android',
|
||||
os_version: undefined,
|
||||
browser: 'Chrome',
|
||||
browser_version: '',
|
||||
device: 'mobile',
|
||||
brand: '',
|
||||
model: '',
|
||||
duration: 0,
|
||||
path: '/state/maharashtra',
|
||||
origin: 'https://domain.com',
|
||||
referrer: 'https://referrer.com',
|
||||
referrer_name: 'Google',
|
||||
referrer_type: 'search',
|
||||
imported_at: expect.any(String),
|
||||
sdk_name: 'mixpanel (web)',
|
||||
sdk_version: '1.0.0',
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse stringified JSON in properties and flatten them', async () => {
|
||||
const provider = new MixpanelProvider('pid', {
|
||||
from: '2025-01-01',
|
||||
to: '2025-01-02',
|
||||
serviceAccount: 'sa',
|
||||
serviceSecret: 'ss',
|
||||
projectId: '123',
|
||||
provider: 'mixpanel',
|
||||
type: 'api',
|
||||
mapScreenViewProperty: undefined,
|
||||
});
|
||||
|
||||
const rawEvent = {
|
||||
event: 'custom_event',
|
||||
properties: {
|
||||
time: 1746097970,
|
||||
distinct_id: '$device:123',
|
||||
$device_id: '123',
|
||||
$user_id: 'user123',
|
||||
mp_lib: 'web',
|
||||
// Stringified JSON object - should be parsed and flattened
|
||||
area: '{"displayText":"Malab, Nuh, Mewat","id":1189005}',
|
||||
// Stringified JSON array - should be parsed and flattened
|
||||
tags: '["tag1","tag2","tag3"]',
|
||||
// Regular string - should remain as is
|
||||
regularString: 'just a string',
|
||||
// Number - should be converted to string
|
||||
count: 42,
|
||||
// Object - should be flattened
|
||||
nested: { level1: { level2: 'value' } },
|
||||
},
|
||||
};
|
||||
|
||||
const res = provider.transformEvent(rawEvent);
|
||||
|
||||
expect(res.properties).toMatchObject({
|
||||
// Parsed JSON object should be flattened with dot notation
|
||||
'area.displayText': 'Malab, Nuh, Mewat',
|
||||
'area.id': '1189005',
|
||||
// Parsed JSON array should be flattened with numeric indices
|
||||
'tags.0': 'tag1',
|
||||
'tags.1': 'tag2',
|
||||
'tags.2': 'tag3',
|
||||
// Regular values
|
||||
regularString: 'just a string',
|
||||
count: '42',
|
||||
// Nested object flattened
|
||||
'nested.level1.level2': 'value',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle react-native referrer', async () => {
|
||||
const provider = new MixpanelProvider('pid', {
|
||||
from: '2025-01-01',
|
||||
to: '2025-01-02',
|
||||
serviceAccount: 'sa',
|
||||
serviceSecret: 'ss',
|
||||
projectId: '123',
|
||||
provider: 'mixpanel',
|
||||
type: 'api',
|
||||
mapScreenViewProperty: undefined,
|
||||
});
|
||||
|
||||
const rawEvent = {
|
||||
event: 'ec_search_error',
|
||||
properties: {
|
||||
time: 1759947367,
|
||||
distinct_id: '3385916',
|
||||
$browser: 'Mobile Safari',
|
||||
$browser_version: null,
|
||||
$city: 'Bengaluru',
|
||||
$current_url:
|
||||
'https://web.landeed.com/karnataka/ec-encumbrance-certificate',
|
||||
$device: 'iPhone',
|
||||
$device_id:
|
||||
'199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
|
||||
$initial_referrer: 'https://www.google.com/',
|
||||
$initial_referring_domain: 'www.google.com',
|
||||
$insert_id: 'bclkaepeqcfuzt4v',
|
||||
$lib_version: '2.60.0',
|
||||
$mp_api_endpoint: 'api-js.mixpanel.com',
|
||||
$mp_api_timestamp_ms: 1759927570699,
|
||||
$os: 'iOS',
|
||||
$region: 'Karnataka',
|
||||
$screen_height: 852,
|
||||
$screen_width: 393,
|
||||
$search_engine: 'google',
|
||||
$user_id: '3385916',
|
||||
binaryReadableVersion: 'NA',
|
||||
binaryVersion: 'NA',
|
||||
component: '/karnataka/ec-encumbrance-certificate',
|
||||
errMsg: 'Request failed with status code 500',
|
||||
errType: 'SERVER_ERROR',
|
||||
isSilentSearch: false,
|
||||
isTimeout: false,
|
||||
jsVersion: '0.42.0',
|
||||
language: 'english',
|
||||
mp_country_code: 'IN',
|
||||
mp_lib: 'web',
|
||||
mp_processing_time_ms: 1759927592421,
|
||||
mp_sent_by_lib_version: '2.60.0',
|
||||
os: 'web',
|
||||
osVersion:
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
|
||||
phoneBrand: 'NA',
|
||||
phoneManufacturer: 'NA',
|
||||
phoneModel: 'NA',
|
||||
searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
|
||||
serverVersion: 'web2.0',
|
||||
state: 17,
|
||||
stateStr: '17',
|
||||
statusCode: 500,
|
||||
type: 'result_event',
|
||||
utm_medium: 'cpc',
|
||||
utm_source:
|
||||
'google%26utm_medium=cpc%26utm_campaignid=21380769590%26utm_adgroupid=%26utm_adid=%26utm_term=%26utm_device=m%26utm_network=%26utm_location=9062055%26gclid=%26gad_campaignid=21374496705%26gbraid=0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3%26wbraid=CmAKCQjwi4PHBhCUA',
|
||||
},
|
||||
};
|
||||
|
||||
const res = provider.transformEvent(rawEvent);
|
||||
|
||||
expect(res.id.length).toBeGreaterThan(30);
|
||||
expect(res.imported_at).toMatch(
|
||||
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
|
||||
);
|
||||
expect(omit(['id', 'imported_at'], res)).toEqual({
|
||||
brand: 'Apple',
|
||||
browser: 'GSA',
|
||||
browser_version: 'null',
|
||||
city: 'Bengaluru',
|
||||
country: 'IN',
|
||||
created_at: '2025-10-08T18:16:07.000Z',
|
||||
device: 'mobile',
|
||||
device_id: '199b498af1036c-0e943279a1292e-5c0f4368-51bf4-199b498af1036c',
|
||||
duration: 0,
|
||||
latitude: null,
|
||||
longitude: null,
|
||||
model: 'iPhone',
|
||||
name: 'ec_search_error',
|
||||
origin: 'https://web.landeed.com',
|
||||
os: 'iOS',
|
||||
os_version: '18.7.0',
|
||||
path: '/karnataka/ec-encumbrance-certificate',
|
||||
profile_id: '3385916',
|
||||
project_id: 'pid',
|
||||
properties: {
|
||||
__lib_version: '2.60.0',
|
||||
'__query.gad_campaignid': '21374496705',
|
||||
'__query.gbraid': '0AAAAAoV7mTM9mWFripzQ2Od0xXAfrW6p3',
|
||||
'__query.utm_campaignid': '21380769590',
|
||||
'__query.utm_device': 'm',
|
||||
'__query.utm_location': '9062055',
|
||||
'__query.utm_medium': 'cpc',
|
||||
'__query.utm_source': 'google',
|
||||
'__query.wbraid': 'CmAKCQjwi4PHBhCUA',
|
||||
__screen: '393x852',
|
||||
__source_insert_id: 'bclkaepeqcfuzt4v',
|
||||
__userAgent:
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
|
||||
binaryReadableVersion: 'NA',
|
||||
binaryVersion: 'NA',
|
||||
component: '/karnataka/ec-encumbrance-certificate',
|
||||
errMsg: 'Request failed with status code 500',
|
||||
errType: 'SERVER_ERROR',
|
||||
isSilentSearch: 'false',
|
||||
isTimeout: 'false',
|
||||
jsVersion: '0.42.0',
|
||||
language: 'english',
|
||||
os: 'web',
|
||||
osVersion:
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 18_7_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/388.0.811331708 Mobile/15E148 Safari/604.1',
|
||||
phoneBrand: 'NA',
|
||||
phoneManufacturer: 'NA',
|
||||
phoneModel: 'NA',
|
||||
searchUuid: '68e65d08-fd81-4ded-37d3-2b08d2bc70c3',
|
||||
serverVersion: 'web2.0',
|
||||
state: '17',
|
||||
stateStr: '17',
|
||||
statusCode: '500',
|
||||
type: 'result_event',
|
||||
},
|
||||
referrer: 'https://www.google.com',
|
||||
referrer_name: 'Google',
|
||||
referrer_type: 'search',
|
||||
region: 'Karnataka',
|
||||
sdk_name: 'mixpanel (web)',
|
||||
sdk_version: '1.0.0',
|
||||
session_id: '',
|
||||
});
|
||||
});
|
||||
});
|
||||
452
packages/importer/src/providers/mixpanel.ts
Normal file
452
packages/importer/src/providers/mixpanel.ts
Normal file
@@ -0,0 +1,452 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import { isSameDomain, parsePath, toDots } from '@openpanel/common';
|
||||
import { type UserAgentInfo, parseUserAgent } from '@openpanel/common/server';
|
||||
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
|
||||
import type { IClickhouseEvent } from '@openpanel/db';
|
||||
import type { ILogger } from '@openpanel/logger';
|
||||
import type { IMixpanelImportConfig } from '@openpanel/validation';
|
||||
import { z } from 'zod';
|
||||
import { BaseImportProvider } from '../base-provider';
|
||||
|
||||
export const zMixpanelRawEvent = z.object({
|
||||
event: z.string(),
|
||||
properties: z.record(z.unknown()),
|
||||
});
|
||||
|
||||
export type MixpanelRawEvent = z.infer<typeof zMixpanelRawEvent>;
|
||||
|
||||
export class MixpanelProvider extends BaseImportProvider<MixpanelRawEvent> {
|
||||
provider = 'mixpanel';
|
||||
version = '1.0.0';
|
||||
|
||||
constructor(
|
||||
private readonly projectId: string,
|
||||
private readonly config: IMixpanelImportConfig,
|
||||
private readonly logger?: ILogger,
|
||||
) {
|
||||
super();
|
||||
}
|
||||
|
||||
async getTotalEventsCount(): Promise<number> {
|
||||
// Mixpanel sucks and dont provide a good way to extract total event count within a period
|
||||
// jql would work but not accurate and will be deprecated end of 2025
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mixpanel doesn't provide session IDs, so we need to generate them in SQL
|
||||
* after all events are imported to ensure deterministic results
|
||||
*/
|
||||
shouldGenerateSessionIds(): boolean {
|
||||
return true;
|
||||
}
|
||||
|
||||
async *parseSource(
|
||||
overrideFrom?: string,
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
yield* this.fetchEventsFromMixpanel(overrideFrom);
|
||||
}
|
||||
|
||||
private async *fetchEventsFromMixpanel(
|
||||
overrideFrom?: string,
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
const { serviceAccount, serviceSecret, projectId, from, to } = this.config;
|
||||
|
||||
// Split the date range into monthly chunks for reliability
|
||||
// Uses base class utility to avoid timeout issues with large date ranges
|
||||
const dateChunks = this.getDateChunks(overrideFrom ?? from, to); // 1 month per chunk
|
||||
|
||||
for (const [chunkFrom, chunkTo] of dateChunks) {
|
||||
yield* this.fetchEventsForDateRange(
|
||||
serviceAccount,
|
||||
serviceSecret,
|
||||
projectId,
|
||||
chunkFrom,
|
||||
chunkTo,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private async *fetchEventsForDateRange(
|
||||
serviceAccount: string,
|
||||
serviceSecret: string,
|
||||
projectId: string,
|
||||
from: string,
|
||||
to: string,
|
||||
): AsyncGenerator<MixpanelRawEvent, void, unknown> {
|
||||
const url = 'https://data.mixpanel.com/api/2.0/export';
|
||||
|
||||
const params = new URLSearchParams({
|
||||
from_date: from,
|
||||
to_date: to,
|
||||
project_id: projectId,
|
||||
});
|
||||
|
||||
this.logger?.info('Fetching events from Mixpanel', {
|
||||
url: `${url}?${params}`,
|
||||
from,
|
||||
to,
|
||||
projectId,
|
||||
serviceAccount,
|
||||
});
|
||||
|
||||
const response = await fetch(`${url}?${params}`, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
Authorization: `Basic ${Buffer.from(`${serviceAccount}:${serviceSecret}`).toString('base64')}`,
|
||||
Accept: 'application/json',
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(
|
||||
`Failed to fetch events from Mixpanel: ${response.status} ${response.statusText}`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!response.body) {
|
||||
throw new Error('No response body from Mixpanel API');
|
||||
}
|
||||
|
||||
// Stream the response line by line
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = '';
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
|
||||
if (done) break;
|
||||
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
|
||||
// Process complete lines
|
||||
const lines = buffer.split('\n');
|
||||
buffer = lines.pop() || ''; // Keep the last incomplete line in buffer
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.trim()) {
|
||||
try {
|
||||
const event = JSON.parse(line);
|
||||
yield event;
|
||||
} catch (error) {
|
||||
console.warn('Failed to parse Mixpanel event:', line);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process any remaining line in buffer
|
||||
if (buffer.trim()) {
|
||||
try {
|
||||
const event = JSON.parse(buffer);
|
||||
yield event;
|
||||
} catch (error) {
|
||||
console.warn('Failed to parse final Mixpanel event:', buffer);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
reader.releaseLock();
|
||||
}
|
||||
}
|
||||
|
||||
validate(rawEvent: MixpanelRawEvent): boolean {
|
||||
const res = zMixpanelRawEvent.safeParse(rawEvent);
|
||||
return res.success;
|
||||
}
|
||||
|
||||
transformEvent(_rawEvent: MixpanelRawEvent): IClickhouseEvent {
|
||||
const projectId = this.projectId;
|
||||
const rawEvent = zMixpanelRawEvent.parse(_rawEvent);
|
||||
const props = rawEvent.properties as Record<string, any>;
|
||||
const deviceId = props.$device_id;
|
||||
const profileId = String(props.$user_id || props.distinct_id).replace(
|
||||
/^\$device:/,
|
||||
'',
|
||||
);
|
||||
|
||||
// Build full URL from current_url and current_url_search (web only)
|
||||
const fullUrl = props.$current_url;
|
||||
let path = '';
|
||||
let origin = '';
|
||||
let hash = '';
|
||||
let query: Record<string, string> = {};
|
||||
|
||||
if (fullUrl) {
|
||||
const parsed = parsePath(fullUrl);
|
||||
path = parsed.path || '';
|
||||
origin = parsed.origin || '';
|
||||
hash = parsed.hash || '';
|
||||
query = parsed.query || {};
|
||||
} else if (this.config.mapScreenViewProperty) {
|
||||
path = props[this.config.mapScreenViewProperty] || '';
|
||||
}
|
||||
|
||||
// Extract referrer information (web only)
|
||||
const referrerUrl = props.$initial_referrer || props.$referrer || '';
|
||||
const referrer =
|
||||
referrerUrl && !isSameDomain(referrerUrl, fullUrl)
|
||||
? parseReferrer(referrerUrl)
|
||||
: null;
|
||||
|
||||
// Check for UTM referrer in query params (web only)
|
||||
const utmReferrer = getReferrerWithQuery(query);
|
||||
|
||||
// Extract location data
|
||||
const country = props.$country || props.mp_country_code || '';
|
||||
const city = props.$city || '';
|
||||
const region = props.$region || '';
|
||||
|
||||
// For web events, use the standard user agent parsing
|
||||
const userAgent = props.osVersion || '';
|
||||
const uaInfo = this.isWebEvent(props.mp_lib)
|
||||
? parseUserAgent(userAgent, props)
|
||||
: this.parseServerDeviceInfo(props);
|
||||
|
||||
// Map event name - $mp_web_page_view should be screen_view
|
||||
let eventName = rawEvent.event;
|
||||
if (eventName === '$mp_web_page_view') {
|
||||
eventName = 'screen_view';
|
||||
}
|
||||
|
||||
// Build properties object - strip Mixpanel-specific properties
|
||||
const properties = this.stripMixpanelProperties(props, query);
|
||||
|
||||
if (props.$insert_id) {
|
||||
properties.__source_insert_id = String(props.$insert_id);
|
||||
}
|
||||
// Add useful properties
|
||||
if (props.$screen_width && props.$screen_height) {
|
||||
properties.__screen = `${props.$screen_width}x${props.$screen_height}`;
|
||||
}
|
||||
if (props.$screen_dpi) {
|
||||
properties.__dpi = props.$screen_dpi;
|
||||
}
|
||||
if (props.$language) {
|
||||
properties.__language = props.$language;
|
||||
}
|
||||
if (props.$timezone) {
|
||||
properties.__timezone = props.$timezone;
|
||||
}
|
||||
if (props.$app_version) {
|
||||
properties.__version = props.$app_version;
|
||||
}
|
||||
if (props.$app_build_number) {
|
||||
properties.__buildNumber = props.$app_build_number;
|
||||
}
|
||||
if (props.$lib_version) {
|
||||
properties.__lib_version = props.$lib_version;
|
||||
}
|
||||
|
||||
if (hash) {
|
||||
properties.__hash = hash;
|
||||
}
|
||||
|
||||
if (Object.keys(query).length > 0) {
|
||||
properties.__query = query;
|
||||
}
|
||||
|
||||
if (props.current_page_title) {
|
||||
properties.__title = props.current_page_title;
|
||||
}
|
||||
|
||||
if (userAgent) {
|
||||
properties.__userAgent = userAgent;
|
||||
}
|
||||
|
||||
// Always use UUID for id to match ClickHouse UUID column
|
||||
const event = {
|
||||
id: randomUUID(),
|
||||
name: eventName,
|
||||
device_id: deviceId,
|
||||
profile_id: profileId,
|
||||
project_id: projectId,
|
||||
session_id: '', // Will be generated in SQL after import
|
||||
properties: toDots(properties), // Flatten nested objects/arrays to Map(String, String)
|
||||
created_at: new Date(props.time * 1000).toISOString(),
|
||||
country,
|
||||
city,
|
||||
region,
|
||||
longitude: null,
|
||||
latitude: null,
|
||||
os: uaInfo.os || props.$os,
|
||||
os_version: uaInfo.osVersion || props.$osVersion,
|
||||
browser: uaInfo.browser || props.$browser,
|
||||
browser_version:
|
||||
uaInfo.browserVersion || props.$browserVersion
|
||||
? String(props.$browser_version)
|
||||
: '',
|
||||
device: this.getDeviceType(props.mp_lib, uaInfo, props),
|
||||
brand: uaInfo.brand || '',
|
||||
model: uaInfo.model || '',
|
||||
duration: 0,
|
||||
path,
|
||||
origin,
|
||||
referrer: referrer?.url || '',
|
||||
referrer_name: utmReferrer?.name || referrer?.name || '',
|
||||
referrer_type: referrer?.type || utmReferrer?.type || '',
|
||||
imported_at: new Date().toISOString(),
|
||||
sdk_name: props.mp_lib
|
||||
? `${this.provider} (${props.mp_lib})`
|
||||
: this.provider,
|
||||
sdk_version: this.version,
|
||||
};
|
||||
|
||||
// TODO: Remove this
|
||||
// Temporary fix for a client
|
||||
const isMightBeScreenView = this.getMightBeScreenView(rawEvent);
|
||||
if (isMightBeScreenView && event.name === 'Loaded a Screen') {
|
||||
event.name = 'screen_view';
|
||||
event.path = isMightBeScreenView;
|
||||
}
|
||||
|
||||
// TODO: Remove this
|
||||
// This is a hack to get utm tags (not sure if this is just the testing project or all mixpanel projects)
|
||||
if (props.utm_source && !properties.__query?.utm_source) {
|
||||
const split = decodeURIComponent(props.utm_source).split('&');
|
||||
const query = Object.fromEntries(split.map((item) => item.split('=')));
|
||||
for (const [key, value] of Object.entries(query)) {
|
||||
if (key && value) {
|
||||
event.properties[`__query.${key}`] = String(value);
|
||||
} else if (
|
||||
value === undefined &&
|
||||
key &&
|
||||
props.utm_source.startsWith(key)
|
||||
) {
|
||||
event.properties['__query.utm_source'] = String(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return event;
|
||||
}
|
||||
|
||||
private getDeviceType(
|
||||
mp_lib: string,
|
||||
uaInfo: UserAgentInfo,
|
||||
props: Record<string, any>,
|
||||
) {
|
||||
// Normalize lib/os/browser data
|
||||
const lib = (mp_lib || '').toLowerCase();
|
||||
const os = String(props.$os || uaInfo.os || '').toLowerCase();
|
||||
const browser = String(
|
||||
props.$browser || uaInfo.browser || '',
|
||||
).toLowerCase();
|
||||
|
||||
const isTabletOs = os === 'ipados' || os === 'ipad os' || os === 'ipad';
|
||||
|
||||
// Strong hint from SDK library
|
||||
if (['android', 'iphone', 'react-native', 'swift', 'unity'].includes(lib)) {
|
||||
return isTabletOs ? 'tablet' : 'mobile';
|
||||
}
|
||||
|
||||
// Web or unknown SDKs: infer from OS/Browser
|
||||
const isMobileSignal =
|
||||
os === 'ios' ||
|
||||
os === 'android' ||
|
||||
browser.includes('mobile safari') ||
|
||||
browser.includes('chrome ios') ||
|
||||
browser.includes('android mobile') ||
|
||||
browser.includes('samsung internet') ||
|
||||
browser.includes('mobile');
|
||||
|
||||
if (isMobileSignal) {
|
||||
return 'mobile';
|
||||
}
|
||||
|
||||
const isTabletSignal =
|
||||
isTabletOs ||
|
||||
browser.includes('tablet') ||
|
||||
// iPad often reports as Mac OS X with Mobile Safari
|
||||
(browser.includes('mobile safari') &&
|
||||
(os === 'mac os x' || os === 'macos'));
|
||||
|
||||
if (isTabletSignal) {
|
||||
return 'tablet';
|
||||
}
|
||||
|
||||
// Default to desktop
|
||||
return this.isServerEvent(mp_lib) ? 'server' : 'desktop';
|
||||
}
|
||||
|
||||
private isWebEvent(mp_lib: string) {
|
||||
return [
|
||||
'web',
|
||||
'android',
|
||||
'iphone',
|
||||
'swift',
|
||||
'unity',
|
||||
'react-native',
|
||||
].includes(mp_lib);
|
||||
}
|
||||
|
||||
private isServerEvent(mp_lib: string) {
|
||||
return !this.isWebEvent(mp_lib);
|
||||
}
|
||||
|
||||
private getMightBeScreenView(rawEvent: MixpanelRawEvent) {
|
||||
const props = rawEvent.properties as Record<string, any>;
|
||||
return Object.keys(props).find((key) => key.match(/^[A-Z1-9_]+$/));
|
||||
}
|
||||
|
||||
private parseServerDeviceInfo(props: Record<string, any>): UserAgentInfo {
|
||||
// For mobile events, extract device information from Mixpanel properties
|
||||
const os = props.$os || props.os || '';
|
||||
const osVersion = props.$os_version || props.osVersion || '';
|
||||
const brand = props.$brand || props.phoneBrand || '';
|
||||
const model = props.$model || props.phoneModel || '';
|
||||
const device = os.toLowerCase();
|
||||
|
||||
return {
|
||||
isServer: true,
|
||||
os: os,
|
||||
osVersion: osVersion,
|
||||
browser: '',
|
||||
browserVersion: '',
|
||||
device: device,
|
||||
brand: brand,
|
||||
model: model,
|
||||
};
|
||||
}
|
||||
|
||||
private stripMixpanelProperties(
|
||||
properties: Record<string, any>,
|
||||
searchParams: Record<string, string>,
|
||||
): Record<string, any> {
|
||||
const strip = [
|
||||
'time',
|
||||
'distinct_id',
|
||||
'current_page_title',
|
||||
'current_url_path',
|
||||
'current_url_protocol',
|
||||
'current_url_search',
|
||||
'current_domain',
|
||||
...Object.keys(searchParams),
|
||||
];
|
||||
const filtered = Object.fromEntries(
|
||||
Object.entries(properties).filter(
|
||||
([key]) => !key.match(/^(\$|mp_|utm_)/) && !strip.includes(key),
|
||||
),
|
||||
);
|
||||
|
||||
// Parse JSON strings back to objects/arrays so toDots() can flatten them
|
||||
const parsed: Record<string, any> = {};
|
||||
for (const [key, value] of Object.entries(filtered)) {
|
||||
if (
|
||||
typeof value === 'string' &&
|
||||
(value.startsWith('{') || value.startsWith('['))
|
||||
) {
|
||||
try {
|
||||
parsed[key] = JSON.parse(value);
|
||||
} catch {
|
||||
parsed[key] = value; // Keep as string if parsing fails
|
||||
}
|
||||
} else {
|
||||
parsed[key] = value;
|
||||
}
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
382
packages/importer/src/providers/umami.ts
Normal file
382
packages/importer/src/providers/umami.ts
Normal file
@@ -0,0 +1,382 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import { Readable } from 'node:stream';
|
||||
import { pipeline } from 'node:stream/promises';
|
||||
import { createBrotliDecompress, createGunzip } from 'node:zlib';
|
||||
import { isSameDomain, parsePath } from '@openpanel/common';
|
||||
import { generateDeviceId } from '@openpanel/common/server';
|
||||
import { getReferrerWithQuery, parseReferrer } from '@openpanel/common/server';
|
||||
import type { IClickhouseEvent } from '@openpanel/db';
|
||||
import type { ILogger } from '@openpanel/logger';
|
||||
import type { IUmamiImportConfig } from '@openpanel/validation';
|
||||
import { parse } from 'csv-parse';
|
||||
import { assocPath } from 'ramda';
|
||||
import { z } from 'zod';
|
||||
import { BaseImportProvider } from '../base-provider';
|
||||
|
||||
export const zUmamiRawEvent = z.object({
|
||||
// Required fields
|
||||
event_type: z.coerce.number(),
|
||||
event_name: z.string(),
|
||||
created_at: z.coerce.date(),
|
||||
event_id: z.string().min(1),
|
||||
session_id: z.string().min(1),
|
||||
website_id: z.string().min(1),
|
||||
|
||||
// Optional fields that might be empty
|
||||
visit_id: z.string().optional(),
|
||||
distinct_id: z.string().optional(),
|
||||
url_path: z.string().optional(),
|
||||
hostname: z.string().optional(),
|
||||
referrer_domain: z.string().optional(),
|
||||
referrer_path: z.string().optional(),
|
||||
referrer_query: z.string().optional(),
|
||||
referrer_name: z.string().optional(),
|
||||
referrer_type: z.string().optional(),
|
||||
country: z.string().optional(),
|
||||
city: z.string().optional(),
|
||||
region: z.string().optional(),
|
||||
browser: z.string().optional(),
|
||||
os: z.string().optional(),
|
||||
device: z.string().optional(),
|
||||
screen: z.string().optional(),
|
||||
language: z.string().optional(),
|
||||
utm_source: z.string().optional(),
|
||||
utm_medium: z.string().optional(),
|
||||
utm_campaign: z.string().optional(),
|
||||
utm_content: z.string().optional(),
|
||||
utm_term: z.string().optional(),
|
||||
page_title: z.string().optional(),
|
||||
gclid: z.string().optional(),
|
||||
fbclid: z.string().optional(),
|
||||
msclkid: z.string().optional(),
|
||||
ttclid: z.string().optional(),
|
||||
li_fat_id: z.string().optional(),
|
||||
twclid: z.string().optional(),
|
||||
url_query: z.string().optional(),
|
||||
});
|
||||
export type UmamiRawEvent = z.infer<typeof zUmamiRawEvent>;
|
||||
|
||||
export class UmamiProvider extends BaseImportProvider<UmamiRawEvent> {
|
||||
provider = 'umami';
|
||||
version = '1.0.0';
|
||||
|
||||
constructor(
|
||||
private readonly projectId: string,
|
||||
private readonly config: IUmamiImportConfig,
|
||||
private readonly logger?: ILogger,
|
||||
) {
|
||||
super();
|
||||
}
|
||||
|
||||
async getTotalEventsCount(): Promise<number> {
|
||||
return -1;
|
||||
}
|
||||
|
||||
async *parseSource(): AsyncGenerator<UmamiRawEvent, void, unknown> {
|
||||
yield* this.parseRemoteFile(this.config.fileUrl);
|
||||
}
|
||||
|
||||
private async *parseRemoteFile(
|
||||
url: string,
|
||||
opts: {
|
||||
signal?: AbortSignal;
|
||||
maxBytes?: number;
|
||||
maxRows?: number;
|
||||
} = {},
|
||||
): AsyncGenerator<UmamiRawEvent, void, unknown> {
|
||||
const { signal, maxBytes, maxRows } = opts;
|
||||
const controller = new AbortController();
|
||||
|
||||
// Link to caller's signal for cancellation
|
||||
if (signal) {
|
||||
signal.addEventListener('abort', () => controller.abort(), {
|
||||
once: true,
|
||||
});
|
||||
}
|
||||
|
||||
const res = await fetch(url, { signal: controller.signal });
|
||||
if (!res.ok || !res.body) {
|
||||
throw new Error(
|
||||
`Failed to fetch remote file: ${res.status} ${res.statusText}`,
|
||||
);
|
||||
}
|
||||
|
||||
const contentType = res.headers.get('content-type') || '';
|
||||
const contentEnc = res.headers.get('content-encoding') || '';
|
||||
const contentLen = Number(res.headers.get('content-length') ?? 0);
|
||||
|
||||
if (
|
||||
contentType &&
|
||||
!/text\/csv|text\/plain|application\/gzip|application\/octet-stream/i.test(
|
||||
contentType,
|
||||
)
|
||||
) {
|
||||
console.warn(`Warning: Content-Type is ${contentType}, expected CSV-ish`);
|
||||
}
|
||||
|
||||
if (maxBytes && contentLen && contentLen > maxBytes) {
|
||||
throw new Error(
|
||||
`Remote file exceeds size limit (${contentLen} > ${maxBytes})`,
|
||||
);
|
||||
}
|
||||
|
||||
const looksGzip =
|
||||
/\.gz($|\?)/i.test(url) ||
|
||||
/gzip/i.test(contentEnc) ||
|
||||
/application\/gzip/i.test(contentType);
|
||||
const looksBr = /br/i.test(contentEnc) || /\.br($|\?)/i.test(url);
|
||||
|
||||
// WHATWG -> Node stream
|
||||
const body = Readable.fromWeb(res.body as any);
|
||||
|
||||
// Optional size guard during stream
|
||||
let seenBytes = 0;
|
||||
if (maxBytes) {
|
||||
body.on('data', (chunk: Buffer) => {
|
||||
seenBytes += chunk.length;
|
||||
if (seenBytes > maxBytes) {
|
||||
controller.abort();
|
||||
body.destroy(
|
||||
new Error(
|
||||
`Stream exceeded size limit (${seenBytes} > ${maxBytes})`,
|
||||
),
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Build decode chain (gzip/brotli -> CSV parser)
|
||||
const decompress = looksGzip
|
||||
? createGunzip()
|
||||
: looksBr
|
||||
? createBrotliDecompress()
|
||||
: null;
|
||||
|
||||
const parser = parse({
|
||||
columns: true, // objects per row
|
||||
bom: true, // handle UTF-8 BOM
|
||||
relax_column_count: true,
|
||||
skip_empty_lines: true,
|
||||
});
|
||||
|
||||
// Wire the pipeline for proper backpressure & error propagation
|
||||
(async () => {
|
||||
try {
|
||||
if (decompress) {
|
||||
await pipeline(body, decompress, parser, {
|
||||
signal: controller.signal,
|
||||
});
|
||||
} else {
|
||||
await pipeline(body, parser, { signal: controller.signal });
|
||||
}
|
||||
} catch (e) {
|
||||
parser.destroy(e as Error);
|
||||
}
|
||||
})().catch(() => {
|
||||
/* handled by iterator */
|
||||
});
|
||||
|
||||
let rows = 0;
|
||||
try {
|
||||
for await (const record of parser) {
|
||||
rows++;
|
||||
if (maxRows && rows > maxRows) {
|
||||
controller.abort();
|
||||
throw new Error(`Row limit exceeded (${rows} > ${maxRows})`);
|
||||
}
|
||||
yield record as UmamiRawEvent;
|
||||
}
|
||||
} catch (err) {
|
||||
throw new Error(
|
||||
`Failed to parse remote file from ${url}: ${
|
||||
err instanceof Error ? err.message : String(err)
|
||||
}`,
|
||||
);
|
||||
} finally {
|
||||
controller.abort(); // ensure fetch stream is torn down
|
||||
}
|
||||
}
|
||||
|
||||
validate(rawEvent: UmamiRawEvent): boolean {
|
||||
const res = zUmamiRawEvent.safeParse(rawEvent);
|
||||
return res.success;
|
||||
}
|
||||
|
||||
transformEvent(_rawEvent: UmamiRawEvent): IClickhouseEvent {
|
||||
const projectId =
|
||||
this.config.projectMapper.find(
|
||||
(mapper) => mapper.from === _rawEvent.website_id,
|
||||
)?.to || this.projectId;
|
||||
|
||||
const rawEvent = zUmamiRawEvent.parse(_rawEvent);
|
||||
// Extract device/profile ID - use visit_id as device_id, session_id for session tracking
|
||||
const deviceId =
|
||||
rawEvent.visit_id ||
|
||||
generateDeviceId({
|
||||
ip: rawEvent.visit_id!,
|
||||
ua: rawEvent.visit_id!,
|
||||
origin: projectId,
|
||||
salt: 'xxx',
|
||||
});
|
||||
const profileId = rawEvent.distinct_id || deviceId;
|
||||
|
||||
// Parse URL if available - use same logic as real-time events
|
||||
const url = rawEvent.url_path
|
||||
? `https://${[rawEvent.hostname, rawEvent.url_path, rawEvent.url_query]
|
||||
.filter(Boolean)
|
||||
.join('')}`
|
||||
: '';
|
||||
const { path, hash, query, origin } = parsePath(url);
|
||||
// Extract referrer information - use same logic as real-time events
|
||||
const referrerUrl = rawEvent.referrer_domain
|
||||
? `https://${rawEvent.referrer_domain}${rawEvent.referrer_path || ''}`
|
||||
: '';
|
||||
|
||||
// Check if referrer is from same domain (like real-time events do)
|
||||
const referrer = isSameDomain(referrerUrl, url)
|
||||
? null
|
||||
: parseReferrer(referrerUrl);
|
||||
|
||||
// Check for UTM referrer in query params (like real-time events do)
|
||||
const utmReferrer = getReferrerWithQuery(query);
|
||||
|
||||
// Extract location data
|
||||
const country = rawEvent.country || '';
|
||||
const city = rawEvent.city || '';
|
||||
const region = rawEvent.region || '';
|
||||
|
||||
// Extract browser/device info
|
||||
const browser = rawEvent.browser || '';
|
||||
const browserVersion = ''; // Not available in Umami CSV
|
||||
const os = rawEvent.os || '';
|
||||
const osVersion = ''; // Not available in Umami CSV
|
||||
const device = rawEvent.device || '';
|
||||
const brand = ''; // Not available in Umami CSV
|
||||
const model = ''; // Not available in Umami CSV
|
||||
|
||||
let properties: Record<string, any> = {};
|
||||
|
||||
if (query) {
|
||||
properties.__query = query;
|
||||
}
|
||||
|
||||
// Add useful properties from Umami data
|
||||
if (rawEvent.page_title) properties.__title = rawEvent.page_title;
|
||||
if (rawEvent.screen) properties.__screen = rawEvent.screen;
|
||||
if (rawEvent.language) properties.__language = rawEvent.language;
|
||||
if (rawEvent.utm_source)
|
||||
properties = assocPath(
|
||||
['__query', 'utm_source'],
|
||||
rawEvent.utm_source,
|
||||
properties,
|
||||
);
|
||||
if (rawEvent.utm_medium)
|
||||
properties = assocPath(
|
||||
['__query', 'utm_medium'],
|
||||
rawEvent.utm_medium,
|
||||
properties,
|
||||
);
|
||||
if (rawEvent.utm_campaign)
|
||||
properties = assocPath(
|
||||
['__query', 'utm_campaign'],
|
||||
rawEvent.utm_campaign,
|
||||
properties,
|
||||
);
|
||||
if (rawEvent.utm_content)
|
||||
properties = assocPath(
|
||||
['__query', 'utm_content'],
|
||||
rawEvent.utm_content,
|
||||
properties,
|
||||
);
|
||||
if (rawEvent.utm_term)
|
||||
properties = assocPath(
|
||||
['__query', 'utm_term'],
|
||||
rawEvent.utm_term,
|
||||
properties,
|
||||
);
|
||||
|
||||
return {
|
||||
id: rawEvent.event_id || randomUUID(),
|
||||
name: rawEvent.event_type === 1 ? 'screen_view' : rawEvent.event_name,
|
||||
device_id: deviceId,
|
||||
profile_id: profileId,
|
||||
project_id: projectId,
|
||||
session_id: rawEvent.session_id || '',
|
||||
properties,
|
||||
created_at: rawEvent.created_at.toISOString(),
|
||||
country,
|
||||
city,
|
||||
region: this.mapRegion(region),
|
||||
longitude: null,
|
||||
latitude: null,
|
||||
os,
|
||||
os_version: osVersion,
|
||||
browser: this.mapBrowser(browser),
|
||||
browser_version: browserVersion,
|
||||
device: this.mapDevice(device),
|
||||
brand,
|
||||
model,
|
||||
duration: 0,
|
||||
path,
|
||||
origin,
|
||||
referrer: utmReferrer?.url || referrer?.url || '',
|
||||
referrer_name: utmReferrer?.name || referrer?.name || '',
|
||||
referrer_type: utmReferrer?.type || referrer?.type || '',
|
||||
imported_at: new Date().toISOString(),
|
||||
sdk_name: this.provider,
|
||||
sdk_version: this.version,
|
||||
};
|
||||
}
|
||||
|
||||
mapRegion(region: string): string {
|
||||
return region.replace(/^[A-Z]{2}\-/, '');
|
||||
}
|
||||
|
||||
mapDevice(device: string): string {
|
||||
const mapping: Record<string, string> = {
|
||||
desktop: 'desktop',
|
||||
laptop: 'desktop',
|
||||
mobile: 'mobile',
|
||||
tablet: 'tablet',
|
||||
smarttv: 'smarttv',
|
||||
Unknown: 'desktop',
|
||||
};
|
||||
|
||||
return mapping[device] || 'desktop';
|
||||
}
|
||||
|
||||
mapBrowser(browser: string): string {
|
||||
const mapping: Record<string, string> = {
|
||||
android: 'Android',
|
||||
aol: 'AOL',
|
||||
bb10: 'BlackBerry 10',
|
||||
beaker: 'Beaker',
|
||||
chrome: 'Chrome',
|
||||
'chromium-webview': 'Chrome (webview)',
|
||||
crios: 'Chrome (iOS)',
|
||||
curl: 'Curl',
|
||||
edge: 'Edge',
|
||||
'edge-chromium': 'Edge (Chromium)',
|
||||
'edge-ios': 'Edge (iOS)',
|
||||
facebook: 'Facebook',
|
||||
firefox: 'Firefox',
|
||||
fxios: 'Firefox (iOS)',
|
||||
ie: 'IE',
|
||||
instagram: 'Instagram',
|
||||
ios: 'iOS',
|
||||
'ios-webview': 'iOS (webview)',
|
||||
kakaotalk: 'KakaoTalk',
|
||||
miui: 'MIUI',
|
||||
opera: 'Opera',
|
||||
'opera-mini': 'Opera Mini',
|
||||
phantomjs: 'PhantomJS',
|
||||
safari: 'Safari',
|
||||
samsung: 'Samsung',
|
||||
searchbot: 'Searchbot',
|
||||
silk: 'Silk',
|
||||
yandexbrowser: 'Yandex',
|
||||
};
|
||||
|
||||
return mapping[browser] || browser || 'Unknown';
|
||||
}
|
||||
}
|
||||
80
packages/importer/src/types.ts
Normal file
80
packages/importer/src/types.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import type {
|
||||
IImportedEvent,
|
||||
IServiceCreateEventPayload,
|
||||
IServiceImportedEventPayload,
|
||||
} from '@openpanel/db';
|
||||
|
||||
/** User-supplied configuration describing a single import run. */
export interface ImportConfig {
  /** OpenPanel project the imported events are written into. */
  projectId: string;
  /** Source analytics provider name — presumably 'umami' | 'plausible' | 'mixpanel'; confirm against zCreateImport. */
  provider: string;
  /** Whether events are read from an uploaded file or fetched from the provider's API. */
  sourceType: 'file' | 'api';
  /** File location or API endpoint, depending on sourceType. */
  sourceLocation: string;
}
|
||||
|
||||
/** Per-session state kept while stitching imported events into sessions. */
export interface SessionInfo {
  /** Session identifier. */
  id: string;
  /** Timestamp of the most recent event in this session — presumably ms since epoch; TODO confirm unit. */
  lastTimestamp: number;
  /** The most recently processed event for this session. */
  lastEvent: IServiceImportedEventPayload;
}
|
||||
|
||||
/** Progress counters for an in-flight import. */
export interface ImportProgress {
  /** Total events discovered in the source. */
  totalEvents: number;
  /** Events processed so far. */
  processedEvents: number;
  /** Index of the batch currently being processed. */
  currentBatch: number;
  /** Total number of batches. */
  totalBatches: number;
}
|
||||
|
||||
/** Final outcome of an import run. */
export interface ImportResult {
  /** True when the import completed without a fatal error. */
  success: boolean;
  /** Total events discovered in the source. */
  totalEvents: number;
  /** Events that were actually processed. */
  processedEvents: number;
  /** Error description when the import failed. */
  error?: string;
}
|
||||
|
||||
/** Events produced from processing one batch of raw source data. */
export interface BatchResult {
  /** Normalized events ready for insertion. */
  events: IServiceImportedEventPayload[];
  /** Session-related events — presumably session start/end markers; TODO confirm against the processor. */
  sessionEvents: IServiceImportedEventPayload[];
}
|
||||
|
||||
// Generic types for raw events from different providers
/** Loose shape of a provider's raw event before normalization; values must be narrowed before use. */
export interface BaseRawEvent {
  [key: string]: unknown;
}
|
||||
|
||||
// Error context for better error handling
/** Contextual details attached to errors raised while importing, for diagnostics. */
export interface ErrorContext {
  /** Batch in which the error occurred. */
  batchNumber?: number;
  /** Number of events in that batch. */
  batchSize?: number;
  /** Index of the failing event within its batch. */
  eventIndex?: number;
  /** The raw provider event that triggered the error. */
  rawEvent?: BaseRawEvent;
  /** Source provider name. */
  provider?: string;
}
|
||||
|
||||
// Properties type for events - more specific than Record<string, any>
/**
 * Property bag attached to an imported event. Arbitrary keys carry
 * JSON-compatible scalar or nested-object values; the `__`-prefixed keys
 * are reserved meta-properties.
 */
export interface EventProperties {
  [key: string]:
    | string
    | number
    | boolean
    | null
    | undefined
    | Record<string, unknown>;
  // Presumably parsed URL query parameters — confirm against the importer.
  __query?: Record<string, unknown>;
  // Page title.
  __title?: string;
  // Screen/path identifier — TODO confirm exact semantics.
  __screen?: string;
  // Visitor language.
  __language?: string;
}
|
||||
|
||||
// Import job metadata for tracking import progress
/** Metadata identifying which import a staged event belongs to and its state. */
export interface ImportJobMetadata {
  /** Id of the owning import record. */
  importId: string;
  /** Lifecycle state of the import job. */
  importStatus: 'pending' | 'processing' | 'processed' | 'failed';
  /** When the event was imported. */
  importedAt: Date;
}
|
||||
|
||||
// Result of import staging operations
/** Summary returned after staging events for an import. */
export interface ImportStageResult {
  /** Id of the owning import record. */
  importId: string;
  /** Total events considered for staging. */
  totalEvents: number;
  /** Events actually inserted into the staging store. */
  insertedEvents: number;
}
|
||||
9
packages/importer/tsconfig.json
Normal file
9
packages/importer/tsconfig.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"extends": "../../tooling/typescript/base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src"
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
3
packages/importer/vitest.config.ts
Normal file
3
packages/importer/vitest.config.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
import { getSharedVitestConfig } from '../../vitest.shared';

// Reuse the repo-wide Vitest settings, scoped to this package's directory.
export default getSharedVitestConfig({ __dirname });
|
||||
@@ -110,7 +110,6 @@ export const eventsGroupQueue = new GroupQueue<
|
||||
>({
|
||||
logger: queueLogger,
|
||||
namespace: 'group_events',
|
||||
// @ts-expect-error - TODO: Fix this in groupmq
|
||||
redis: getRedisGroupQueue(),
|
||||
orderingMethod: 'in-memory',
|
||||
orderingWindowMs,
|
||||
@@ -166,6 +165,21 @@ export const notificationQueue = new Queue<NotificationQueuePayload>(
|
||||
},
|
||||
);
|
||||
|
||||
/** Job payload handled by the background import worker. */
export type ImportQueuePayload = {
  type: 'import';
  payload: {
    /** Id of the import database record to process. */
    importId: string;
  };
};
|
||||
|
||||
/**
 * Queue feeding the import worker. removeOnComplete/removeOnFail keep only
 * the most recent 10 completed and 50 failed jobs, so recent history stays
 * inspectable without unbounded growth (BullMQ retention semantics —
 * confirm against the Queue implementation in use).
 */
export const importQueue = new Queue<ImportQueuePayload>('import', {
  connection: getRedisQueue(),
  defaultJobOptions: {
    removeOnComplete: 10,
    removeOnFail: 50,
  },
});
|
||||
|
||||
export function addTrialEndingSoonJob(organizationId: string, delay: number) {
|
||||
return miscQueue.add(
|
||||
'misc',
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@openpanel/json": "workspace:*",
|
||||
"ioredis": "^5.7.0"
|
||||
"ioredis": "5.8.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@openpanel/db": "workspace:*",
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
"@openpanel/payments": "workspace:^",
|
||||
"@openpanel/redis": "workspace:*",
|
||||
"@openpanel/validation": "workspace:*",
|
||||
"@openpanel/queue": "workspace:*",
|
||||
"@trpc-limiter/redis": "^0.0.2",
|
||||
"@trpc/client": "^11.6.0",
|
||||
"@trpc/server": "^11.6.0",
|
||||
|
||||
@@ -4,6 +4,7 @@ import { chatRouter } from './routers/chat';
|
||||
import { clientRouter } from './routers/client';
|
||||
import { dashboardRouter } from './routers/dashboard';
|
||||
import { eventRouter } from './routers/event';
|
||||
import { importRouter } from './routers/import';
|
||||
import { integrationRouter } from './routers/integration';
|
||||
import { notificationRouter } from './routers/notification';
|
||||
import { onboardingRouter } from './routers/onboarding';
|
||||
@@ -40,6 +41,7 @@ export const appRouter = createTRPCRouter({
|
||||
reference: referenceRouter,
|
||||
notification: notificationRouter,
|
||||
integration: integrationRouter,
|
||||
import: importRouter,
|
||||
auth: authRouter,
|
||||
subscription: subscriptionRouter,
|
||||
overview: overviewRouter,
|
||||
|
||||
@@ -12,7 +12,7 @@ import {
|
||||
validateSessionToken,
|
||||
verifyPasswordHash,
|
||||
} from '@openpanel/auth';
|
||||
import { generateSecureId } from '@openpanel/common/server/id';
|
||||
import { generateSecureId } from '@openpanel/common/server';
|
||||
import {
|
||||
connectUserToOrganization,
|
||||
db,
|
||||
|
||||
178
packages/trpc/src/routers/import.ts
Normal file
178
packages/trpc/src/routers/import.ts
Normal file
@@ -0,0 +1,178 @@
|
||||
import { z } from 'zod';
|
||||
|
||||
import { db } from '@openpanel/db';
|
||||
import { importQueue } from '@openpanel/queue';
|
||||
import { zCreateImport } from '@openpanel/validation';
|
||||
|
||||
import { getProjectAccess } from '../access';
|
||||
import { TRPCAccessError } from '../errors';
|
||||
import { createTRPCRouter, protectedProcedure } from '../trpc';
|
||||
|
||||
/**
 * tRPC router for managing analytics imports on a project.
 *
 * All procedures require an authenticated session. Read procedures require
 * any project access; mutations additionally reject read-only access levels.
 */
export const importRouter = createTRPCRouter({
  // List all imports for a project, newest first.
  list: protectedProcedure
    .input(z.object({ projectId: z.string() }))
    .query(async ({ input, ctx }) => {
      const access = await getProjectAccess({
        projectId: input.projectId,
        userId: ctx.session.userId,
      });

      if (!access) {
        throw TRPCAccessError('You do not have access to this project');
      }

      return db.import.findMany({
        where: {
          projectId: input.projectId,
        },
        orderBy: {
          createdAt: 'desc',
        },
      });
    }),

  // Fetch a single import (including its project) by id.
  get: protectedProcedure
    .input(z.object({ id: z.string() }))
    .query(async ({ input, ctx }) => {
      // Throws if the id does not exist; access is checked afterwards
      // against the project the import belongs to.
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
        include: {
          project: true,
        },
      });

      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });

      if (!access) {
        throw TRPCAccessError('You do not have access to this import');
      }

      return importRecord;
    }),

  // Create an import record and enqueue a background job to process it.
  create: protectedProcedure
    .input(zCreateImport)
    .mutation(async ({ input, ctx }) => {
      const access = await getProjectAccess({
        projectId: input.projectId,
        userId: ctx.session.userId,
      });

      // Read-only members may not create imports.
      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to create imports for this project',
        );
      }

      // Create import record
      const importRecord = await db.import.create({
        data: {
          projectId: input.projectId,
          config: input.config,
          status: 'pending',
        },
      });

      // Add job to queue
      const job = await importQueue.add('import', {
        type: 'import',
        payload: {
          importId: importRecord.id,
        },
      });

      // Update import record with job ID
      // NOTE(review): not transactional — if this update fails, the queued
      // job is orphaned; verify the worker tolerates a missing jobId.
      await db.import.update({
        where: { id: importRecord.id },
        data: { jobId: job.id },
      });

      return {
        ...importRecord,
        jobId: job.id,
      };
    }),

  // Delete an import and remove its queued job, if one still exists.
  delete: protectedProcedure
    .input(z.object({ id: z.string() }))
    .mutation(async ({ input, ctx }) => {
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
      });

      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });

      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to delete imports for this project',
        );
      }

      // Best-effort removal of the associated queue job before deleting
      // the database row; a job that already ran simply won't be found.
      if (importRecord.jobId) {
        const job = await importQueue.getJob(importRecord.jobId);
        if (job) {
          await job.remove();
        }
      }

      return db.import.delete({
        where: {
          id: input.id,
        },
      });
    }),

  // Re-enqueue a failed import and reset its status to pending.
  retry: protectedProcedure
    .input(z.object({ id: z.string() }))
    .mutation(async ({ input, ctx }) => {
      const importRecord = await db.import.findUniqueOrThrow({
        where: {
          id: input.id,
        },
      });

      const access = await getProjectAccess({
        projectId: importRecord.projectId,
        userId: ctx.session.userId,
      });

      if (!access || (typeof access !== 'boolean' && access.level === 'read')) {
        throw TRPCAccessError(
          'You do not have permission to retry imports for this project',
        );
      }

      // Only allow retry for failed imports
      if (importRecord.status !== 'failed') {
        throw new Error('Only failed imports can be retried');
      }

      // Add new job to queue
      const job = await importQueue.add('import', {
        type: 'import',
        payload: {
          importId: importRecord.id,
        },
      });

      // Update import record
      return db.import.update({
        where: { id: importRecord.id },
        data: {
          jobId: job.id,
          status: 'pending',
          errorMessage: null,
        },
      });
    }),
});
|
||||
@@ -11,7 +11,7 @@ import {
|
||||
} from '@openpanel/db';
|
||||
import { zEditOrganization, zInviteUser } from '@openpanel/validation';
|
||||
|
||||
import { generateSecureId } from '@openpanel/common/server/id';
|
||||
import { generateSecureId } from '@openpanel/common/server';
|
||||
import { sendEmail } from '@openpanel/email';
|
||||
import { addDays } from 'date-fns';
|
||||
import { getOrganizationAccess } from '../access';
|
||||
|
||||
@@ -435,3 +435,54 @@ export const zEditOrganization = z.object({
|
||||
name: z.string().min(2),
|
||||
timezone: z.string().min(1),
|
||||
});
|
||||
|
||||
// Maps a source provider's project/site id ('from') to an OpenPanel
// project id ('to').
const zProjectMapper = z.object({
  from: z.string().min(1),
  to: z.string().min(1),
});
|
||||
|
||||
// Builds a file-based import config schema tagged with a provider literal,
// so each provider's schema stays discriminable by its `provider` field.
const createFileImportConfig = <T extends string>(provider: T) =>
  z.object({
    provider: z.literal(provider),
    type: z.literal('file'),
    fileUrl: z.string().url(),
  });
|
||||
|
||||
// Import configs
// Umami imports are file-based and additionally need a mapping from Umami
// website ids to OpenPanel project ids.
export const zUmamiImportConfig = createFileImportConfig('umami').extend({
  projectMapper: z.array(zProjectMapper),
});

export type IUmamiImportConfig = z.infer<typeof zUmamiImportConfig>;
|
||||
|
||||
// Plausible imports are plain file imports with no extra options.
export const zPlausibleImportConfig = createFileImportConfig('plausible');
export type IPlausibleImportConfig = z.infer<typeof zPlausibleImportConfig>;
|
||||
|
||||
// Mixpanel is imported via its API rather than a file upload.
export const zMixpanelImportConfig = z.object({
  provider: z.literal('mixpanel'),
  type: z.literal('api'),
  // Mixpanel service-account credentials.
  serviceAccount: z.string().min(1),
  serviceSecret: z.string().min(1),
  // The Mixpanel project to export from.
  projectId: z.string().min(1),
  // Date range to export — presumably date strings; TODO confirm format.
  from: z.string().min(1),
  to: z.string().min(1),
  // Event property treated as a screen view — TODO confirm semantics.
  mapScreenViewProperty: z.string().optional(),
});
export type IMixpanelImportConfig = z.infer<typeof zMixpanelImportConfig>;
|
||||
|
||||
// Union of all supported provider configurations.
export type IImportConfig =
  | IUmamiImportConfig
  | IPlausibleImportConfig
  | IMixpanelImportConfig;
|
||||
|
||||
// Input payload for creating an import.
// NOTE(review): `provider` is validated independently of `config.provider`,
// so the two can disagree; consider z.discriminatedUnion('provider', ...)
// or a refine check to keep them consistent.
export const zCreateImport = z.object({
  projectId: z.string().min(1),
  provider: z.enum(['umami', 'plausible', 'mixpanel']),
  config: z.union([
    zUmamiImportConfig,
    zPlausibleImportConfig,
    zMixpanelImportConfig,
  ]),
});

export type ICreateImport = z.infer<typeof zCreateImport>;
|
||||
|
||||
Reference in New Issue
Block a user