feat: new importer (#214)

This commit is contained in:
Carl-Gerhard Lindesvärd
2025-11-05 09:49:36 +01:00
committed by GitHub
parent b51bc8f3f6
commit 212254d31a
80 changed files with 4884 additions and 842 deletions

View File

@@ -35,6 +35,7 @@ COPY packages/redis/package.json ./packages/redis/
COPY packages/queue/package.json ./packages/queue/
COPY packages/logger/package.json ./packages/logger/
COPY packages/common/package.json ./packages/common/
COPY packages/importer/package.json ./packages/importer/
COPY packages/constants/package.json ./packages/constants/
COPY packages/validation/package.json ./packages/validation/
COPY packages/integrations/package.json packages/integrations/
@@ -80,9 +81,10 @@ COPY --from=build /app/packages/geo ./packages/geo
COPY --from=build /app/packages/json ./packages/json
COPY --from=build /app/packages/email ./packages/email
COPY --from=build /app/packages/redis ./packages/redis
COPY --from=build /app/packages/logger ./packages/logger
COPY --from=build /app/packages/queue ./packages/queue
COPY --from=build /app/packages/logger ./packages/logger
COPY --from=build /app/packages/common ./packages/common
COPY --from=build /app/packages/importer ./packages/importer
COPY --from=build /app/packages/validation ./packages/validation
COPY --from=build /app/packages/integrations ./packages/integrations
COPY --from=build /app/tooling/typescript ./tooling/typescript

View File

@@ -8,8 +8,7 @@
"testing": "WORKER_PORT=9999 pnpm dev",
"start": "node dist/index.js",
"build": "rm -rf dist && tsdown",
"typecheck": "tsc --noEmit",
"gen:referrers": "jiti scripts/get-referrers.ts && biome format --write ./src/referrers/index.ts"
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@bull-board/api": "6.13.1",
@@ -20,6 +19,7 @@
"@openpanel/integrations": "workspace:^",
"@openpanel/json": "workspace:*",
"@openpanel/logger": "workspace:*",
"@openpanel/importer": "workspace:*",
"@openpanel/queue": "workspace:*",
"@openpanel/redis": "workspace:*",
"bullmq": "^5.8.7",
@@ -38,7 +38,7 @@
"@types/source-map-support": "^0.5.10",
"@types/sqlstring": "^2.3.2",
"@types/uuid": "^9.0.8",
"tsdown": "^0.14.2",
"tsdown": "0.14.2",
"typescript": "catalog:"
}
}

View File

@@ -5,6 +5,7 @@ import {
type EventsQueuePayloadIncomingEvent,
cronQueue,
eventsGroupQueue,
importQueue,
miscQueue,
notificationQueue,
queueLogger,
@@ -19,6 +20,7 @@ import { Worker as GroupWorker } from 'groupmq';
import { cronJob } from './jobs/cron';
import { eventsJob } from './jobs/events';
import { incomingEventPure } from './jobs/events.incoming-event';
import { importJob } from './jobs/import';
import { miscJob } from './jobs/misc';
import { notificationJob } from './jobs/notification';
import { sessionsJob } from './jobs/sessions';
@@ -56,13 +58,18 @@ export async function bootWorkers() {
workerOptions,
);
const miscWorker = new Worker(miscQueue.name, miscJob, workerOptions);
const importWorker = new Worker(importQueue.name, importJob, {
...workerOptions,
concurrency: Number.parseInt(process.env.IMPORT_JOB_CONCURRENCY || '1', 10),
});
const workers = [
sessionsWorker,
cronWorker,
notificationWorker,
miscWorker,
eventsGroupWorker,
importWorker,
// eventsGroupWorker,
];
workers.forEach((worker) => {
@@ -148,7 +155,15 @@ export async function bootWorkers() {
['uncaughtException', 'unhandledRejection', 'SIGTERM', 'SIGINT'].forEach(
(evt) => {
process.on(evt, (code) => {
exitHandler(evt, code);
if (process.env.NODE_ENV === 'production') {
exitHandler(evt, code);
} else {
logger.info('Shutting down for development', {
event: evt,
code,
});
process.exit(0);
}
});
},
);

View File

@@ -5,6 +5,7 @@ import { createInitialSalts } from '@openpanel/db';
import {
cronQueue,
eventsGroupQueue,
importQueue,
miscQueue,
notificationQueue,
sessionsQueue,
@@ -31,13 +32,14 @@ async function start() {
if (process.env.DISABLE_BULLBOARD === undefined) {
const serverAdapter = new ExpressAdapter();
serverAdapter.setBasePath('/');
({
createBullBoard({
queues: [
new BullBoardGroupMQAdapter(eventsGroupQueue) as any,
new BullMQAdapter(sessionsQueue),
new BullMQAdapter(cronQueue),
new BullMQAdapter(notificationQueue),
new BullMQAdapter(miscQueue),
new BullMQAdapter(importQueue),
],
serverAdapter: serverAdapter,
});

View File

@@ -54,7 +54,7 @@ export async function deleteProjects(job: Job<CronQueuePayload>) {
await ch.command({
query,
clickhouse_settings: {
lightweight_deletes_sync: 0,
lightweight_deletes_sync: '0',
},
});
}

View File

@@ -1,12 +1,15 @@
import { logger as baseLogger } from '@/utils/logger';
import { getReferrerWithQuery, parseReferrer } from '@/utils/parse-referrer';
import {
createSessionEndJob,
createSessionStart,
getSessionEnd,
} from '@/utils/session-handler';
import { isSameDomain, parsePath } from '@openpanel/common';
import { parseUserAgent } from '@openpanel/common/server';
import {
getReferrerWithQuery,
parseReferrer,
parseUserAgent,
} from '@openpanel/common/server';
import type { IServiceCreateEventPayload, IServiceEvent } from '@openpanel/db';
import {
checkNotificationRulesForEvent,
@@ -15,10 +18,9 @@ import {
} from '@openpanel/db';
import type { ILogger } from '@openpanel/logger';
import type { EventsQueuePayloadIncomingEvent } from '@openpanel/queue';
import { getLock } from '@openpanel/redis';
import { DelayedError, type Job } from 'bullmq';
import { omit } from 'ramda';
import type { Job } from 'bullmq';
import * as R from 'ramda';
import { omit } from 'ramda';
import { v4 as uuid } from 'uuid';
const GLOBAL_PROPERTIES = ['__path', '__referrer'];
@@ -115,9 +117,9 @@ export async function incomingEventPure(
latitude: geo.latitude,
path,
origin,
referrer: utmReferrer?.url || referrer?.url || '',
referrer: referrer?.url || '',
referrerName: utmReferrer?.name || referrer?.name || '',
referrerType: utmReferrer?.type || referrer?.type || '',
referrerType: referrer?.type || utmReferrer?.type || '',
os: uaInfo.os,
osVersion: uaInfo.osVersion,
browser: uaInfo.browser,

View File

@@ -99,7 +99,7 @@ describe('incomingEvent', () => {
origin: 'https://example.com',
referrer: '',
referrerName: '',
referrerType: 'unknown',
referrerType: '',
sdkName: jobData.payload.headers['openpanel-sdk-name'],
sdkVersion: jobData.payload.headers['openpanel-sdk-version'],
};
@@ -207,7 +207,7 @@ describe('incomingEvent', () => {
origin: 'https://example.com',
referrer: '',
referrerName: '',
referrerType: 'unknown',
referrerType: '',
sdkName: jobData.payload.headers['openpanel-sdk-name'],
sdkVersion: jobData.payload.headers['openpanel-sdk-version'],
};

View File

@@ -0,0 +1,332 @@
import {
type IClickhouseEvent,
type ImportSteps,
type Prisma,
backfillSessionsToProduction,
createSessionsStartEndEvents,
db,
formatClickhouseDate,
generateSessionIds,
getImportDateBounds,
getImportProgress,
insertImportBatch,
markImportComplete,
moveImportsToProduction,
updateImportStatus,
} from '@openpanel/db';
import { MixpanelProvider, UmamiProvider } from '@openpanel/importer';
import type { ILogger } from '@openpanel/logger';
import type { ImportQueuePayload } from '@openpanel/queue';
import type { Job } from 'bullmq';
import { logger } from '../utils/logger';
// Batch size for staging inserts; overridable via IMPORT_BATCH_SIZE, defaults to 5000.
const BATCH_SIZE = Number.parseInt(process.env.IMPORT_BATCH_SIZE || '5000', 10);

/**
 * Pause for a short tick so the event loop can service other work
 * (heartbeats, lock renewals) between batches — prevents stalled jobs.
 */
async function yieldToEventLoop(): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, 100));
}
/**
 * Worker entry point that runs a data import end-to-end.
 *
 * The pipeline is resumable: progress (current step + current batch cursor)
 * is persisted via updateImportStatus, and a re-queued job picks up from the
 * stored step/batch instead of restarting. Phases, in order:
 *   1. loading                — stream raw events from the provider, validate,
 *                               transform, and insert into import staging
 *   2. generating_session_ids — only for providers that cannot supply them
 *   3. creating_sessions      — synthesize session start/end events
 *   4. moving                 — move staged events into production tables
 *   5. backfilling_sessions   — backfill session rows in production
 *
 * On failure the import record is marked 'failed' (best effort) and the
 * error is rethrown so BullMQ records the job as failed.
 */
export async function importJob(job: Job<ImportQueuePayload>) {
  const { importId } = job.data.payload;
  // Throws if the import record no longer exists — nothing to do without it.
  const record = await db.import.findUniqueOrThrow({
    where: { id: importId },
    include: {
      project: true,
    },
  });
  const jobLogger = logger.child({
    importId,
    config: record.config,
  });
  // Ordered step indices; used by shouldRunStep to decide which phases still
  // need to run when resuming a partially completed import.
  type ValidStep = Exclude<ImportSteps, 'failed' | 'completed'>;
  const steps: Record<ValidStep, number> = {
    loading: 0,
    generating_session_ids: 1,
    creating_sessions: 2,
    moving: 3,
    backfilling_sessions: 4,
  };
  jobLogger.info('Starting import job');
  const providerInstance = createProvider(record, jobLogger);
  try {
    // Check if this is a resume operation
    const isNewImport = record.currentStep === null;
    if (isNewImport) {
      await updateImportStatus(jobLogger, job, importId, {
        step: 'loading',
      });
    } else {
      jobLogger.info('Resuming import from previous state', {
        currentStep: record.currentStep,
        currentBatch: record.currentBatch,
      });
    }
    // Try to get a precomputed total for better progress reporting.
    // -1 signals "unknown total" when the provider cannot count cheaply.
    const totalEvents = await providerInstance
      .getTotalEventsCount()
      .catch(() => -1);
    let processedEvents = record.processedEvents;
    // Per-step resume cursors: each is only set when the stored currentStep
    // matches that phase, so every phase restarts from its own last batch.
    const resumeLoadingFrom =
      (record.currentStep === 'loading' && record.currentBatch) || undefined;
    const resumeGeneratingSessionIdsFrom =
      (record.currentStep === 'generating_session_ids' &&
        record.currentBatch) ||
      undefined;
    const resumeCreatingSessionsFrom =
      (record.currentStep === 'creating_sessions' && record.currentBatch) ||
      undefined;
    const resumeMovingFrom =
      (record.currentStep === 'moving' && record.currentBatch) || undefined;
    const resumeBackfillingSessionsFrom =
      (record.currentStep === 'backfilling_sessions' && record.currentBatch) ||
      undefined;
    // A step runs if it is at or after the step we stopped on (>=), so the
    // interrupted step itself is re-run from its saved batch cursor.
    // Example:
    // shouldRunStep(0) // currStep = 2 (should not run)
    // shouldRunStep(1) // currStep = 2 (should not run)
    // shouldRunStep(2) // currStep = 2 (should run)
    // shouldRunStep(3) // currStep = 2 (should run)
    const shouldRunStep = (step: ValidStep) => {
      if (isNewImport) {
        return true;
      }
      const stepToRunIndex = steps[step];
      const currentStepIndex = steps[record.currentStep as ValidStep];
      return stepToRunIndex >= currentStepIndex;
    };
    // Iterates day-by-day over the [min, max] created_at range of the staged
    // import data (optionally starting at `from`), invoking `callback` with
    // ClickHouse-formatted [from, to) day boundaries.
    async function whileBounds(
      from: string | undefined,
      callback: (from: string, to: string) => Promise<void>,
    ) {
      const bounds = await getImportDateBounds(importId, from);
      if (bounds.min && bounds.max) {
        const start = new Date(bounds.min);
        const end = new Date(bounds.max);
        let cursor = new Date(start);
        while (cursor < end) {
          const next = new Date(cursor);
          next.setDate(next.getDate() + 1);
          await callback(
            formatClickhouseDate(cursor, true),
            formatClickhouseDate(next, true),
          );
          cursor = next;
          // Yield control back to event loop after processing each day
          await yieldToEventLoop();
        }
      }
    }
    // Phase 1: Fetch & Transform - Process events in batches
    if (shouldRunStep('loading')) {
      // NOTE(review): typed `any` — raw event shape varies per provider;
      // the @ts-expect-error casts below exist for the same reason.
      const eventBatch: any = [];
      for await (const rawEvent of providerInstance.parseSource(
        resumeLoadingFrom,
      )) {
        // Validate event
        if (
          !providerInstance.validate(
            // @ts-expect-error
            rawEvent,
          )
        ) {
          jobLogger.warn('Skipping invalid event', { rawEvent });
          continue;
        }
        eventBatch.push(rawEvent);
        // Process batch when it reaches the batch size
        if (eventBatch.length >= BATCH_SIZE) {
          jobLogger.info('Processing batch', { batchSize: eventBatch.length });
          const transformedEvents: IClickhouseEvent[] = eventBatch.map(
            (
              // @ts-expect-error
              event,
            ) => providerInstance!.transformEvent(event),
          );
          await insertImportBatch(transformedEvents, importId);
          processedEvents += eventBatch.length;
          eventBatch.length = 0;
          // Use the first event's date (YYYY-MM-DD) as the resume cursor for
          // this batch. NOTE(review): assumes source events arrive in roughly
          // chronological order — verify against the providers.
          const createdAt = new Date(transformedEvents[0]?.created_at || '')
            .toISOString()
            .split('T')[0];
          await updateImportStatus(jobLogger, job, importId, {
            step: 'loading',
            batch: createdAt,
            totalEvents,
            processedEvents,
          });
          // Yield control back to event loop after processing each batch
          await yieldToEventLoop();
        }
      }
      // Process remaining events in the last batch
      if (eventBatch.length > 0) {
        const transformedEvents = eventBatch.map(
          (
            // @ts-expect-error
            event,
          ) => providerInstance!.transformEvent(event),
        );
        await insertImportBatch(transformedEvents, importId);
        processedEvents += eventBatch.length;
        eventBatch.length = 0;
        const createdAt = new Date(transformedEvents[0]?.created_at || '')
          .toISOString()
          .split('T')[0];
        // NOTE(review): unlike the in-loop update above, this final update
        // omits totalEvents/processedEvents — confirm that is intentional.
        await updateImportStatus(jobLogger, job, importId, {
          step: 'loading',
          batch: createdAt,
        });
        // Yield control back to event loop after processing final batch
        await yieldToEventLoop();
      }
    }
    // Phase 2: Generate session IDs if provider requires it
    if (
      shouldRunStep('generating_session_ids') &&
      providerInstance.shouldGenerateSessionIds()
    ) {
      await whileBounds(resumeGeneratingSessionIdsFrom, async (from) => {
        // NOTE(review): console.log looks like leftover debugging —
        // jobLogger is used everywhere else in this job.
        console.log('Generating session IDs', { from });
        await generateSessionIds(importId, from);
        await updateImportStatus(jobLogger, job, importId, {
          step: 'generating_session_ids',
          batch: from,
        });
        // Yield control back to event loop after processing each day
        await yieldToEventLoop();
      });
      jobLogger.info('Session ID generation complete');
    }
    // Phase 3-5: Process in daily batches for robustness
    if (shouldRunStep('creating_sessions')) {
      await whileBounds(resumeCreatingSessionsFrom, async (from) => {
        await createSessionsStartEndEvents(importId, from);
        await updateImportStatus(jobLogger, job, importId, {
          step: 'creating_sessions',
          batch: from,
        });
        // Yield control back to event loop after processing each day
        await yieldToEventLoop();
      });
    }
    if (shouldRunStep('moving')) {
      await whileBounds(resumeMovingFrom, async (from) => {
        await moveImportsToProduction(importId, from);
        await updateImportStatus(jobLogger, job, importId, {
          step: 'moving',
          batch: from,
        });
        // Yield control back to event loop after processing each day
        await yieldToEventLoop();
      });
    }
    if (shouldRunStep('backfilling_sessions')) {
      await whileBounds(resumeBackfillingSessionsFrom, async (from) => {
        await backfillSessionsToProduction(importId, from);
        await updateImportStatus(jobLogger, job, importId, {
          step: 'backfilling_sessions',
          batch: from,
        });
        // Yield control back to event loop after processing each day
        await yieldToEventLoop();
      });
    }
    await markImportComplete(importId);
    await updateImportStatus(jobLogger, job, importId, {
      step: 'completed',
    });
    jobLogger.info('Import marked as complete');
    // Get final progress
    const finalProgress = await getImportProgress(importId);
    jobLogger.info('Import job completed successfully', {
      totalEvents: finalProgress.totalEvents,
      insertedEvents: finalProgress.insertedEvents,
      status: finalProgress.status,
    });
    return {
      success: true,
      totalEvents: finalProgress.totalEvents,
      processedEvents: finalProgress.insertedEvents,
    };
  } catch (error) {
    jobLogger.error('Import job failed', { error });
    // Mark import as failed
    try {
      const errorMsg = error instanceof Error ? error.message : 'Unknown error';
      await updateImportStatus(jobLogger, job, importId, {
        step: 'failed',
        errorMessage: errorMsg,
      });
      jobLogger.warn('Import marked as failed', { error: errorMsg });
    } catch (markError) {
      jobLogger.error('Failed to mark import as failed', { error, markError });
    }
    // Rethrow so BullMQ marks the job itself as failed.
    throw error;
  }
}
/**
 * Build the import provider matching the record's configured source.
 * Throws for provider names this worker does not recognize.
 */
function createProvider(
  record: Prisma.ImportGetPayload<{ include: { project: true } }>,
  jobLogger: ILogger,
) {
  const config = record.config;
  if (config.provider === 'umami') {
    return new UmamiProvider(record.projectId, config, jobLogger);
  }
  if (config.provider === 'mixpanel') {
    return new MixpanelProvider(record.projectId, config, jobLogger);
  }
  throw new Error(`Unknown provider: ${config.provider}`);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +0,0 @@
# Snowplow Referer Parser
The file index.ts in this directory is generated from Snowplow's referer database [Snowplow Referer Parser](https://github.com/snowplow-referer-parser/referer-parser).
The original [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml) is based on Piwik's SearchEngines.php and Socials.php, copyright 2012 Matthieu Aubry and available under the GNU General Public License v3.

View File

@@ -1,117 +0,0 @@
import { describe, expect, it } from 'vitest';
import { getReferrerWithQuery, parseReferrer } from './parse-referrer';
describe('parseReferrer', () => {
  // Expected shape for URLs we cannot classify.
  const unknown = (url: string) => ({ name: '', type: 'unknown', url });

  it('should handle undefined or empty URLs', () => {
    expect(parseReferrer(undefined)).toEqual(unknown(''));
    expect(parseReferrer('')).toEqual(unknown(''));
  });

  it('should parse valid referrer URLs', () => {
    expect(parseReferrer('https://google.com/search?q=test')).toEqual({
      name: 'Google',
      type: 'search',
      url: 'https://google.com/search?q=test',
    });
  });

  it('should handle www prefix in hostnames', () => {
    // Both the www and bare hostname must resolve to the same referrer.
    for (const url of [
      'https://www.twitter.com/user',
      'https://twitter.com/user',
    ]) {
      expect(parseReferrer(url)).toEqual({
        name: 'Twitter',
        type: 'social',
        url,
      });
    }
  });

  it('should handle unknown referrers', () => {
    expect(parseReferrer('https://unknown-site.com')).toEqual(
      unknown('https://unknown-site.com'),
    );
  });

  it('should handle invalid URLs', () => {
    expect(parseReferrer('not-a-url')).toEqual(unknown('not-a-url'));
  });
});
describe('getReferrerWithQuery', () => {
  // Query-derived referrers never carry a URL, only name + type.
  const expectHit = (
    query: Record<string, string>,
    name: string,
    type: string,
  ) => expect(getReferrerWithQuery(query)).toEqual({ name, type, url: '' });

  it('should handle undefined or empty query', () => {
    expect(getReferrerWithQuery(undefined)).toBeNull();
    expect(getReferrerWithQuery({})).toBeNull();
  });

  it('should parse utm_source parameter', () => {
    expectHit({ utm_source: 'google' }, 'Google', 'unknown');
  });

  it('should parse ref parameter', () => {
    expectHit({ ref: 'facebook' }, 'Facebook', 'social');
  });

  it('should parse utm_referrer parameter', () => {
    expectHit({ utm_referrer: 'twitter' }, 'Twitter', 'social');
  });

  it('should handle case-insensitive matching', () => {
    expectHit({ utm_source: 'GoOgLe' }, 'Google', 'unknown');
  });

  it('should handle unknown sources', () => {
    expectHit({ utm_source: 'unknown-source' }, 'unknown-source', 'unknown');
  });

  it('should prioritize utm_source over ref and utm_referrer', () => {
    expectHit(
      { utm_source: 'google', ref: 'facebook', utm_referrer: 'twitter' },
      'Google',
      'unknown',
    );
  });
});

View File

@@ -1,59 +0,0 @@
import { stripTrailingSlash } from '@openpanel/common';
import referrers from '../referrers';
/**
 * Extract the hostname from a URL string.
 *
 * @param url - The URL to parse; may be undefined or empty.
 * @returns The hostname, or '' when the URL is missing or unparseable
 *          (never throws — invalid input is treated as "no referrer").
 */
function getHostname(url: string | undefined): string {
  if (!url) {
    return '';
  }
  try {
    return new URL(url).hostname;
  } catch {
    // Optional catch binding: the error itself is irrelevant here,
    // any parse failure simply means "no usable hostname".
    return '';
  }
}
/**
 * Classify a referrer URL against the known-referrers database.
 * Tries the exact hostname first, then the hostname without its 'www.'
 * prefix; unmatched URLs fall back to an 'unknown' type with empty name.
 */
export function parseReferrer(url: string | undefined) {
  const hostname = getHostname(url);
  const bareHostname = hostname.replace('www.', '');
  const entry = referrers[hostname] ?? referrers[bareHostname];
  const cleanUrl = stripTrailingSlash(url ?? '');
  if (entry) {
    return {
      name: entry.name ?? '',
      type: entry.type ?? 'unknown',
      url: cleanUrl,
    };
  }
  return { name: '', type: 'unknown', url: cleanUrl };
}
/**
 * Resolve a referrer from query-string attribution parameters.
 * Precedence: utm_source, then ref, then utm_referrer. The source is
 * matched case-insensitively against known referrer names, then as a
 * direct database key; otherwise it is echoed back with type 'unknown'.
 * Returns null when no usable source parameter is present.
 */
export function getReferrerWithQuery(
  query: Record<string, string> | undefined,
) {
  if (!query) {
    return null;
  }
  const source = query.utm_source ?? query.ref ?? query.utm_referrer ?? '';
  if (!source) {
    return null;
  }
  const wanted = source.toLowerCase();
  const byName = Object.values(referrers).find(
    (entry) => entry.name.toLowerCase() === wanted,
  );
  const match = byName || referrers[source];
  if (!match) {
    return { name: source, type: 'unknown', url: '' };
  }
  return { name: match.name, type: match.type, url: '' };
}