From f6ee1ac4db22d7a9b6dedabb0ba519c0ebcc0fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Mon, 24 Nov 2025 12:00:12 +0100 Subject: [PATCH] wip --- packages/db/code-migrations/7-order-keys.ts | 263 ++++++++++++++++++++ packages/db/code-migrations/migrate.ts | 4 +- packages/db/src/clickhouse/migration.ts | 98 +++++++- 3 files changed, 356 insertions(+), 9 deletions(-) create mode 100644 packages/db/code-migrations/7-order-keys.ts diff --git a/packages/db/code-migrations/7-order-keys.ts b/packages/db/code-migrations/7-order-keys.ts new file mode 100644 index 00000000..244b19d2 --- /dev/null +++ b/packages/db/code-migrations/7-order-keys.ts @@ -0,0 +1,263 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { + chMigrationClient, + createTable, + moveDataBetweenTables, + renameTable, + runClickhouseMigrationCommands, +} from '../src/clickhouse/migration'; +import { getIsCluster } from './helpers'; + +/** + * Migration to update ORDER BY keys for events and sessions tables. + * + * Changes: + * - Events: Remove profile_id from ORDER BY, add created_at for better ordering + * Old: ['project_id', 'toDate(created_at)', 'profile_id', 'name'] + * New: ['project_id', 'toDate(created_at)', 'created_at', 'name'] + * + * - Sessions: Remove profile_id from ORDER BY, reorder to match query patterns + * Old: ['project_id', 'id', 'toDate(created_at)', 'profile_id'] + * New: ['project_id', 'toDate(created_at)', 'created_at', 'id'] + * + * Rationale: + * - project_id: Always filtered first (100% of queries) + * - toDate(created_at): Almost always filtered (95%+ of queries), good for partitioning + * - created_at: Helps with ordering within same day, matches ORDER BY patterns in queries + * - name (events): Frequently filtered (screen_view, session_start, etc.), good selectivity + * - id (sessions): Used for ordering and uniqueness in session queries + */ +export async function up() { + const isClustered = getIsCluster(); + + const sqls: string[] = []; + + const eventTables = createTable({ + name: 'events_new_20251123', + columns: [ + '`id` UUID DEFAULT generateUUIDv4()', + '`name` LowCardinality(String)', + '`sdk_name` LowCardinality(String)', + '`sdk_version` LowCardinality(String)', + '`device_id` String CODEC(ZSTD(3))', + '`profile_id` String CODEC(ZSTD(3))', + '`project_id` String CODEC(ZSTD(3))', + '`session_id` String CODEC(LZ4)', + '`path` String CODEC(ZSTD(3))', + '`origin` String CODEC(ZSTD(3))', + '`referrer` String CODEC(ZSTD(3))', + '`referrer_name` String CODEC(ZSTD(3))', + '`referrer_type` LowCardinality(String)', + '`revenue` UInt64', + '`duration` UInt64 CODEC(Delta(4), LZ4)', + '`properties` Map(String, String) CODEC(ZSTD(3))', + '`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3))', + '`country` LowCardinality(FixedString(2))', + '`city` String', + '`region` LowCardinality(String)', + '`longitude` Nullable(Float32) CODEC(Gorilla, LZ4)', + '`latitude` Nullable(Float32) CODEC(Gorilla, LZ4)', + '`os` LowCardinality(String)', + '`os_version` LowCardinality(String)', + '`browser` LowCardinality(String)', + '`browser_version` LowCardinality(String)', + '`device` LowCardinality(String)', + '`brand` LowCardinality(String)', + '`model` LowCardinality(String)', + '`imported_at` Nullable(DateTime) CODEC(Delta(4), LZ4)', + ], + indices: [ + 'INDEX idx_name name TYPE bloom_filter GRANULARITY 1', + "INDEX idx_properties_bounce properties['__bounce'] TYPE set(3) GRANULARITY 1", + 'INDEX idx_origin origin TYPE bloom_filter(0.05) GRANULARITY 1', + 'INDEX idx_path path TYPE bloom_filter(0.01) GRANULARITY 1', + ], + // New ORDER BY: project_id, toDate(created_at), created_at, name + // Removed profile_id, added created_at for better ordering within same day + orderBy: ['project_id', 'toDate(created_at)', 'created_at', 'name'], + partitionBy: 'toYYYYMM(created_at)', + settings: { + index_granularity: 8192, + // For lightweight updates + enable_block_offset_column: 1, + enable_block_number_column: 1, + }, + distributionHash: + 'cityHash64(project_id, toString(toStartOfHour(created_at)))', + replicatedVersion: '1', + isClustered, + }); + + // Step 1: Create temporary tables with new ORDER BY keys + // Events table with new ORDER BY + sqls.push(...eventTables); + + const sessionTables = createTable({ + name: 'sessions_new_20251123', + engine: 'VersionedCollapsingMergeTree(sign, version)', + columns: [ + '`id` String', + '`project_id` String CODEC(ZSTD(3))', + '`profile_id` String CODEC(ZSTD(3))', + '`device_id` String CODEC(ZSTD(3))', + '`created_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3))', + '`ended_at` DateTime64(3) CODEC(DoubleDelta, ZSTD(3))', + '`is_bounce` Bool', + '`entry_origin` LowCardinality(String)', + '`entry_path` String CODEC(ZSTD(3))', + '`exit_origin` LowCardinality(String)', + '`exit_path` String CODEC(ZSTD(3))', + '`screen_view_count` Int32', + '`revenue` Float64', + '`event_count` Int32', + '`duration` UInt32', + '`country` LowCardinality(FixedString(2))', + '`region` LowCardinality(String)', + '`city` String', + '`longitude` Nullable(Float32) CODEC(Gorilla, LZ4)', + '`latitude` Nullable(Float32) CODEC(Gorilla, LZ4)', + '`device` LowCardinality(String)', + '`brand` LowCardinality(String)', + '`model` LowCardinality(String)', + '`browser` LowCardinality(String)', + '`browser_version` LowCardinality(String)', + '`os` LowCardinality(String)', + '`os_version` LowCardinality(String)', + '`utm_medium` String CODEC(ZSTD(3))', + '`utm_source` String CODEC(ZSTD(3))', + '`utm_campaign` String CODEC(ZSTD(3))', + '`utm_content` String CODEC(ZSTD(3))', + '`utm_term` String CODEC(ZSTD(3))', + '`referrer` String CODEC(ZSTD(3))', + '`referrer_name` String CODEC(ZSTD(3))', + '`referrer_type` LowCardinality(String)', + '`sign` Int8', + '`version` UInt64', + '`properties` Map(String, String) CODEC(ZSTD(3))', + ], + // New ORDER BY: project_id, toDate(created_at), created_at, id + // Removed profile_id, reordered to match query patterns (date first, then id) + orderBy: ['project_id', 'toDate(created_at)', 'created_at'], + partitionBy: 'toYYYYMM(created_at)', + settings: { + index_granularity: 8192, + }, + distributionHash: + 'cityHash64(project_id, toString(toStartOfHour(created_at)))', + replicatedVersion: '1', + isClustered, + }); + + // Sessions table with new ORDER BY + sqls.push(...sessionTables); + + const firstEventDateResponse = await chMigrationClient.query({ + query: 'SELECT min(created_at) as created_at FROM events', + format: 'JSONEachRow', + }); + const firstEventDateJson = await firstEventDateResponse.json<{ + created_at: string; + }>(); + const firstEventDate = new Date(firstEventDateJson[0]?.created_at ?? ''); + if (firstEventDate) { + // Step 2: Copy data from old tables to new tables (partitioned by month for efficiency) + sqls.push( + ...moveDataBetweenTables({ + from: 'events', + to: 'events_new_20251123', + batch: { + startDate: firstEventDate, + column: 'toDate(created_at)', + interval: 'month', + transform: (date: Date) => { + const year = date.getFullYear(); + const month = String(date.getMonth() + 1).padStart(2, '0'); + return `${year}-${month}-01`; + }, + }, + }), + ); + } + + const firstSessionDateResponse = await chMigrationClient.query({ + query: 'SELECT min(created_at) as created_at FROM sessions', + format: 'JSONEachRow', + }); + const firstSessionDateJson = await firstSessionDateResponse.json<{ + created_at: string; + }>(); + + const firstSessionDate = new Date(firstSessionDateJson[0]?.created_at ?? ''); + if (firstSessionDate) { + sqls.push( + ...moveDataBetweenTables({ + from: 'sessions', + to: 'sessions_new_20251123', + batch: { + startDate: firstSessionDate, + column: 'toDate(created_at)', + interval: 'month', + transform: (date: Date) => { + const year = date.getFullYear(); + const month = String(date.getMonth() + 1).padStart(2, '0'); + return `${year}-${month}-01`; + }, + }, + }), + ); + } + + sqls.push( + ...renameTable({ from: 'events', to: 'events_20251123', isClustered }), + ); + sqls.push( + ...renameTable({ from: 'sessions', to: 'sessions_20251123', isClustered }), + ); + + if (isClustered && sessionTables[1] && eventTables[1]) { + sqls.push( + // Drop temporary DISTRIBUTED tables (will be recreated) + 'DROP TABLE IF EXISTS events_new_20251123 ON CLUSTER "{cluster}"', + 'DROP TABLE IF EXISTS sessions_new_20251123 ON CLUSTER "{cluster}"', + // Rename new tables to correct names + 'RENAME TABLE events_new_20251123_replicated TO events_replicated ON CLUSTER "{cluster}"', + 'RENAME TABLE sessions_new_20251123_replicated TO sessions_replicated ON CLUSTER "{cluster}"', + // Create new distributed tables + eventTables[1].replaceAll('events_new_20251123', 'events'), // creates a new distributed table + sessionTables[1].replaceAll('sessions_new_20251123', 'sessions'), // creates a new distributed table + ); + } else { + sqls.push( + ...renameTable({ + from: 'events_new_20251123', + to: 'events', + isClustered, + }), + ); + sqls.push( + ...renameTable({ + from: 'sessions_new_20251123', + to: 'sessions', + isClustered, + }), + ); + } + + fs.writeFileSync( + path.join(__filename.replace('.ts', '.sql')), + sqls + .map((sql) => + sql + .trim() + .replace(/;$/, '') + .replace(/\n{2,}/g, '\n') + .concat(';'), + ) + .join('\n\n---\n\n'), + ); + + if (!process.argv.includes('--dry')) { + await runClickhouseMigrationCommands(sqls); + } +} diff --git a/packages/db/code-migrations/migrate.ts b/packages/db/code-migrations/migrate.ts index 5f3a0e65..fa1ff99f 100644 --- a/packages/db/code-migrations/migrate.ts +++ b/packages/db/code-migrations/migrate.ts @@ -56,7 +56,9 @@ async function migrate() { if (!getIsSelfHosting()) { printBoxMessage('🕒 Migrations starts in 10 seconds', []); - await new Promise((resolve) => setTimeout(resolve, 10000)); + if (!getIsDry()) { + await new Promise((resolve) => setTimeout(resolve, 10000)); + } } if (migration) { diff --git a/packages/db/src/clickhouse/migration.ts b/packages/db/src/clickhouse/migration.ts index 90c8a33c..f4fcbf41 100644 --- a/packages/db/src/clickhouse/migration.ts +++ b/packages/db/src/clickhouse/migration.ts @@ -247,32 +247,114 @@ export function moveDataBetweenTables({ let currentDate = endDate; const interval = batch.interval || 'day'; - while (currentDate > startDate) { + // Helper function to get the start of the week (Monday) for a given date + const getWeekStart = (date: Date): Date => { + const d = new Date(date); + const day = d.getDay(); + const diff = d.getDate() - day + (day === 0 ? -6 : 1); // Adjust to Monday + d.setDate(diff); + d.setHours(0, 0, 0, 0); // Normalize to start of day + return d; + }; + + // Helper function to compare dates based on interval + const shouldContinue = ( + current: Date, + start: Date, + intervalType: string, + ): boolean => { + if (intervalType === 'month') { + // For months, compare by year and month + // Continue if current month is >= start month + const currentYear = current.getFullYear(); + const currentMonth = current.getMonth(); + const startYear = start.getFullYear(); + const startMonth = start.getMonth(); + return ( + currentYear > startYear || + (currentYear === startYear && currentMonth >= startMonth) + ); + } + if (intervalType === 'week') { + // For weeks, compare by week start dates + const currentWeekStart = getWeekStart(current); + const startWeekStart = getWeekStart(start); + return currentWeekStart >= startWeekStart; + } + return current > start; + }; + + while (shouldContinue(currentDate, startDate, interval)) { const previousDate = new Date(currentDate); switch (interval) { case 'month': previousDate.setMonth(previousDate.getMonth() - 1); - break; - case 'week': - previousDate.setDate(previousDate.getDate() - 7); - // Ensure we don't go below startDate - if (previousDate < startDate) { - previousDate.setTime(startDate.getTime()); + // If we've gone below startDate's month, adjust to start of startDate's month + // This ensures we generate SQL for the month containing startDate + if ( + previousDate.getFullYear() < startDate.getFullYear() || + (previousDate.getFullYear() === startDate.getFullYear() && + previousDate.getMonth() < startDate.getMonth()) + ) { + previousDate.setFullYear(startDate.getFullYear()); + previousDate.setMonth(startDate.getMonth()); + previousDate.setDate(1); } break; + case 'week': { + previousDate.setDate(previousDate.getDate() - 7); + // If we've gone below startDate's week, adjust to start of startDate's week + const startWeekStart = getWeekStart(startDate); + const prevWeekStart = getWeekStart(previousDate); + if (prevWeekStart < startWeekStart) { + previousDate.setTime(startWeekStart.getTime()); + } + break; + } // day default: previousDate.setDate(previousDate.getDate() - 1); break; } + // For monthly/weekly intervals with transform, we need to use the start of the next period for the upper bound + let upperBoundDate = currentDate; + if (interval === 'month' && batch.transform) { + const nextMonth = new Date(currentDate); + nextMonth.setMonth(nextMonth.getMonth() + 1); + nextMonth.setDate(1); + upperBoundDate = nextMonth; + } else if (interval === 'week' && batch.transform) { + const nextWeek = new Date(currentDate); + nextWeek.setDate(nextWeek.getDate() + 7); + const nextWeekStart = getWeekStart(nextWeek); + upperBoundDate = nextWeekStart; + } + const sql = `INSERT INTO ${to} SELECT * FROM ${from} WHERE ${batch.column} > '${batch.transform ? batch.transform(previousDate) : formatClickhouseDate(previousDate, true)}' - AND ${batch.column} <= '${batch.transform ? batch.transform(currentDate) : formatClickhouseDate(currentDate, true)}'`; + AND ${batch.column} <= '${batch.transform ? batch.transform(upperBoundDate) : formatClickhouseDate(upperBoundDate, true)}'`; sqls.push(sql); + // For monthly/weekly intervals, stop if we've reached the start period + if (interval === 'month') { + const prevYear = previousDate.getFullYear(); + const prevMonth = previousDate.getMonth(); + const startYear = startDate.getFullYear(); + const startMonth = startDate.getMonth(); + if (prevYear === startYear && prevMonth === startMonth) { + break; + } + } else if (interval === 'week') { + const prevWeekStart = getWeekStart(previousDate); + const startWeekStart = getWeekStart(startDate); + if (prevWeekStart.getTime() === startWeekStart.getTime()) { + break; + } + } + currentDate = previousDate; }