diff --git a/db/clickhouse/schema.sql b/db/clickhouse/schema.sql index e9160329..625ee5a3 100644 --- a/db/clickhouse/schema.sql +++ b/db/clickhouse/schema.sql @@ -39,6 +39,7 @@ CREATE TABLE umami.website_event event_name String, tag String, distinct_id String, + visitor_id String, created_at DateTime('UTC'), job_id Nullable(UUID) ) @@ -123,6 +124,7 @@ CREATE TABLE umami.website_event_stats_hourly max_time SimpleAggregateFunction(max, DateTime('UTC')), tag SimpleAggregateFunction(groupArrayArray, Array(String)), distinct_id String, + visitor_id String, created_at Datetime('UTC') ) ENGINE = AggregatingMergeTree @@ -176,6 +178,7 @@ SELECT max_time, tag, distinct_id, + visitor_id, timestamp as created_at FROM (SELECT website_id, @@ -214,6 +217,7 @@ FROM (SELECT max(created_at) max_time, arrayFilter(x -> x != '', groupArray(tag)) tag, distinct_id, + visitor_id, toStartOfHour(created_at) timestamp FROM umami.website_event GROUP BY website_id, @@ -230,6 +234,7 @@ GROUP BY website_id, city, event_type, distinct_id, + visitor_id, timestamp); -- projections @@ -281,3 +286,16 @@ JOIN (SELECT event_id, string_value as currency WHERE positionCaseInsensitive(data_key, 'currency') > 0) c ON c.event_id = ed.event_id WHERE positionCaseInsensitive(data_key, 'revenue') > 0; + +-- identity linking +CREATE TABLE umami.identity_link +( + website_id UUID, + visitor_id String, + distinct_id String, + created_at DateTime('UTC'), + linked_at DateTime('UTC') +) +ENGINE = ReplacingMergeTree(linked_at) +ORDER BY (website_id, visitor_id, distinct_id) +SETTINGS index_granularity = 8192; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index aeb11648..90bfdb52 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -43,6 +43,7 @@ model Session { region String? @db.VarChar(20) city String? @db.VarChar(50) distinctId String? @map("distinct_id") @db.VarChar(50) + visitorId String? @map("visitor_id") @db.VarChar(50) createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6) websiteEvents WebsiteEvent[] @@ -60,6 +61,7 @@ model Session { @@index([websiteId, createdAt, country]) @@index([websiteId, createdAt, region]) @@index([websiteId, createdAt, city]) + @@index([websiteId, visitorId]) @@map("session") } @@ -76,14 +78,15 @@ model Website { updatedAt DateTime? @updatedAt @map("updated_at") @db.Timestamptz(6) deletedAt DateTime? @map("deleted_at") @db.Timestamptz(6) - user User? @relation("user", fields: [userId], references: [id]) - createUser User? @relation("createUser", fields: [createdBy], references: [id]) - team Team? @relation(fields: [teamId], references: [id]) - eventData EventData[] - reports Report[] - revenue Revenue[] - segments Segment[] - sessionData SessionData[] + user User? @relation("user", fields: [userId], references: [id]) + createUser User? @relation("createUser", fields: [createdBy], references: [id]) + team Team? @relation(fields: [teamId], references: [id]) + eventData EventData[] + reports Report[] + revenue Revenue[] + segments Segment[] + sessionData SessionData[] + identityLinks IdentityLink[] @@index([userId]) @@index([teamId]) @@ -316,3 +319,19 @@ model Pixel { @@index([createdAt]) @@map("pixel") } + +model IdentityLink { + id String @id @unique @map("identity_link_id") @db.Uuid + websiteId String @map("website_id") @db.Uuid + visitorId String @map("visitor_id") @db.VarChar(50) + distinctId String @map("distinct_id") @db.VarChar(50) + createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6) + linkedAt DateTime @default(now()) @updatedAt @map("linked_at") @db.Timestamptz(6) + + website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade) + + @@unique([websiteId, visitorId, distinctId]) + @@index([websiteId, distinctId]) + @@index([websiteId, visitorId]) + @@map("identity_link") +} diff --git a/src/app/api/send/route.ts b/src/app/api/send/route.ts index 2c2085bf..62776c6a 100644 --- a/src/app/api/send/route.ts +++ b/src/app/api/send/route.ts @@ -11,7 +11,7 @@ import { secret, uuid, hash } from '@/lib/crypto'; import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants'; import { anyObjectParam, urlOrPathParam } from '@/lib/schema'; import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url'; -import { createSession, saveEvent, saveSessionData } from '@/queries/sql'; +import { createSession, saveEvent, saveSessionData, createIdentityLink } from '@/queries/sql'; import { serializeError } from 'serialize-error'; interface Cache { @@ -41,6 +41,7 @@ const schema = z.object({ userAgent: z.string().optional(), timestamp: z.coerce.number().int().optional(), id: z.string().optional(), + vid: z.string().max(50).optional(), }) .refine( data => { @@ -80,6 +81,7 @@ export async function POST(request: Request) { tag, timestamp, id, + vid: visitorId, } = payload; const sourceId = websiteId || pixelId || linkId; @@ -146,6 +148,7 @@ export async function POST(request: Request) { region, city, distinctId: id, + visitorId, createdAt, }); } @@ -226,6 +229,7 @@ export async function POST(request: Request) { // Session distinctId: id, + visitorId, browser, os, device, @@ -265,6 +269,19 @@ export async function POST(request: Request) { createdAt, }); } + + // Create identity link when both visitorId and distinctId are present + // Fire-and-forget to avoid adding latency to the tracking endpoint + if (visitorId && id && websiteId) { + createIdentityLink({ + websiteId, + visitorId, + distinctId: id, + }).catch(e => { + // eslint-disable-next-line no-console + console.error('Failed to create identity link:', e); + }); + } } const token = createToken({ websiteId, sessionId, visitId, iat }, secret()); diff --git a/src/queries/sql/events/saveEvent.ts b/src/queries/sql/events/saveEvent.ts index c8a9cbe9..da7a3549 100644 --- a/src/queries/sql/events/saveEvent.ts +++ b/src/queries/sql/events/saveEvent.ts @@ -25,6 +25,7 @@ export interface SaveEventArgs { // Session distinctId?: string; + visitorId?: string; browser?: string; os?: string; device?: string; @@ -164,6 +165,7 @@ async function clickhouseQuery({ referrerQuery, referrerDomain, distinctId, + visitorId, browser, os, device, @@ -220,6 +222,7 @@ async function clickhouseQuery({ event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null, tag: tag, distinct_id: distinctId, + visitor_id: visitorId, created_at: getUTCString(createdAt), browser, os, diff --git a/src/queries/sql/getWebsiteStats.ts b/src/queries/sql/getWebsiteStats.ts index 4a4bef78..e6d0ef47 100644 --- a/src/queries/sql/getWebsiteStats.ts +++ b/src/queries/sql/getWebsiteStats.ts @@ -37,7 +37,7 @@ async function relationalQuery( ` select cast(coalesce(sum(t.c), 0) as bigint) as "pageviews", - count(distinct t.session_id) as "visitors", + count(distinct coalesce(t.resolved_identity, t.visitor_id, t.session_id::text)) as "visitors", count(distinct t.visit_id) as "visits", coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces", cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime" @@ -45,17 +45,23 @@ async function relationalQuery( select website_event.session_id, website_event.visit_id, + session.visitor_id, + il.distinct_id as "resolved_identity", count(*) as "c", min(website_event.created_at) as "min_time", max(website_event.created_at) as "max_time" from website_event ${cohortQuery} - ${joinSessionQuery} + ${joinSessionQuery} + left join session on session.session_id = website_event.session_id + and session.website_id = website_event.website_id + left join identity_link il on il.visitor_id = session.visitor_id + and il.website_id = session.website_id where website_event.website_id = {{websiteId::uuid}} and website_event.created_at between {{startDate}} and {{endDate}} and website_event.event_type != 2 ${filterQuery} - group by 1, 2 + group by 1, 2, 3, 4 ) as t `, queryParams, @@ -79,47 +85,55 @@ async function clickhouseQuery( sql = ` select sum(t.c) as "pageviews", - uniq(t.session_id) as "visitors", + uniq(coalesce(t.resolved_identity, t.visitor_id, toString(t.session_id))) as "visitors", uniq(t.visit_id) as "visits", sum(if(t.c = 1, 1, 0)) as "bounces", sum(max_time-min_time) as "totaltime" from ( select - session_id, - visit_id, + we.session_id, + we.visit_id, + we.visitor_id, + il.distinct_id as resolved_identity, count(*) c, - min(created_at) min_time, - max(created_at) max_time - from website_event + min(we.created_at) min_time, + max(we.created_at) max_time + from website_event we ${cohortQuery} - where website_id = {websiteId:UUID} - and created_at between {startDate:DateTime64} and {endDate:DateTime64} - and event_type != 2 + left join identity_link final il on il.visitor_id = we.visitor_id + and il.website_id = we.website_id + where we.website_id = {websiteId:UUID} + and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} + and we.event_type != 2 ${filterQuery} - group by session_id, visit_id + group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id ) as t; `; } else { sql = ` select sum(t.c) as "pageviews", - uniq(session_id) as "visitors", + uniq(coalesce(resolved_identity, visitor_id, toString(session_id))) as "visitors", uniq(visit_id) as "visits", sumIf(1, t.c = 1) as "bounces", sum(max_time-min_time) as "totaltime" from (select - session_id, - visit_id, - sum(views) c, - min(min_time) min_time, - max(max_time) max_time - from website_event_stats_hourly "website_event" + we.session_id, + we.visit_id, + we.visitor_id, + il.distinct_id as resolved_identity, + sum(we.views) c, + min(we.min_time) min_time, + max(we.max_time) max_time + from website_event_stats_hourly we ${cohortQuery} - where website_id = {websiteId:UUID} - and created_at between {startDate:DateTime64} and {endDate:DateTime64} - and event_type != 2 + left join identity_link final il on il.visitor_id = we.visitor_id + and il.website_id = we.website_id + where we.website_id = {websiteId:UUID} + and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} + and we.event_type != 2 ${filterQuery} - group by session_id, visit_id + group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id ) as t; `; } diff --git a/src/queries/sql/identity/createIdentityLink.ts b/src/queries/sql/identity/createIdentityLink.ts new file mode 100644 index 00000000..61c49cbf --- /dev/null +++ b/src/queries/sql/identity/createIdentityLink.ts @@ -0,0 +1,76 @@ +/** + * Identity Stitching - Links anonymous browser sessions to authenticated user identities + * + * Design decisions: + * - One visitor can link to multiple distinct_ids (user logs into different accounts) + * - One distinct_id can link to multiple visitors (user on multiple devices/browsers) + * - Links are additive and never invalidated (preserves historical journey) + * - Uses ReplacingMergeTree in ClickHouse with linked_at for deduplication + * - Upsert pattern ensures idempotency for repeated identify() calls + * + * Edge cases handled: + * - Safari private browsing: visitorId will be undefined, no link created + * - localStorage cleared: new visitorId generated, creates new link + * - Multiple tabs: same visitorId shared via localStorage + */ +import { uuid } from '@/lib/crypto'; +import prisma from '@/lib/prisma'; +import clickhouse from '@/lib/clickhouse'; +import kafka from '@/lib/kafka'; +import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db'; + +export interface CreateIdentityLinkArgs { + websiteId: string; + visitorId: string; + distinctId: string; +} + +export async function createIdentityLink(data: CreateIdentityLinkArgs) { + return runQuery({ + [PRISMA]: () => relationalQuery(data), + [CLICKHOUSE]: () => clickhouseQuery(data), + }); +} + +async function relationalQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) { + const { client } = prisma; + + return client.identityLink.upsert({ + where: { + websiteId_visitorId_distinctId: { + websiteId, + visitorId, + distinctId, + }, + }, + update: { + linkedAt: new Date(), + }, + create: { + id: uuid(), + websiteId, + visitorId, + distinctId, + }, + }); +} + +async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) { + const { insert, getUTCString } = clickhouse; + const { sendMessage } = kafka; + + const now = getUTCString(new Date()); + const message = { + website_id: websiteId, + visitor_id: visitorId, + distinct_id: distinctId, + created_at: now, + linked_at: now, + }; + + if (kafka.enabled) { + await sendMessage('identity_link', message); + } else { + await insert('identity_link', [message]); + } +} diff --git a/src/queries/sql/identity/getLinkedVisitorIds.ts b/src/queries/sql/identity/getLinkedVisitorIds.ts new file mode 100644 index 00000000..a328ff82 --- /dev/null +++ b/src/queries/sql/identity/getLinkedVisitorIds.ts @@ -0,0 +1,71 @@ +/** + * Resolves all visitor IDs linked to a given distinct_id (authenticated user) + * + * Use cases (for future implementation): + * - User journey reports: aggregate sessions across devices + * - Cohort analysis: include all linked sessions + * - Retroactive attribution: credit conversions to original anonymous session + * + * Note: Uses FINAL keyword in ClickHouse to ensure deduplication from ReplacingMergeTree + */ +import prisma from '@/lib/prisma'; +import clickhouse from '@/lib/clickhouse'; +import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db'; + +export interface GetLinkedVisitorIdsArgs { + websiteId: string; + distinctId: string; +} + +export interface LinkedVisitorId { + visitorId: string; + linkedAt: Date; +} + +export async function getLinkedVisitorIds( + data: GetLinkedVisitorIdsArgs, +): Promise { + return runQuery({ + [PRISMA]: () => relationalQuery(data), + [CLICKHOUSE]: () => clickhouseQuery(data), + }); +} + +async function relationalQuery({ + websiteId, + distinctId, +}: GetLinkedVisitorIdsArgs): Promise { + const { client } = prisma; + + const links = await client.identityLink.findMany({ + where: { + websiteId, + distinctId, + }, + select: { + visitorId: true, + linkedAt: true, + }, + }); + + return links; +} + +async function clickhouseQuery({ + websiteId, + distinctId, +}: GetLinkedVisitorIdsArgs): Promise { + const { rawQuery } = clickhouse; + + return rawQuery( + ` + select + visitor_id as visitorId, + linked_at as linkedAt + from identity_link final + where website_id = {websiteId:UUID} + and distinct_id = {distinctId:String} + `, + { websiteId, distinctId }, + ); +} diff --git a/src/queries/sql/identity/index.ts b/src/queries/sql/identity/index.ts new file mode 100644 index 00000000..cd81b3b9 --- /dev/null +++ b/src/queries/sql/identity/index.ts @@ -0,0 +1,2 @@ +export * from './createIdentityLink'; +export * from './getLinkedVisitorIds'; diff --git a/src/queries/sql/index.ts b/src/queries/sql/index.ts index 682ac6d2..5ac66973 100644 --- a/src/queries/sql/index.ts +++ b/src/queries/sql/index.ts @@ -39,3 +39,5 @@ export * from './getValues'; export * from './getWebsiteDateRange'; export * from './getWebsiteStats'; export * from './getWeeklyTraffic'; +export * from './identity/createIdentityLink'; +export * from './identity/getLinkedVisitorIds'; diff --git a/src/queries/sql/sessions/createSession.ts b/src/queries/sql/sessions/createSession.ts index b5106a54..013ca412 100644 --- a/src/queries/sql/sessions/createSession.ts +++ b/src/queries/sql/sessions/createSession.ts @@ -20,6 +20,7 @@ export async function createSession(data: Prisma.SessionCreateInput) { region, city, distinct_id, + visitor_id, created_at ) values ( @@ -34,6 +35,7 @@ export async function createSession(data: Prisma.SessionCreateInput) { {{region}}, {{city}}, {{distinctId}}, + {{visitorId}}, {{createdAt}} ) on conflict (session_id) do nothing diff --git a/src/tracker/index.js b/src/tracker/index.js index a9966198..f60cf7c2 100644 --- a/src/tracker/index.js +++ b/src/tracker/index.js @@ -29,6 +29,7 @@ const excludeHash = attr(_data + 'exclude-hash') === _true; const domain = attr(_data + 'domains') || ''; const credentials = attr(_data + 'fetch-credentials') || 'omit'; + const identityStitching = attr(_data + 'identity-stitching') !== _false; const domains = domain.split(',').map(n => n.trim()); const host = @@ -41,6 +42,38 @@ /* Helper functions */ + /** + * Identity Stitching: Generates a persistent visitor ID stored in localStorage. + * When combined with identify(), links anonymous sessions to authenticated users. + * Gracefully degrades when localStorage is unavailable (Safari private browsing). + */ + const generateUUID = () => + 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => { + const r = (Math.random() * 16) | 0; + return (c === 'x' ? r : (r & 0x3) | 0x8).toString(16); + }); + + const getVisitorId = () => { + if (!identityStitching || !localStorage) return undefined; + + try { + const storageKey = 'umami.visitor'; + let vid = localStorage.getItem(storageKey); + + if (!vid) { + vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID(); + localStorage.setItem(storageKey, vid); + } + + return vid; + } catch { + // localStorage access throws in Safari private browsing + return undefined; + } + }; + + const visitorId = getVisitorId(); + const normalize = raw => { if (!raw) return raw; try { @@ -63,6 +96,7 @@ referrer: currentRef, tag, id: identity ? identity : undefined, + vid: visitorId, }); const hasDoNotTrack = () => { @@ -141,12 +175,21 @@ /* Tracking functions */ - const trackingDisabled = () => - disabled || - !website || - (localStorage && localStorage.getItem('umami.disabled')) || - (domain && !domains.includes(hostname)) || - (dnt && hasDoNotTrack()); + const trackingDisabled = () => { + let storageDisabled = false; + try { + storageDisabled = localStorage && localStorage.getItem('umami.disabled'); + } catch { + // localStorage throws in Safari private browsing + } + return ( + disabled || + !website || + storageDisabled || + (domain && !domains.includes(hostname)) || + (dnt && hasDoNotTrack()) + ); + }; const send = async (payload, type = 'event') => { if (trackingDisabled()) return;