From a902a87c083d751646ceaeaf7213eed01ee4a152 Mon Sep 17 00:00:00 2001 From: Arthur Sepiol Date: Wed, 3 Dec 2025 16:06:54 +0300 Subject: [PATCH 1/3] feat: implement identity stitching for session linking (#3820) Adds automatic session linking/identity stitching to link anonymous browsing sessions with authenticated user sessions. ## Changes ### Database Schema - Add `identity_link` table (PostgreSQL + ClickHouse) to store mappings between visitor IDs and authenticated user IDs - Add `visitor_id` field to `Session` model - Add `visitor_id` column to ClickHouse `website_event` table ### Client Tracker - Generate and persist `visitor_id` in localStorage - Include `vid` in all tracking payloads - Support opt-out via `data-identity-stitching="false"` attribute ### API - Accept `vid` parameter in `/api/send` endpoint - Auto-create identity links when `identify()` is called with both visitor_id and distinct_id - Store visitor_id in sessions and events ### Query Updates - Update `getWebsiteStats` to deduplicate visitors by resolved identity - Visitors who browse anonymously then log in are now counted as one user ## Usage When a user logs in, call `umami.identify(userId)`. If identity stitching is enabled (default), the tracker automatically links the anonymous visitor_id to the authenticated userId. Stats queries then resolve linked identities to accurately count unique visitors. Resolves #3820 --- db/clickhouse/schema.sql | 17 ++++++ prisma/schema.prisma | 34 ++++++++--- src/app/api/send/route.ts | 15 ++++- src/queries/sql/events/saveEvent.ts | 3 + src/queries/sql/getWebsiteStats.ts | 59 ++++++++++-------- .../sql/identity/createIdentityLink.ts | 59 ++++++++++++++++++ .../sql/identity/getLinkedVisitorIds.ts | 61 +++++++++++++++++++ src/queries/sql/identity/index.ts | 2 + src/queries/sql/index.ts | 2 + src/queries/sql/sessions/createSession.ts | 2 + src/tracker/index.js | 24 ++++++++ 11 files changed, 245 insertions(+), 33 deletions(-) create mode 100644 src/queries/sql/identity/createIdentityLink.ts create mode 100644 src/queries/sql/identity/getLinkedVisitorIds.ts create mode 100644 src/queries/sql/identity/index.ts diff --git a/db/clickhouse/schema.sql b/db/clickhouse/schema.sql index e9160329..922cfe67 100644 --- a/db/clickhouse/schema.sql +++ b/db/clickhouse/schema.sql @@ -39,6 +39,7 @@ CREATE TABLE umami.website_event event_name String, tag String, distinct_id String, + visitor_id String, created_at DateTime('UTC'), job_id Nullable(UUID) ) @@ -123,6 +124,7 @@ CREATE TABLE umami.website_event_stats_hourly max_time SimpleAggregateFunction(max, DateTime('UTC')), tag SimpleAggregateFunction(groupArrayArray, Array(String)), distinct_id String, + visitor_id String, created_at Datetime('UTC') ) ENGINE = AggregatingMergeTree @@ -176,6 +178,7 @@ SELECT max_time, tag, distinct_id, + visitor_id, timestamp as created_at FROM (SELECT website_id, @@ -214,6 +217,7 @@ FROM (SELECT max(created_at) max_time, arrayFilter(x -> x != '', groupArray(tag)) tag, distinct_id, + visitor_id, toStartOfHour(created_at) timestamp FROM umami.website_event GROUP BY website_id, @@ -230,6 +234,7 @@ GROUP BY website_id, city, event_type, distinct_id, + visitor_id, timestamp); -- projections @@ -281,3 +286,15 @@ JOIN (SELECT event_id, string_value as currency WHERE positionCaseInsensitive(data_key, 'currency') > 0) c ON c.event_id = ed.event_id WHERE positionCaseInsensitive(data_key, 'revenue') > 0; + +-- identity linking +CREATE TABLE umami.identity_link +( + website_id UUID, + visitor_id String, + distinct_id String, + linked_at DateTime('UTC') +) +ENGINE = ReplacingMergeTree(linked_at) +ORDER BY (website_id, visitor_id, distinct_id) +SETTINGS index_granularity = 8192; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index aeb11648..471963eb 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -43,6 +43,7 @@ model Session { region String? @db.VarChar(20) city String? @db.VarChar(50) distinctId String? @map("distinct_id") @db.VarChar(50) + visitorId String? @map("visitor_id") @db.VarChar(50) createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6) websiteEvents WebsiteEvent[] @@ -60,6 +61,7 @@ model Session { @@index([websiteId, createdAt, country]) @@index([websiteId, createdAt, region]) @@index([websiteId, createdAt, city]) + @@index([websiteId, visitorId]) @@map("session") } @@ -76,14 +78,15 @@ model Website { updatedAt DateTime? @updatedAt @map("updated_at") @db.Timestamptz(6) deletedAt DateTime? @map("deleted_at") @db.Timestamptz(6) - user User? @relation("user", fields: [userId], references: [id]) - createUser User? @relation("createUser", fields: [createdBy], references: [id]) - team Team? @relation(fields: [teamId], references: [id]) - eventData EventData[] - reports Report[] - revenue Revenue[] - segments Segment[] - sessionData SessionData[] + user User? @relation("user", fields: [userId], references: [id]) + createUser User? @relation("createUser", fields: [createdBy], references: [id]) + team Team? @relation(fields: [teamId], references: [id]) + eventData EventData[] + reports Report[] + revenue Revenue[] + segments Segment[] + sessionData SessionData[] + identityLinks IdentityLink[] @@index([userId]) @@index([teamId]) @@ -316,3 +319,18 @@ model Pixel { @@index([createdAt]) @@map("pixel") } + +model IdentityLink { + id String @id @unique @map("identity_link_id") @db.Uuid + websiteId String @map("website_id") @db.Uuid + visitorId String @map("visitor_id") @db.VarChar(50) + distinctId String @map("distinct_id") @db.VarChar(50) + linkedAt DateTime @default(now()) @map("linked_at") @db.Timestamptz(6) + + website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade) + + @@unique([websiteId, visitorId, distinctId]) + @@index([websiteId, distinctId]) + @@index([websiteId, visitorId]) + @@map("identity_link") +} diff --git a/src/app/api/send/route.ts b/src/app/api/send/route.ts index 2c2085bf..bc7eb4a8 100644 --- a/src/app/api/send/route.ts +++ b/src/app/api/send/route.ts @@ -11,7 +11,7 @@ import { secret, uuid, hash } from '@/lib/crypto'; import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants'; import { anyObjectParam, urlOrPathParam } from '@/lib/schema'; import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url'; -import { createSession, saveEvent, saveSessionData } from '@/queries/sql'; +import { createSession, saveEvent, saveSessionData, createIdentityLink } from '@/queries/sql'; import { serializeError } from 'serialize-error'; interface Cache { @@ -41,6 +41,7 @@ const schema = z.object({ userAgent: z.string().optional(), timestamp: z.coerce.number().int().optional(), id: z.string().optional(), + vid: z.string().max(50).optional(), }) .refine( data => { @@ -80,6 +81,7 @@ export async function POST(request: Request) { tag, timestamp, id, + vid: visitorId, } = payload; const sourceId = websiteId || pixelId || linkId; @@ -146,6 +148,7 @@ export async function POST(request: Request) { region, city, distinctId: id, + visitorId, createdAt, }); } @@ -226,6 +229,7 @@ export async function POST(request: Request) { // Session distinctId: id, + visitorId, browser, os, device, @@ -265,6 +269,15 @@ export async function POST(request: Request) { createdAt, }); } + + // Create identity link when both visitorId and distinctId are present + if (visitorId && id && websiteId) { + await createIdentityLink({ + websiteId, + visitorId, + distinctId: id, + }); + } } const token = createToken({ websiteId, sessionId, visitId, iat }, secret()); diff --git a/src/queries/sql/events/saveEvent.ts b/src/queries/sql/events/saveEvent.ts index c8a9cbe9..da7a3549 100644 --- a/src/queries/sql/events/saveEvent.ts +++ b/src/queries/sql/events/saveEvent.ts @@ -25,6 +25,7 @@ export interface SaveEventArgs { // Session distinctId?: string; + visitorId?: string; browser?: string; os?: string; device?: string; @@ -164,6 +165,7 @@ async function clickhouseQuery({ referrerQuery, referrerDomain, distinctId, + visitorId, browser, os, device, @@ -220,6 +222,7 @@ async function clickhouseQuery({ event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null, tag: tag, distinct_id: distinctId, + visitor_id: visitorId, created_at: getUTCString(createdAt), browser, os, diff --git a/src/queries/sql/getWebsiteStats.ts b/src/queries/sql/getWebsiteStats.ts index 4a4bef78..42571d14 100644 --- a/src/queries/sql/getWebsiteStats.ts +++ b/src/queries/sql/getWebsiteStats.ts @@ -37,7 +37,7 @@ async function relationalQuery( ` select cast(coalesce(sum(t.c), 0) as bigint) as "pageviews", - count(distinct t.session_id) as "visitors", + count(distinct coalesce(t.resolved_identity, t.session_id::text)) as "visitors", count(distinct t.visit_id) as "visits", coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces", cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime" @@ -45,17 +45,22 @@ async function relationalQuery( select website_event.session_id, website_event.visit_id, + il.distinct_id as "resolved_identity", count(*) as "c", min(website_event.created_at) as "min_time", max(website_event.created_at) as "max_time" from website_event ${cohortQuery} - ${joinSessionQuery} + ${joinSessionQuery} + left join session on session.session_id = website_event.session_id + and session.website_id = website_event.website_id + left join identity_link il on il.visitor_id = session.visitor_id + and il.website_id = session.website_id where website_event.website_id = {{websiteId::uuid}} and website_event.created_at between {{startDate}} and {{endDate}} and website_event.event_type != 2 ${filterQuery} - group by 1, 2 + group by 1, 2, 3 ) as t `, queryParams, @@ -79,47 +84,53 @@ async function clickhouseQuery( sql = ` select sum(t.c) as "pageviews", - uniq(t.session_id) as "visitors", + uniq(coalesce(t.resolved_identity, t.session_id)) as "visitors", uniq(t.visit_id) as "visits", sum(if(t.c = 1, 1, 0)) as "bounces", sum(max_time-min_time) as "totaltime" from ( select - session_id, - visit_id, + we.session_id, + we.visit_id, + il.distinct_id as resolved_identity, count(*) c, - min(created_at) min_time, - max(created_at) max_time - from website_event + min(we.created_at) min_time, + max(we.created_at) max_time + from website_event we ${cohortQuery} - where website_id = {websiteId:UUID} - and created_at between {startDate:DateTime64} and {endDate:DateTime64} - and event_type != 2 + left join identity_link il on il.visitor_id = we.visitor_id + and il.website_id = we.website_id + where we.website_id = {websiteId:UUID} + and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} + and we.event_type != 2 ${filterQuery} - group by session_id, visit_id + group by we.session_id, we.visit_id, il.distinct_id ) as t; `; } else { sql = ` select sum(t.c) as "pageviews", - uniq(session_id) as "visitors", + uniq(coalesce(resolved_identity, session_id)) as "visitors", uniq(visit_id) as "visits", sumIf(1, t.c = 1) as "bounces", sum(max_time-min_time) as "totaltime" from (select - session_id, - visit_id, - sum(views) c, - min(min_time) min_time, - max(max_time) max_time - from website_event_stats_hourly "website_event" + we.session_id, + we.visit_id, + il.distinct_id as resolved_identity, + sum(we.views) c, + min(we.min_time) min_time, + max(we.max_time) max_time + from website_event_stats_hourly we ${cohortQuery} - where website_id = {websiteId:UUID} - and created_at between {startDate:DateTime64} and {endDate:DateTime64} - and event_type != 2 + left join identity_link il on il.visitor_id = we.visitor_id + and il.website_id = we.website_id + where we.website_id = {websiteId:UUID} + and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} + and we.event_type != 2 ${filterQuery} - group by session_id, visit_id + group by we.session_id, we.visit_id, il.distinct_id ) as t; `; } diff --git a/src/queries/sql/identity/createIdentityLink.ts b/src/queries/sql/identity/createIdentityLink.ts new file mode 100644 index 00000000..29a21a88 --- /dev/null +++ b/src/queries/sql/identity/createIdentityLink.ts @@ -0,0 +1,59 @@ +import { uuid } from '@/lib/crypto'; +import prisma from '@/lib/prisma'; +import clickhouse from '@/lib/clickhouse'; +import kafka from '@/lib/kafka'; +import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db'; + +export interface CreateIdentityLinkArgs { + websiteId: string; + visitorId: string; + distinctId: string; +} + +export async function createIdentityLink(data: CreateIdentityLinkArgs) { + return runQuery({ + [PRISMA]: () => relationalQuery(data), + [CLICKHOUSE]: () => clickhouseQuery(data), + }); +} + +async function relationalQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) { + const { client } = prisma; + + return client.identityLink.upsert({ + where: { + websiteId_visitorId_distinctId: { + websiteId, + visitorId, + distinctId, + }, + }, + update: { + linkedAt: new Date(), + }, + create: { + id: uuid(), + websiteId, + visitorId, + distinctId, + }, + }); +} + +async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) { + const { insert, getUTCString } = clickhouse; + const { sendMessage } = kafka; + + const message = { + website_id: websiteId, + visitor_id: visitorId, + distinct_id: distinctId, + linked_at: getUTCString(new Date()), + }; + + if (kafka.enabled) { + await sendMessage('identity_link', message); + } else { + await insert('identity_link', [message]); + } +} diff --git a/src/queries/sql/identity/getLinkedVisitorIds.ts b/src/queries/sql/identity/getLinkedVisitorIds.ts new file mode 100644 index 00000000..2f1758a4 --- /dev/null +++ b/src/queries/sql/identity/getLinkedVisitorIds.ts @@ -0,0 +1,61 @@ +import prisma from '@/lib/prisma'; +import clickhouse from '@/lib/clickhouse'; +import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db'; + +export interface GetLinkedVisitorIdsArgs { + websiteId: string; + distinctId: string; +} + +export interface LinkedVisitorId { + visitorId: string; + linkedAt: Date; +} + +export async function getLinkedVisitorIds( + data: GetLinkedVisitorIdsArgs, +): Promise { + return runQuery({ + [PRISMA]: () => relationalQuery(data), + [CLICKHOUSE]: () => clickhouseQuery(data), + }); +} + +async function relationalQuery({ + websiteId, + distinctId, +}: GetLinkedVisitorIdsArgs): Promise { + const { client } = prisma; + + const links = await client.identityLink.findMany({ + where: { + websiteId, + distinctId, + }, + select: { + visitorId: true, + linkedAt: true, + }, + }); + + return links; +} + +async function clickhouseQuery({ + websiteId, + distinctId, +}: GetLinkedVisitorIdsArgs): Promise { + const { rawQuery } = clickhouse; + + return rawQuery( + ` + select + visitor_id as visitorId, + linked_at as linkedAt + from identity_link final + where website_id = {websiteId:UUID} + and distinct_id = {distinctId:String} + `, + { websiteId, distinctId }, + ); +} diff --git a/src/queries/sql/identity/index.ts b/src/queries/sql/identity/index.ts new file mode 100644 index 00000000..cd81b3b9 --- /dev/null +++ b/src/queries/sql/identity/index.ts @@ -0,0 +1,2 @@ +export * from './createIdentityLink'; +export * from './getLinkedVisitorIds'; diff --git a/src/queries/sql/index.ts b/src/queries/sql/index.ts index 682ac6d2..5ac66973 100644 --- a/src/queries/sql/index.ts +++ b/src/queries/sql/index.ts @@ -39,3 +39,5 @@ export * from './getValues'; export * from './getWebsiteDateRange'; export * from './getWebsiteStats'; export * from './getWeeklyTraffic'; +export * from './identity/createIdentityLink'; +export * from './identity/getLinkedVisitorIds'; diff --git a/src/queries/sql/sessions/createSession.ts b/src/queries/sql/sessions/createSession.ts index b5106a54..013ca412 100644 --- a/src/queries/sql/sessions/createSession.ts +++ b/src/queries/sql/sessions/createSession.ts @@ -20,6 +20,7 @@ export async function createSession(data: Prisma.SessionCreateInput) { region, city, distinct_id, + visitor_id, created_at ) values ( @@ -34,6 +35,7 @@ export async function createSession(data: Prisma.SessionCreateInput) { {{region}}, {{city}}, {{distinctId}}, + {{visitorId}}, {{createdAt}} ) on conflict (session_id) do nothing diff --git a/src/tracker/index.js b/src/tracker/index.js index a9966198..94a1776c 100644 --- a/src/tracker/index.js +++ b/src/tracker/index.js @@ -29,6 +29,7 @@ const excludeHash = attr(_data + 'exclude-hash') === _true; const domain = attr(_data + 'domains') || ''; const credentials = attr(_data + 'fetch-credentials') || 'omit'; + const identityStitching = attr(_data + 'identity-stitching') !== _false; const domains = domain.split(',').map(n => n.trim()); const host = @@ -41,6 +42,28 @@ /* Helper functions */ + const generateUUID = () => + 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => { + const r = (Math.random() * 16) | 0; + return (c === 'x' ? r : (r & 0x3) | 0x8).toString(16); + }); + + const getVisitorId = () => { + if (!identityStitching || !localStorage) return undefined; + + const storageKey = 'umami.visitor'; + let vid = localStorage.getItem(storageKey); + + if (!vid) { + vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID(); + localStorage.setItem(storageKey, vid); + } + + return vid; + }; + + const visitorId = getVisitorId(); + const normalize = raw => { if (!raw) return raw; try { @@ -63,6 +86,7 @@ referrer: currentRef, tag, id: identity ? identity : undefined, + vid: visitorId, }); const hasDoNotTrack = () => { From 34db34759f131c3c861112cda38f9e484b6b3157 Mon Sep 17 00:00:00 2001 From: Arthur Sepiol Date: Wed, 3 Dec 2025 16:54:56 +0300 Subject: [PATCH 2/3] feat: implement automatic session linking and identity stitching (#3820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Links anonymous browser sessions to authenticated user identities, enabling unified user journey tracking across login boundaries. This solves the "logged-out anonymous session → logged-in session" tracking gap, providing complete funnel visibility and accurate visitor deduplication. ## Changes - Client-side: Persistent visitor ID in localStorage (data-identity-stitching attribute) - Server-side: identity_link table linking visitors to distinct IDs (authenticated users) - Query updates: getWebsiteStats now deduplicates by resolved identity - Graceful degradation: Works in Safari private browsing and when localStorage unavailable ## Implementation Details Uses hybrid approach combining client-side persistence with server-side linking: - Visitor ID generated once per browser, persists across sessions - When user logs in, identify() creates identity link - stats queries join through identity_link to deduplicate cross-device sessions Both PostgreSQL and ClickHouse supported with appropriate query patterns: - PostgreSQL: normalized schema, joins through session table - ClickHouse: denormalized with ReplacingMergeTree for deduplication ## Edge Cases Handled - Safari private browsing: localStorage throws, visitorId undefined, no link created - localStorage cleared: new visitorId generated, creates new link - Multiple tabs: same visitorId shared via localStorage - Multiple devices: one visitor can link to multiple distinct_ids - Multiple accounts: one distinct_id can link to multiple visitors ## Test Plan - [ ] Enable feature on test website (default enabled) - [ ] Anonymous pageview - confirm visitor_id in events table - [ ] Call umami.identify('user1') - confirm identity_link created - [ ] Stats show 1 visitor (deduplicated) - [ ] Log out, browse anonymously, stats still show 1 visitor - [ ] Test with data-identity-stitching="false" - no visitor_id collected - [ ] Test in Safari private browsing - no errors, gracefully skips - [ ] Test ClickHouse: verify identity_link table populated and FINAL keyword works - [ ] Verify retroactive: historical anonymous session attributed correctly --- db/clickhouse/schema.sql | 1 + prisma/schema.prisma | 11 ++--- src/app/api/send/route.ts | 6 ++- src/queries/sql/getWebsiteStats.ts | 8 ++-- .../sql/identity/createIdentityLink.ts | 19 +++++++- .../sql/identity/getLinkedVisitorIds.ts | 10 +++++ src/tracker/index.js | 45 +++++++++++++------ 7 files changed, 76 insertions(+), 24 deletions(-) diff --git a/db/clickhouse/schema.sql b/db/clickhouse/schema.sql index 922cfe67..625ee5a3 100644 --- a/db/clickhouse/schema.sql +++ b/db/clickhouse/schema.sql @@ -293,6 +293,7 @@ CREATE TABLE umami.identity_link website_id UUID, visitor_id String, distinct_id String, + created_at DateTime('UTC'), linked_at DateTime('UTC') ) ENGINE = ReplacingMergeTree(linked_at) diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 471963eb..90bfdb52 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -321,11 +321,12 @@ model Pixel { } model IdentityLink { - id String @id @unique @map("identity_link_id") @db.Uuid - websiteId String @map("website_id") @db.Uuid - visitorId String @map("visitor_id") @db.VarChar(50) - distinctId String @map("distinct_id") @db.VarChar(50) - linkedAt DateTime @default(now()) @map("linked_at") @db.Timestamptz(6) + id String @id @unique @map("identity_link_id") @db.Uuid + websiteId String @map("website_id") @db.Uuid + visitorId String @map("visitor_id") @db.VarChar(50) + distinctId String @map("distinct_id") @db.VarChar(50) + createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6) + linkedAt DateTime @default(now()) @updatedAt @map("linked_at") @db.Timestamptz(6) website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade) diff --git a/src/app/api/send/route.ts b/src/app/api/send/route.ts index bc7eb4a8..62776c6a 100644 --- a/src/app/api/send/route.ts +++ b/src/app/api/send/route.ts @@ -271,11 +271,15 @@ export async function POST(request: Request) { } // Create identity link when both visitorId and distinctId are present + // Fire-and-forget to avoid adding latency to the tracking endpoint if (visitorId && id && websiteId) { - await createIdentityLink({ + createIdentityLink({ websiteId, visitorId, distinctId: id, + }).catch(e => { + // eslint-disable-next-line no-console + console.error('Failed to create identity link:', e); }); } } diff --git a/src/queries/sql/getWebsiteStats.ts b/src/queries/sql/getWebsiteStats.ts index 42571d14..5fb5e625 100644 --- a/src/queries/sql/getWebsiteStats.ts +++ b/src/queries/sql/getWebsiteStats.ts @@ -84,7 +84,7 @@ async function clickhouseQuery( sql = ` select sum(t.c) as "pageviews", - uniq(coalesce(t.resolved_identity, t.session_id)) as "visitors", + uniq(coalesce(t.resolved_identity, toString(t.session_id))) as "visitors", uniq(t.visit_id) as "visits", sum(if(t.c = 1, 1, 0)) as "bounces", sum(max_time-min_time) as "totaltime" @@ -98,7 +98,7 @@ async function clickhouseQuery( max(we.created_at) max_time from website_event we ${cohortQuery} - left join identity_link il on il.visitor_id = we.visitor_id + left join identity_link final il on il.visitor_id = we.visitor_id and il.website_id = we.website_id where we.website_id = {websiteId:UUID} and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} @@ -111,7 +111,7 @@ async function clickhouseQuery( sql = ` select sum(t.c) as "pageviews", - uniq(coalesce(resolved_identity, session_id)) as "visitors", + uniq(coalesce(resolved_identity, toString(session_id))) as "visitors", uniq(visit_id) as "visits", sumIf(1, t.c = 1) as "bounces", sum(max_time-min_time) as "totaltime" @@ -124,7 +124,7 @@ async function clickhouseQuery( max(we.max_time) max_time from website_event_stats_hourly we ${cohortQuery} - left join identity_link il on il.visitor_id = we.visitor_id + left join identity_link final il on il.visitor_id = we.visitor_id and il.website_id = we.website_id where we.website_id = {websiteId:UUID} and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} diff --git a/src/queries/sql/identity/createIdentityLink.ts b/src/queries/sql/identity/createIdentityLink.ts index 29a21a88..61c49cbf 100644 --- a/src/queries/sql/identity/createIdentityLink.ts +++ b/src/queries/sql/identity/createIdentityLink.ts @@ -1,3 +1,18 @@ +/** + * Identity Stitching - Links anonymous browser sessions to authenticated user identities + * + * Design decisions: + * - One visitor can link to multiple distinct_ids (user logs into different accounts) + * - One distinct_id can link to multiple visitors (user on multiple devices/browsers) + * - Links are additive and never invalidated (preserves historical journey) + * - Uses ReplacingMergeTree in ClickHouse with linked_at for deduplication + * - Upsert pattern ensures idempotency for repeated identify() calls + * + * Edge cases handled: + * - Safari private browsing: visitorId will be undefined, no link created + * - localStorage cleared: new visitorId generated, creates new link + * - Multiple tabs: same visitorId shared via localStorage + */ import { uuid } from '@/lib/crypto'; import prisma from '@/lib/prisma'; import clickhouse from '@/lib/clickhouse'; @@ -44,11 +59,13 @@ async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdent const { insert, getUTCString } = clickhouse; const { sendMessage } = kafka; + const now = getUTCString(new Date()); const message = { website_id: websiteId, visitor_id: visitorId, distinct_id: distinctId, - linked_at: getUTCString(new Date()), + created_at: now, + linked_at: now, }; if (kafka.enabled) { diff --git a/src/queries/sql/identity/getLinkedVisitorIds.ts b/src/queries/sql/identity/getLinkedVisitorIds.ts index 2f1758a4..a328ff82 100644 --- a/src/queries/sql/identity/getLinkedVisitorIds.ts +++ b/src/queries/sql/identity/getLinkedVisitorIds.ts @@ -1,3 +1,13 @@ +/** + * Resolves all visitor IDs linked to a given distinct_id (authenticated user) + * + * Use cases (for future implementation): + * - User journey reports: aggregate sessions across devices + * - Cohort analysis: include all linked sessions + * - Retroactive attribution: credit conversions to original anonymous session + * + * Note: Uses FINAL keyword in ClickHouse to ensure deduplication from ReplacingMergeTree + */ import prisma from '@/lib/prisma'; import clickhouse from '@/lib/clickhouse'; import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db'; diff --git a/src/tracker/index.js b/src/tracker/index.js index 94a1776c..f60cf7c2 100644 --- a/src/tracker/index.js +++ b/src/tracker/index.js @@ -42,6 +42,11 @@ /* Helper functions */ + /** + * Identity Stitching: Generates a persistent visitor ID stored in localStorage. + * When combined with identify(), links anonymous sessions to authenticated users. + * Gracefully degrades when localStorage is unavailable (Safari private browsing). + */ const generateUUID = () => 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => { const r = (Math.random() * 16) | 0; @@ -51,15 +56,20 @@ const getVisitorId = () => { if (!identityStitching || !localStorage) return undefined; - const storageKey = 'umami.visitor'; - let vid = localStorage.getItem(storageKey); + try { + const storageKey = 'umami.visitor'; + let vid = localStorage.getItem(storageKey); - if (!vid) { - vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID(); - localStorage.setItem(storageKey, vid); + if (!vid) { + vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID(); + localStorage.setItem(storageKey, vid); + } + + return vid; + } catch { + // localStorage access throws in Safari private browsing + return undefined; } - - return vid; }; const visitorId = getVisitorId(); @@ -165,12 +175,21 @@ /* Tracking functions */ - const trackingDisabled = () => - disabled || - !website || - (localStorage && localStorage.getItem('umami.disabled')) || - (domain && !domains.includes(hostname)) || - (dnt && hasDoNotTrack()); + const trackingDisabled = () => { + let storageDisabled = false; + try { + storageDisabled = localStorage && localStorage.getItem('umami.disabled'); + } catch { + // localStorage throws in Safari private browsing + } + return ( + disabled || + !website || + storageDisabled || + (domain && !domains.includes(hostname)) || + (dnt && hasDoNotTrack()) + ); + }; const send = async (payload, type = 'event') => { if (trackingDisabled()) return; From ecf0b7bef455e50c90e0f320b02766f9acb70bb7 Mon Sep 17 00:00:00 2001 From: Arthur Sepiol Date: Wed, 3 Dec 2025 17:02:29 +0300 Subject: [PATCH 3/3] fix: add visitor_id to coalesce chain for accurate visitor deduplication Fixes visitor inflation bug where same person was counted twice: - Once as session_id (before identify) - Once as distinct_id (after identify) The coalesce chain now uses visitor_id as fallback before session_id, ensuring consistent counting across the identify() boundary. --- src/queries/sql/getWebsiteStats.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/queries/sql/getWebsiteStats.ts b/src/queries/sql/getWebsiteStats.ts index 5fb5e625..e6d0ef47 100644 --- a/src/queries/sql/getWebsiteStats.ts +++ b/src/queries/sql/getWebsiteStats.ts @@ -37,7 +37,7 @@ async function relationalQuery( ` select cast(coalesce(sum(t.c), 0) as bigint) as "pageviews", - count(distinct coalesce(t.resolved_identity, t.session_id::text)) as "visitors", + count(distinct coalesce(t.resolved_identity, t.visitor_id, t.session_id::text)) as "visitors", count(distinct t.visit_id) as "visits", coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces", cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime" @@ -45,6 +45,7 @@ async function relationalQuery( select website_event.session_id, website_event.visit_id, + session.visitor_id, il.distinct_id as "resolved_identity", count(*) as "c", min(website_event.created_at) as "min_time", @@ -60,7 +61,7 @@ async function relationalQuery( and website_event.created_at between {{startDate}} and {{endDate}} and website_event.event_type != 2 ${filterQuery} - group by 1, 2, 3 + group by 1, 2, 3, 4 ) as t `, queryParams, @@ -84,7 +85,7 @@ async function clickhouseQuery( sql = ` select sum(t.c) as "pageviews", - uniq(coalesce(t.resolved_identity, toString(t.session_id))) as "visitors", + uniq(coalesce(t.resolved_identity, t.visitor_id, toString(t.session_id))) as "visitors", uniq(t.visit_id) as "visits", sum(if(t.c = 1, 1, 0)) as "bounces", sum(max_time-min_time) as "totaltime" @@ -92,6 +93,7 @@ async function clickhouseQuery( select we.session_id, we.visit_id, + we.visitor_id, il.distinct_id as resolved_identity, count(*) c, min(we.created_at) min_time, @@ -104,20 +106,21 @@ async function clickhouseQuery( and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} and we.event_type != 2 ${filterQuery} - group by we.session_id, we.visit_id, il.distinct_id + group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id ) as t; `; } else { sql = ` select sum(t.c) as "pageviews", - uniq(coalesce(resolved_identity, toString(session_id))) as "visitors", + uniq(coalesce(resolved_identity, visitor_id, toString(session_id))) as "visitors", uniq(visit_id) as "visits", sumIf(1, t.c = 1) as "bounces", sum(max_time-min_time) as "totaltime" from (select we.session_id, we.visit_id, + we.visitor_id, il.distinct_id as resolved_identity, sum(we.views) c, min(we.min_time) min_time, @@ -130,7 +133,7 @@ async function clickhouseQuery( and we.created_at between {startDate:DateTime64} and {endDate:DateTime64} and we.event_type != 2 ${filterQuery} - group by we.session_id, we.visit_id, il.distinct_id + group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id ) as t; `; }