feat: implement automatic session linking and identity stitching (#3820)

Links anonymous browser sessions to authenticated user identities, enabling unified
user journey tracking across login boundaries. This solves the "logged-out anonymous
session → logged-in session" tracking gap, providing complete funnel visibility and
accurate visitor deduplication.

## Changes

- Client-side: Persistent visitor ID in localStorage (data-identity-stitching attribute)
- Server-side: identity_link table linking visitors to distinct IDs (authenticated users)
- Query updates: getWebsiteStats now deduplicates by resolved identity
- Graceful degradation: Works in Safari private browsing and when localStorage is unavailable

## Implementation Details

Uses hybrid approach combining client-side persistence with server-side linking:
- Visitor ID generated once per browser, persists across sessions
- When user logs in, identify() creates identity link
- Stats queries join through identity_link to deduplicate cross-device sessions

Both PostgreSQL and ClickHouse supported with appropriate query patterns:
- PostgreSQL: normalized schema, joins through session table
- ClickHouse: denormalized with ReplacingMergeTree for deduplication

## Edge Cases Handled

- Safari private browsing: localStorage throws, so visitorId is undefined and no link is created
- localStorage cleared: new visitorId generated, creates new link
- Multiple tabs: same visitorId shared via localStorage
- Multiple devices: one distinct_id can link to multiple visitors
- Multiple accounts: one visitor can link to multiple distinct_ids

## Test Plan

- [ ] Enable feature on test website (default enabled)
- [ ] Anonymous pageview - confirm visitor_id in events table
- [ ] Call umami.identify('user1') - confirm identity_link created
- [ ] Stats show 1 visitor (deduplicated)
- [ ] Log out, browse anonymously, stats still show 1 visitor
- [ ] Test with data-identity-stitching="false" - no visitor_id collected
- [ ] Test in Safari private browsing - no errors, gracefully skips
- [ ] Test ClickHouse: verify identity_link table populated and FINAL keyword works
- [ ] Verify retroactive attribution: historical anonymous sessions are attributed correctly
This commit is contained in:
Arthur Sepiol 2025-12-03 16:54:56 +03:00
parent a902a87c08
commit 34db34759f
7 changed files with 76 additions and 24 deletions

View file

@ -293,6 +293,7 @@ CREATE TABLE umami.identity_link
website_id UUID,
visitor_id String,
distinct_id String,
created_at DateTime('UTC'),
linked_at DateTime('UTC')
)
ENGINE = ReplacingMergeTree(linked_at)

View file

@ -321,11 +321,12 @@ model Pixel {
}
model IdentityLink {
id String @id @unique @map("identity_link_id") @db.Uuid
websiteId String @map("website_id") @db.Uuid
visitorId String @map("visitor_id") @db.VarChar(50)
distinctId String @map("distinct_id") @db.VarChar(50)
linkedAt DateTime @default(now()) @map("linked_at") @db.Timestamptz(6)
id String @id @unique @map("identity_link_id") @db.Uuid
websiteId String @map("website_id") @db.Uuid
visitorId String @map("visitor_id") @db.VarChar(50)
distinctId String @map("distinct_id") @db.VarChar(50)
createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6)
linkedAt DateTime @default(now()) @updatedAt @map("linked_at") @db.Timestamptz(6)
website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade)

View file

@ -271,11 +271,15 @@ export async function POST(request: Request) {
}
// Create identity link when both visitorId and distinctId are present
// Fire-and-forget to avoid adding latency to the tracking endpoint
if (visitorId && id && websiteId) {
await createIdentityLink({
createIdentityLink({
websiteId,
visitorId,
distinctId: id,
}).catch(e => {
// eslint-disable-next-line no-console
console.error('Failed to create identity link:', e);
});
}
}

View file

@ -84,7 +84,7 @@ async function clickhouseQuery(
sql = `
select
sum(t.c) as "pageviews",
uniq(coalesce(t.resolved_identity, t.session_id)) as "visitors",
uniq(coalesce(t.resolved_identity, toString(t.session_id))) as "visitors",
uniq(t.visit_id) as "visits",
sum(if(t.c = 1, 1, 0)) as "bounces",
sum(max_time-min_time) as "totaltime"
@ -98,7 +98,7 @@ async function clickhouseQuery(
max(we.created_at) max_time
from website_event we
${cohortQuery}
left join identity_link il on il.visitor_id = we.visitor_id
left join identity_link final il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
@ -111,7 +111,7 @@ async function clickhouseQuery(
sql = `
select
sum(t.c) as "pageviews",
uniq(coalesce(resolved_identity, session_id)) as "visitors",
uniq(coalesce(resolved_identity, toString(session_id))) as "visitors",
uniq(visit_id) as "visits",
sumIf(1, t.c = 1) as "bounces",
sum(max_time-min_time) as "totaltime"
@ -124,7 +124,7 @@ async function clickhouseQuery(
max(we.max_time) max_time
from website_event_stats_hourly we
${cohortQuery}
left join identity_link il on il.visitor_id = we.visitor_id
left join identity_link final il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}

View file

@ -1,3 +1,18 @@
/**
* Identity Stitching - Links anonymous browser sessions to authenticated user identities
*
* Design decisions:
* - One visitor can link to multiple distinct_ids (user logs into different accounts)
* - One distinct_id can link to multiple visitors (user on multiple devices/browsers)
* - Links are additive and never invalidated (preserves historical journey)
* - Uses ReplacingMergeTree in ClickHouse with linked_at for deduplication
* - Upsert pattern ensures idempotency for repeated identify() calls
*
* Edge cases handled:
* - Safari private browsing: visitorId will be undefined, no link created
* - localStorage cleared: new visitorId generated, creates new link
* - Multiple tabs: same visitorId shared via localStorage
*/
import { uuid } from '@/lib/crypto';
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
@ -44,11 +59,13 @@ async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdent
const { insert, getUTCString } = clickhouse;
const { sendMessage } = kafka;
const now = getUTCString(new Date());
const message = {
website_id: websiteId,
visitor_id: visitorId,
distinct_id: distinctId,
linked_at: getUTCString(new Date()),
created_at: now,
linked_at: now,
};
if (kafka.enabled) {

View file

@ -1,3 +1,13 @@
/**
* Resolves all visitor IDs linked to a given distinct_id (authenticated user)
*
* Use cases (for future implementation):
* - User journey reports: aggregate sessions across devices
* - Cohort analysis: include all linked sessions
* - Retroactive attribution: credit conversions to original anonymous session
*
* Note: Uses FINAL keyword in ClickHouse to ensure deduplication from ReplacingMergeTree
*/
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';

View file

@ -42,6 +42,11 @@
/* Helper functions */
/**
* Identity Stitching: Generates a persistent visitor ID stored in localStorage.
* When combined with identify(), links anonymous sessions to authenticated users.
* Gracefully degrades when localStorage is unavailable (Safari private browsing).
*/
const generateUUID = () =>
'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => {
const r = (Math.random() * 16) | 0;
@ -51,15 +56,20 @@
const getVisitorId = () => {
if (!identityStitching || !localStorage) return undefined;
const storageKey = 'umami.visitor';
let vid = localStorage.getItem(storageKey);
try {
const storageKey = 'umami.visitor';
let vid = localStorage.getItem(storageKey);
if (!vid) {
vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID();
localStorage.setItem(storageKey, vid);
if (!vid) {
vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID();
localStorage.setItem(storageKey, vid);
}
return vid;
} catch {
// localStorage access throws in Safari private browsing
return undefined;
}
return vid;
};
const visitorId = getVisitorId();
@ -165,12 +175,21 @@
/* Tracking functions */
const trackingDisabled = () =>
disabled ||
!website ||
(localStorage && localStorage.getItem('umami.disabled')) ||
(domain && !domains.includes(hostname)) ||
(dnt && hasDoNotTrack());
const trackingDisabled = () => {
let storageDisabled = false;
try {
storageDisabled = localStorage && localStorage.getItem('umami.disabled');
} catch {
// localStorage throws in Safari private browsing
}
return (
disabled ||
!website ||
storageDisabled ||
(domain && !domains.includes(hostname)) ||
(dnt && hasDoNotTrack())
);
};
const send = async (payload, type = 'event') => {
if (trackingDisabled()) return;