feat: implement identity stitching for session linking (#3820)

Adds automatic session linking/identity stitching to link anonymous
browsing sessions with authenticated user sessions.

## Changes

### Database Schema
- Add `identity_link` table (PostgreSQL + ClickHouse) to store mappings
  between visitor IDs and authenticated user IDs
- Add `visitor_id` field to `Session` model
- Add `visitor_id` column to ClickHouse `website_event` table

### Client Tracker
- Generate and persist `visitor_id` in localStorage
- Include `vid` in all tracking payloads
- Support opt-out via `data-identity-stitching="false"` attribute

### API
- Accept `vid` parameter in `/api/send` endpoint
- Auto-create identity links when `identify()` is called with both
  visitor_id and distinct_id
- Store visitor_id in sessions and events

### Query Updates
- Update `getWebsiteStats` to deduplicate visitors by resolved identity
- Visitors who browse anonymously then log in are now counted as one user

## Usage

When a user logs in, call `umami.identify(userId)`. If identity stitching
is enabled (default), the tracker automatically links the anonymous
visitor_id to the authenticated userId. Stats queries then resolve
linked identities to accurately count unique visitors.

Resolves #3820
This commit is contained in:
Arthur Sepiol 2025-12-03 16:06:54 +03:00
parent 9a269ab811
commit a902a87c08
11 changed files with 245 additions and 33 deletions

View file

@ -39,6 +39,7 @@ CREATE TABLE umami.website_event
event_name String,
tag String,
distinct_id String,
visitor_id String,
created_at DateTime('UTC'),
job_id Nullable(UUID)
)
@ -123,6 +124,7 @@ CREATE TABLE umami.website_event_stats_hourly
max_time SimpleAggregateFunction(max, DateTime('UTC')),
tag SimpleAggregateFunction(groupArrayArray, Array(String)),
distinct_id String,
visitor_id String,
created_at Datetime('UTC')
)
ENGINE = AggregatingMergeTree
@ -176,6 +178,7 @@ SELECT
max_time,
tag,
distinct_id,
visitor_id,
timestamp as created_at
FROM (SELECT
website_id,
@ -214,6 +217,7 @@ FROM (SELECT
max(created_at) max_time,
arrayFilter(x -> x != '', groupArray(tag)) tag,
distinct_id,
visitor_id,
toStartOfHour(created_at) timestamp
FROM umami.website_event
GROUP BY website_id,
@ -230,6 +234,7 @@ GROUP BY website_id,
city,
event_type,
distinct_id,
visitor_id,
timestamp);
-- projections
@ -281,3 +286,15 @@ JOIN (SELECT event_id, string_value as currency
WHERE positionCaseInsensitive(data_key, 'currency') > 0) c
ON c.event_id = ed.event_id
WHERE positionCaseInsensitive(data_key, 'revenue') > 0;
-- identity linking
-- One row per (website, visitor_id, distinct_id) pairing of a visitor ID with a distinct ID.
CREATE TABLE umami.identity_link
(
website_id UUID,
visitor_id String,
distinct_id String,
linked_at DateTime('UTC')
)
-- linked_at is the ReplacingMergeTree version column: rows sharing the same
-- ORDER BY key are collapsed to the one with the latest linked_at when parts merge.
ENGINE = ReplacingMergeTree(linked_at)
ORDER BY (website_id, visitor_id, distinct_id)
SETTINGS index_granularity = 8192;

View file

@ -43,6 +43,7 @@ model Session {
region String? @db.VarChar(20)
city String? @db.VarChar(50)
distinctId String? @map("distinct_id") @db.VarChar(50)
visitorId String? @map("visitor_id") @db.VarChar(50)
createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6)
websiteEvents WebsiteEvent[]
@ -60,6 +61,7 @@ model Session {
@@index([websiteId, createdAt, country])
@@index([websiteId, createdAt, region])
@@index([websiteId, createdAt, city])
@@index([websiteId, visitorId])
@@map("session")
}
@ -76,14 +78,15 @@ model Website {
updatedAt DateTime? @updatedAt @map("updated_at") @db.Timestamptz(6)
deletedAt DateTime? @map("deleted_at") @db.Timestamptz(6)
user User? @relation("user", fields: [userId], references: [id])
createUser User? @relation("createUser", fields: [createdBy], references: [id])
team Team? @relation(fields: [teamId], references: [id])
eventData EventData[]
reports Report[]
revenue Revenue[]
segments Segment[]
sessionData SessionData[]
user User? @relation("user", fields: [userId], references: [id])
createUser User? @relation("createUser", fields: [createdBy], references: [id])
team Team? @relation(fields: [teamId], references: [id])
eventData EventData[]
reports Report[]
revenue Revenue[]
segments Segment[]
sessionData SessionData[]
identityLinks IdentityLink[]
@@index([userId])
@@index([teamId])
@ -316,3 +319,18 @@ model Pixel {
@@index([createdAt])
@@map("pixel")
}
// Link between a visitor ID and a distinct ID, scoped to one website.
model IdentityLink {
id String @id @unique @map("identity_link_id") @db.Uuid
websiteId String @map("website_id") @db.Uuid
visitorId String @map("visitor_id") @db.VarChar(50)
distinctId String @map("distinct_id") @db.VarChar(50)
// Defaults to the insertion time.
linkedAt DateTime @default(now()) @map("linked_at") @db.Timestamptz(6)
// Deleting the website cascades to its identity links.
website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade)
// At most one row per (website, visitor, distinct) triple.
@@unique([websiteId, visitorId, distinctId])
// Lookup indexes for both directions: by distinct ID and by visitor ID.
@@index([websiteId, distinctId])
@@index([websiteId, visitorId])
@@map("identity_link")
}

View file

@ -11,7 +11,7 @@ import { secret, uuid, hash } from '@/lib/crypto';
import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants';
import { anyObjectParam, urlOrPathParam } from '@/lib/schema';
import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url';
import { createSession, saveEvent, saveSessionData } from '@/queries/sql';
import { createSession, saveEvent, saveSessionData, createIdentityLink } from '@/queries/sql';
import { serializeError } from 'serialize-error';
interface Cache {
@ -41,6 +41,7 @@ const schema = z.object({
userAgent: z.string().optional(),
timestamp: z.coerce.number().int().optional(),
id: z.string().optional(),
vid: z.string().max(50).optional(),
})
.refine(
data => {
@ -80,6 +81,7 @@ export async function POST(request: Request) {
tag,
timestamp,
id,
vid: visitorId,
} = payload;
const sourceId = websiteId || pixelId || linkId;
@ -146,6 +148,7 @@ export async function POST(request: Request) {
region,
city,
distinctId: id,
visitorId,
createdAt,
});
}
@ -226,6 +229,7 @@ export async function POST(request: Request) {
// Session
distinctId: id,
visitorId,
browser,
os,
device,
@ -265,6 +269,15 @@ export async function POST(request: Request) {
createdAt,
});
}
// Create identity link when both visitorId and distinctId are present
if (visitorId && id && websiteId) {
await createIdentityLink({
websiteId,
visitorId,
distinctId: id,
});
}
}
const token = createToken({ websiteId, sessionId, visitId, iat }, secret());

View file

@ -25,6 +25,7 @@ export interface SaveEventArgs {
// Session
distinctId?: string;
visitorId?: string;
browser?: string;
os?: string;
device?: string;
@ -164,6 +165,7 @@ async function clickhouseQuery({
referrerQuery,
referrerDomain,
distinctId,
visitorId,
browser,
os,
device,
@ -220,6 +222,7 @@ async function clickhouseQuery({
event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null,
tag: tag,
distinct_id: distinctId,
visitor_id: visitorId,
created_at: getUTCString(createdAt),
browser,
os,

View file

@ -37,7 +37,7 @@ async function relationalQuery(
`
select
cast(coalesce(sum(t.c), 0) as bigint) as "pageviews",
count(distinct t.session_id) as "visitors",
count(distinct coalesce(t.resolved_identity, t.session_id::text)) as "visitors",
count(distinct t.visit_id) as "visits",
coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces",
cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime"
@ -45,17 +45,22 @@ async function relationalQuery(
select
website_event.session_id,
website_event.visit_id,
il.distinct_id as "resolved_identity",
count(*) as "c",
min(website_event.created_at) as "min_time",
max(website_event.created_at) as "max_time"
from website_event
${cohortQuery}
${joinSessionQuery}
${joinSessionQuery}
left join session on session.session_id = website_event.session_id
and session.website_id = website_event.website_id
left join identity_link il on il.visitor_id = session.visitor_id
and il.website_id = session.website_id
where website_event.website_id = {{websiteId::uuid}}
and website_event.created_at between {{startDate}} and {{endDate}}
and website_event.event_type != 2
${filterQuery}
group by 1, 2
group by 1, 2, 3
) as t
`,
queryParams,
@ -79,47 +84,53 @@ async function clickhouseQuery(
sql = `
select
sum(t.c) as "pageviews",
uniq(t.session_id) as "visitors",
uniq(coalesce(t.resolved_identity, t.session_id)) as "visitors",
uniq(t.visit_id) as "visits",
sum(if(t.c = 1, 1, 0)) as "bounces",
sum(max_time-min_time) as "totaltime"
from (
select
session_id,
visit_id,
we.session_id,
we.visit_id,
il.distinct_id as resolved_identity,
count(*) c,
min(created_at) min_time,
max(created_at) max_time
from website_event
min(we.created_at) min_time,
max(we.created_at) max_time
from website_event we
${cohortQuery}
where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type != 2
left join identity_link il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery}
group by session_id, visit_id
group by we.session_id, we.visit_id, il.distinct_id
) as t;
`;
} else {
sql = `
select
sum(t.c) as "pageviews",
uniq(session_id) as "visitors",
uniq(coalesce(resolved_identity, session_id)) as "visitors",
uniq(visit_id) as "visits",
sumIf(1, t.c = 1) as "bounces",
sum(max_time-min_time) as "totaltime"
from (select
session_id,
visit_id,
sum(views) c,
min(min_time) min_time,
max(max_time) max_time
from website_event_stats_hourly "website_event"
we.session_id,
we.visit_id,
il.distinct_id as resolved_identity,
sum(we.views) c,
min(we.min_time) min_time,
max(we.max_time) max_time
from website_event_stats_hourly we
${cohortQuery}
where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type != 2
left join identity_link il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery}
group by session_id, visit_id
group by we.session_id, we.visit_id, il.distinct_id
) as t;
`;
}

View file

@ -0,0 +1,59 @@
import { uuid } from '@/lib/crypto';
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import kafka from '@/lib/kafka';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
/** Arguments accepted by createIdentityLink. */
export interface CreateIdentityLinkArgs {
/** Website the link is stored under. */
websiteId: string;
/** Visitor ID to link. */
visitorId: string;
/** Distinct ID the visitor ID is linked to. */
distinctId: string;
}
/**
 * Stores a link between a visitor ID and a distinct ID for a website.
 *
 * Passes one handler per backend key (PRISMA, CLICKHOUSE) to runQuery and
 * returns whatever runQuery returns.
 */
export async function createIdentityLink(data: CreateIdentityLinkArgs) {
  const handlers = {
    [PRISMA]: () => relationalQuery(data),
    [CLICKHOUSE]: () => clickhouseQuery(data),
  };

  return runQuery(handlers);
}
/**
 * Upserts the identity link through Prisma.
 *
 * An existing (websiteId, visitorId, distinctId) row only has its linkedAt
 * refreshed; otherwise a new row is created with a fresh id.
 */
async function relationalQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
  const linkKey = { websiteId, visitorId, distinctId };

  return prisma.client.identityLink.upsert({
    where: { websiteId_visitorId_distinctId: linkKey },
    update: { linkedAt: new Date() },
    create: { id: uuid(), ...linkKey },
  });
}
/**
 * Writes the identity link row for the ClickHouse backend.
 *
 * The row is sent to the 'identity_link' Kafka topic when kafka.enabled is
 * truthy, and inserted into the 'identity_link' table directly otherwise.
 */
async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
  const { insert, getUTCString } = clickhouse;
  const { sendMessage } = kafka;

  const row = {
    website_id: websiteId,
    visitor_id: visitorId,
    distinct_id: distinctId,
    linked_at: getUTCString(new Date()),
  };

  if (!kafka.enabled) {
    await insert('identity_link', [row]);
    return;
  }

  await sendMessage('identity_link', row);
}

View file

@ -0,0 +1,61 @@
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
/** Arguments accepted by getLinkedVisitorIds. */
export interface GetLinkedVisitorIdsArgs {
/** Website whose identity links are searched. */
websiteId: string;
/** Distinct ID whose linked visitor IDs are requested. */
distinctId: string;
}
/** One identity link as returned by getLinkedVisitorIds. */
export interface LinkedVisitorId {
/** Visitor ID stored on the link. */
visitorId: string;
/** Value of the link's linked_at column. */
linkedAt: Date;
}
/**
 * Looks up the visitor IDs linked to a distinct ID on a website.
 *
 * Passes one handler per backend key (PRISMA, CLICKHOUSE) to runQuery and
 * returns whatever runQuery returns.
 */
export async function getLinkedVisitorIds(
  data: GetLinkedVisitorIdsArgs,
): Promise<LinkedVisitorId[]> {
  const handlers = {
    [PRISMA]: () => relationalQuery(data),
    [CLICKHOUSE]: () => clickhouseQuery(data),
  };

  return runQuery(handlers);
}
/**
 * Prisma lookup: selects visitorId and linkedAt from every identity link
 * matching the given website and distinct ID.
 */
async function relationalQuery({
  websiteId,
  distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
  return prisma.client.identityLink.findMany({
    where: { websiteId, distinctId },
    select: { visitorId: true, linkedAt: true },
  });
}
/**
 * ClickHouse lookup: reads visitor_id and linked_at from identity_link
 * (queried with FINAL) for the given website and distinct ID.
 */
async function clickhouseQuery({
  websiteId,
  distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
  const { rawQuery } = clickhouse;
  const params = { websiteId, distinctId };
  const sql = `
select
visitor_id as visitorId,
linked_at as linkedAt
from identity_link final
where website_id = {websiteId:UUID}
and distinct_id = {distinctId:String}
`;

  return rawQuery<LinkedVisitorId[]>(sql, params);
}

View file

@ -0,0 +1,2 @@
export * from './createIdentityLink';
export * from './getLinkedVisitorIds';

View file

@ -39,3 +39,5 @@ export * from './getValues';
export * from './getWebsiteDateRange';
export * from './getWebsiteStats';
export * from './getWeeklyTraffic';
export * from './identity/createIdentityLink';
export * from './identity/getLinkedVisitorIds';

View file

@ -20,6 +20,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
region,
city,
distinct_id,
visitor_id,
created_at
)
values (
@ -34,6 +35,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
{{region}},
{{city}},
{{distinctId}},
{{visitorId}},
{{createdAt}}
)
on conflict (session_id) do nothing

View file

@ -29,6 +29,7 @@
const excludeHash = attr(_data + 'exclude-hash') === _true;
const domain = attr(_data + 'domains') || '';
const credentials = attr(_data + 'fetch-credentials') || 'omit';
const identityStitching = attr(_data + 'identity-stitching') !== _false;
const domains = domain.split(',').map(n => n.trim());
const host =
@ -41,6 +42,28 @@
/* Helper functions */
/* Builds a version-4 style UUID string from Math.random. */
const generateUUID = () => {
  const template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx';

  return template.replace(/[xy]/g, placeholder => {
    // Random integer in 0..15.
    const nibble = Math.floor(Math.random() * 16);
    // 'x' takes any hex digit; 'y' is restricted to 8, 9, a or b.
    const value = placeholder === 'x' ? nibble : (nibble & 0x3) | 0x8;

    return value.toString(16);
  });
};
const getVisitorId = () => {
if (!identityStitching || !localStorage) return undefined;
const storageKey = 'umami.visitor';
let vid = localStorage.getItem(storageKey);
if (!vid) {
vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID();
localStorage.setItem(storageKey, vid);
}
return vid;
};
const visitorId = getVisitorId();
const normalize = raw => {
if (!raw) return raw;
try {
@ -63,6 +86,7 @@
referrer: currentRef,
tag,
id: identity ? identity : undefined,
vid: visitorId,
});
const hasDoNotTrack = () => {