This commit is contained in:
Lokimorty 2025-12-04 14:35:00 +08:00 committed by GitHub
commit 0e0a0795b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 306 additions and 39 deletions

View file

@ -39,6 +39,7 @@ CREATE TABLE umami.website_event
event_name String, event_name String,
tag String, tag String,
distinct_id String, distinct_id String,
visitor_id String,
created_at DateTime('UTC'), created_at DateTime('UTC'),
job_id Nullable(UUID) job_id Nullable(UUID)
) )
@ -123,6 +124,7 @@ CREATE TABLE umami.website_event_stats_hourly
max_time SimpleAggregateFunction(max, DateTime('UTC')), max_time SimpleAggregateFunction(max, DateTime('UTC')),
tag SimpleAggregateFunction(groupArrayArray, Array(String)), tag SimpleAggregateFunction(groupArrayArray, Array(String)),
distinct_id String, distinct_id String,
visitor_id String,
created_at Datetime('UTC') created_at Datetime('UTC')
) )
ENGINE = AggregatingMergeTree ENGINE = AggregatingMergeTree
@ -176,6 +178,7 @@ SELECT
max_time, max_time,
tag, tag,
distinct_id, distinct_id,
visitor_id,
timestamp as created_at timestamp as created_at
FROM (SELECT FROM (SELECT
website_id, website_id,
@ -214,6 +217,7 @@ FROM (SELECT
max(created_at) max_time, max(created_at) max_time,
arrayFilter(x -> x != '', groupArray(tag)) tag, arrayFilter(x -> x != '', groupArray(tag)) tag,
distinct_id, distinct_id,
visitor_id,
toStartOfHour(created_at) timestamp toStartOfHour(created_at) timestamp
FROM umami.website_event FROM umami.website_event
GROUP BY website_id, GROUP BY website_id,
@ -230,6 +234,7 @@ GROUP BY website_id,
city, city,
event_type, event_type,
distinct_id, distinct_id,
visitor_id,
timestamp); timestamp);
-- projections -- projections
@ -281,3 +286,16 @@ JOIN (SELECT event_id, string_value as currency
WHERE positionCaseInsensitive(data_key, 'currency') > 0) c WHERE positionCaseInsensitive(data_key, 'currency') > 0) c
ON c.event_id = ed.event_id ON c.event_id = ed.event_id
WHERE positionCaseInsensitive(data_key, 'revenue') > 0; WHERE positionCaseInsensitive(data_key, 'revenue') > 0;
-- identity linking
CREATE TABLE umami.identity_link
(
website_id UUID,
visitor_id String,
distinct_id String,
created_at DateTime('UTC'),
linked_at DateTime('UTC')
)
ENGINE = ReplacingMergeTree(linked_at)
ORDER BY (website_id, visitor_id, distinct_id)
SETTINGS index_granularity = 8192;

View file

@ -43,6 +43,7 @@ model Session {
region String? @db.VarChar(20) region String? @db.VarChar(20)
city String? @db.VarChar(50) city String? @db.VarChar(50)
distinctId String? @map("distinct_id") @db.VarChar(50) distinctId String? @map("distinct_id") @db.VarChar(50)
visitorId String? @map("visitor_id") @db.VarChar(50)
createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6) createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6)
websiteEvents WebsiteEvent[] websiteEvents WebsiteEvent[]
@ -60,6 +61,7 @@ model Session {
@@index([websiteId, createdAt, country]) @@index([websiteId, createdAt, country])
@@index([websiteId, createdAt, region]) @@index([websiteId, createdAt, region])
@@index([websiteId, createdAt, city]) @@index([websiteId, createdAt, city])
@@index([websiteId, visitorId])
@@map("session") @@map("session")
} }
@ -84,6 +86,7 @@ model Website {
revenue Revenue[] revenue Revenue[]
segments Segment[] segments Segment[]
sessionData SessionData[] sessionData SessionData[]
identityLinks IdentityLink[]
@@index([userId]) @@index([userId])
@@index([teamId]) @@index([teamId])
@ -316,3 +319,19 @@ model Pixel {
@@index([createdAt]) @@index([createdAt])
@@map("pixel") @@map("pixel")
} }
model IdentityLink {
id String @id @unique @map("identity_link_id") @db.Uuid
websiteId String @map("website_id") @db.Uuid
visitorId String @map("visitor_id") @db.VarChar(50)
distinctId String @map("distinct_id") @db.VarChar(50)
createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6)
linkedAt DateTime @default(now()) @updatedAt @map("linked_at") @db.Timestamptz(6)
website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade)
@@unique([websiteId, visitorId, distinctId])
@@index([websiteId, distinctId])
@@index([websiteId, visitorId])
@@map("identity_link")
}

View file

@ -11,7 +11,7 @@ import { secret, uuid, hash } from '@/lib/crypto';
import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants'; import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants';
import { anyObjectParam, urlOrPathParam } from '@/lib/schema'; import { anyObjectParam, urlOrPathParam } from '@/lib/schema';
import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url'; import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url';
import { createSession, saveEvent, saveSessionData } from '@/queries/sql'; import { createSession, saveEvent, saveSessionData, createIdentityLink } from '@/queries/sql';
import { serializeError } from 'serialize-error'; import { serializeError } from 'serialize-error';
interface Cache { interface Cache {
@ -41,6 +41,7 @@ const schema = z.object({
userAgent: z.string().optional(), userAgent: z.string().optional(),
timestamp: z.coerce.number().int().optional(), timestamp: z.coerce.number().int().optional(),
id: z.string().optional(), id: z.string().optional(),
vid: z.string().max(50).optional(),
}) })
.refine( .refine(
data => { data => {
@ -80,6 +81,7 @@ export async function POST(request: Request) {
tag, tag,
timestamp, timestamp,
id, id,
vid: visitorId,
} = payload; } = payload;
const sourceId = websiteId || pixelId || linkId; const sourceId = websiteId || pixelId || linkId;
@ -146,6 +148,7 @@ export async function POST(request: Request) {
region, region,
city, city,
distinctId: id, distinctId: id,
visitorId,
createdAt, createdAt,
}); });
} }
@ -226,6 +229,7 @@ export async function POST(request: Request) {
// Session // Session
distinctId: id, distinctId: id,
visitorId,
browser, browser,
os, os,
device, device,
@ -265,6 +269,19 @@ export async function POST(request: Request) {
createdAt, createdAt,
}); });
} }
// Create identity link when both visitorId and distinctId are present
// Fire-and-forget to avoid adding latency to the tracking endpoint
if (visitorId && id && websiteId) {
createIdentityLink({
websiteId,
visitorId,
distinctId: id,
}).catch(e => {
// eslint-disable-next-line no-console
console.error('Failed to create identity link:', e);
});
}
} }
const token = createToken({ websiteId, sessionId, visitId, iat }, secret()); const token = createToken({ websiteId, sessionId, visitId, iat }, secret());

View file

@ -25,6 +25,7 @@ export interface SaveEventArgs {
// Session // Session
distinctId?: string; distinctId?: string;
visitorId?: string;
browser?: string; browser?: string;
os?: string; os?: string;
device?: string; device?: string;
@ -164,6 +165,7 @@ async function clickhouseQuery({
referrerQuery, referrerQuery,
referrerDomain, referrerDomain,
distinctId, distinctId,
visitorId,
browser, browser,
os, os,
device, device,
@ -220,6 +222,7 @@ async function clickhouseQuery({
event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null, event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null,
tag: tag, tag: tag,
distinct_id: distinctId, distinct_id: distinctId,
visitor_id: visitorId,
created_at: getUTCString(createdAt), created_at: getUTCString(createdAt),
browser, browser,
os, os,

View file

@ -37,7 +37,7 @@ async function relationalQuery(
` `
select select
cast(coalesce(sum(t.c), 0) as bigint) as "pageviews", cast(coalesce(sum(t.c), 0) as bigint) as "pageviews",
count(distinct t.session_id) as "visitors", count(distinct coalesce(t.resolved_identity, t.visitor_id, t.session_id::text)) as "visitors",
count(distinct t.visit_id) as "visits", count(distinct t.visit_id) as "visits",
coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces", coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces",
cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime" cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime"
@ -45,17 +45,23 @@ async function relationalQuery(
select select
website_event.session_id, website_event.session_id,
website_event.visit_id, website_event.visit_id,
session.visitor_id,
il.distinct_id as "resolved_identity",
count(*) as "c", count(*) as "c",
min(website_event.created_at) as "min_time", min(website_event.created_at) as "min_time",
max(website_event.created_at) as "max_time" max(website_event.created_at) as "max_time"
from website_event from website_event
${cohortQuery} ${cohortQuery}
${joinSessionQuery} ${joinSessionQuery}
left join session on session.session_id = website_event.session_id
and session.website_id = website_event.website_id
left join identity_link il on il.visitor_id = session.visitor_id
and il.website_id = session.website_id
where website_event.website_id = {{websiteId::uuid}} where website_event.website_id = {{websiteId::uuid}}
and website_event.created_at between {{startDate}} and {{endDate}} and website_event.created_at between {{startDate}} and {{endDate}}
and website_event.event_type != 2 and website_event.event_type != 2
${filterQuery} ${filterQuery}
group by 1, 2 group by 1, 2, 3, 4
) as t ) as t
`, `,
queryParams, queryParams,
@ -79,47 +85,55 @@ async function clickhouseQuery(
sql = ` sql = `
select select
sum(t.c) as "pageviews", sum(t.c) as "pageviews",
uniq(t.session_id) as "visitors", uniq(coalesce(t.resolved_identity, t.visitor_id, toString(t.session_id))) as "visitors",
uniq(t.visit_id) as "visits", uniq(t.visit_id) as "visits",
sum(if(t.c = 1, 1, 0)) as "bounces", sum(if(t.c = 1, 1, 0)) as "bounces",
sum(max_time-min_time) as "totaltime" sum(max_time-min_time) as "totaltime"
from ( from (
select select
session_id, we.session_id,
visit_id, we.visit_id,
we.visitor_id,
il.distinct_id as resolved_identity,
count(*) c, count(*) c,
min(created_at) min_time, min(we.created_at) min_time,
max(created_at) max_time max(we.created_at) max_time
from website_event from website_event we
${cohortQuery} ${cohortQuery}
where website_id = {websiteId:UUID} left join identity_link final il on il.visitor_id = we.visitor_id
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and il.website_id = we.website_id
and event_type != 2 where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery} ${filterQuery}
group by session_id, visit_id group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id
) as t; ) as t;
`; `;
} else { } else {
sql = ` sql = `
select select
sum(t.c) as "pageviews", sum(t.c) as "pageviews",
uniq(session_id) as "visitors", uniq(coalesce(resolved_identity, visitor_id, toString(session_id))) as "visitors",
uniq(visit_id) as "visits", uniq(visit_id) as "visits",
sumIf(1, t.c = 1) as "bounces", sumIf(1, t.c = 1) as "bounces",
sum(max_time-min_time) as "totaltime" sum(max_time-min_time) as "totaltime"
from (select from (select
session_id, we.session_id,
visit_id, we.visit_id,
sum(views) c, we.visitor_id,
min(min_time) min_time, il.distinct_id as resolved_identity,
max(max_time) max_time sum(we.views) c,
from website_event_stats_hourly "website_event" min(we.min_time) min_time,
max(we.max_time) max_time
from website_event_stats_hourly we
${cohortQuery} ${cohortQuery}
where website_id = {websiteId:UUID} left join identity_link final il on il.visitor_id = we.visitor_id
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and il.website_id = we.website_id
and event_type != 2 where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery} ${filterQuery}
group by session_id, visit_id group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id
) as t; ) as t;
`; `;
} }

View file

@ -0,0 +1,76 @@
/**
* Identity Stitching - Links anonymous browser sessions to authenticated user identities
*
* Design decisions:
* - One visitor can link to multiple distinct_ids (user logs into different accounts)
* - One distinct_id can link to multiple visitors (user on multiple devices/browsers)
* - Links are additive and never invalidated (preserves historical journey)
* - Uses ReplacingMergeTree in ClickHouse with linked_at for deduplication
* - Upsert pattern ensures idempotency for repeated identify() calls
*
* Edge cases handled:
* - Safari private browsing: visitorId will be undefined, no link created
* - localStorage cleared: new visitorId generated, creates new link
* - Multiple tabs: same visitorId shared via localStorage
*/
import { uuid } from '@/lib/crypto';
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import kafka from '@/lib/kafka';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
export interface CreateIdentityLinkArgs {
websiteId: string;
visitorId: string;
distinctId: string;
}
export async function createIdentityLink(data: CreateIdentityLinkArgs) {
return runQuery({
[PRISMA]: () => relationalQuery(data),
[CLICKHOUSE]: () => clickhouseQuery(data),
});
}
async function relationalQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
const { client } = prisma;
return client.identityLink.upsert({
where: {
websiteId_visitorId_distinctId: {
websiteId,
visitorId,
distinctId,
},
},
update: {
linkedAt: new Date(),
},
create: {
id: uuid(),
websiteId,
visitorId,
distinctId,
},
});
}
async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
const { insert, getUTCString } = clickhouse;
const { sendMessage } = kafka;
const now = getUTCString(new Date());
const message = {
website_id: websiteId,
visitor_id: visitorId,
distinct_id: distinctId,
created_at: now,
linked_at: now,
};
if (kafka.enabled) {
await sendMessage('identity_link', message);
} else {
await insert('identity_link', [message]);
}
}

View file

@ -0,0 +1,71 @@
/**
* Resolves all visitor IDs linked to a given distinct_id (authenticated user)
*
* Use cases (for future implementation):
* - User journey reports: aggregate sessions across devices
* - Cohort analysis: include all linked sessions
* - Retroactive attribution: credit conversions to original anonymous session
*
* Note: Uses FINAL keyword in ClickHouse to ensure deduplication from ReplacingMergeTree
*/
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
export interface GetLinkedVisitorIdsArgs {
websiteId: string;
distinctId: string;
}
export interface LinkedVisitorId {
visitorId: string;
linkedAt: Date;
}
export async function getLinkedVisitorIds(
data: GetLinkedVisitorIdsArgs,
): Promise<LinkedVisitorId[]> {
return runQuery({
[PRISMA]: () => relationalQuery(data),
[CLICKHOUSE]: () => clickhouseQuery(data),
});
}
async function relationalQuery({
websiteId,
distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
const { client } = prisma;
const links = await client.identityLink.findMany({
where: {
websiteId,
distinctId,
},
select: {
visitorId: true,
linkedAt: true,
},
});
return links;
}
async function clickhouseQuery({
websiteId,
distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
const { rawQuery } = clickhouse;
return rawQuery<LinkedVisitorId[]>(
`
select
visitor_id as visitorId,
linked_at as linkedAt
from identity_link final
where website_id = {websiteId:UUID}
and distinct_id = {distinctId:String}
`,
{ websiteId, distinctId },
);
}

View file

@ -0,0 +1,2 @@
export * from './createIdentityLink';
export * from './getLinkedVisitorIds';

View file

@ -39,3 +39,5 @@ export * from './getValues';
export * from './getWebsiteDateRange'; export * from './getWebsiteDateRange';
export * from './getWebsiteStats'; export * from './getWebsiteStats';
export * from './getWeeklyTraffic'; export * from './getWeeklyTraffic';
export * from './identity/createIdentityLink';
export * from './identity/getLinkedVisitorIds';

View file

@ -20,6 +20,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
region, region,
city, city,
distinct_id, distinct_id,
visitor_id,
created_at created_at
) )
values ( values (
@ -34,6 +35,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
{{region}}, {{region}},
{{city}}, {{city}},
{{distinctId}}, {{distinctId}},
{{visitorId}},
{{createdAt}} {{createdAt}}
) )
on conflict (session_id) do nothing on conflict (session_id) do nothing

View file

@ -29,6 +29,7 @@
const excludeHash = attr(_data + 'exclude-hash') === _true; const excludeHash = attr(_data + 'exclude-hash') === _true;
const domain = attr(_data + 'domains') || ''; const domain = attr(_data + 'domains') || '';
const credentials = attr(_data + 'fetch-credentials') || 'omit'; const credentials = attr(_data + 'fetch-credentials') || 'omit';
const identityStitching = attr(_data + 'identity-stitching') !== _false;
const domains = domain.split(',').map(n => n.trim()); const domains = domain.split(',').map(n => n.trim());
const host = const host =
@ -41,6 +42,38 @@
/* Helper functions */ /* Helper functions */
/**
* Identity Stitching: Generates a persistent visitor ID stored in localStorage.
* When combined with identify(), links anonymous sessions to authenticated users.
* Gracefully degrades when localStorage is unavailable (Safari private browsing).
*/
const generateUUID = () =>
'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => {
const r = (Math.random() * 16) | 0;
return (c === 'x' ? r : (r & 0x3) | 0x8).toString(16);
});
const getVisitorId = () => {
if (!identityStitching || !localStorage) return undefined;
try {
const storageKey = 'umami.visitor';
let vid = localStorage.getItem(storageKey);
if (!vid) {
vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID();
localStorage.setItem(storageKey, vid);
}
return vid;
} catch {
// localStorage access throws in Safari private browsing
return undefined;
}
};
const visitorId = getVisitorId();
const normalize = raw => { const normalize = raw => {
if (!raw) return raw; if (!raw) return raw;
try { try {
@ -63,6 +96,7 @@
referrer: currentRef, referrer: currentRef,
tag, tag,
id: identity ? identity : undefined, id: identity ? identity : undefined,
vid: visitorId,
}); });
const hasDoNotTrack = () => { const hasDoNotTrack = () => {
@ -141,12 +175,21 @@
/* Tracking functions */ /* Tracking functions */
const trackingDisabled = () => const trackingDisabled = () => {
let storageDisabled = false;
try {
storageDisabled = localStorage && localStorage.getItem('umami.disabled');
} catch {
// localStorage throws in Safari private browsing
}
return (
disabled || disabled ||
!website || !website ||
(localStorage && localStorage.getItem('umami.disabled')) || storageDisabled ||
(domain && !domains.includes(hostname)) || (domain && !domains.includes(hostname)) ||
(dnt && hasDoNotTrack()); (dnt && hasDoNotTrack())
);
};
const send = async (payload, type = 'event') => { const send = async (payload, type = 'event') => {
if (trackingDisabled()) return; if (trackingDisabled()) return;