This commit is contained in:
Lokimorty 2025-12-04 14:35:00 +08:00 committed by GitHub
commit 0e0a0795b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 306 additions and 39 deletions

View file

@ -39,6 +39,7 @@ CREATE TABLE umami.website_event
event_name String,
tag String,
distinct_id String,
visitor_id String,
created_at DateTime('UTC'),
job_id Nullable(UUID)
)
@ -123,6 +124,7 @@ CREATE TABLE umami.website_event_stats_hourly
max_time SimpleAggregateFunction(max, DateTime('UTC')),
tag SimpleAggregateFunction(groupArrayArray, Array(String)),
distinct_id String,
visitor_id String,
created_at Datetime('UTC')
)
ENGINE = AggregatingMergeTree
@ -176,6 +178,7 @@ SELECT
max_time,
tag,
distinct_id,
visitor_id,
timestamp as created_at
FROM (SELECT
website_id,
@ -214,6 +217,7 @@ FROM (SELECT
max(created_at) max_time,
arrayFilter(x -> x != '', groupArray(tag)) tag,
distinct_id,
visitor_id,
toStartOfHour(created_at) timestamp
FROM umami.website_event
GROUP BY website_id,
@ -230,6 +234,7 @@ GROUP BY website_id,
city,
event_type,
distinct_id,
visitor_id,
timestamp);
-- projections
@ -281,3 +286,16 @@ JOIN (SELECT event_id, string_value as currency
WHERE positionCaseInsensitive(data_key, 'currency') > 0) c
ON c.event_id = ed.event_id
WHERE positionCaseInsensitive(data_key, 'revenue') > 0;
-- identity linking
CREATE TABLE umami.identity_link
(
website_id UUID,
visitor_id String,
distinct_id String,
created_at DateTime('UTC'),
linked_at DateTime('UTC')
)
ENGINE = ReplacingMergeTree(linked_at)
ORDER BY (website_id, visitor_id, distinct_id)
SETTINGS index_granularity = 8192;

View file

@ -43,6 +43,7 @@ model Session {
region String? @db.VarChar(20)
city String? @db.VarChar(50)
distinctId String? @map("distinct_id") @db.VarChar(50)
visitorId String? @map("visitor_id") @db.VarChar(50)
createdAt DateTime? @default(now()) @map("created_at") @db.Timestamptz(6)
websiteEvents WebsiteEvent[]
@ -60,6 +61,7 @@ model Session {
@@index([websiteId, createdAt, country])
@@index([websiteId, createdAt, region])
@@index([websiteId, createdAt, city])
@@index([websiteId, visitorId])
@@map("session")
}
@ -84,6 +86,7 @@ model Website {
revenue Revenue[]
segments Segment[]
sessionData SessionData[]
identityLinks IdentityLink[]
@@index([userId])
@@index([teamId])
@ -316,3 +319,19 @@ model Pixel {
@@index([createdAt])
@@map("pixel")
}
model IdentityLink {
id String @id @unique @map("identity_link_id") @db.Uuid
websiteId String @map("website_id") @db.Uuid
visitorId String @map("visitor_id") @db.VarChar(50)
distinctId String @map("distinct_id") @db.VarChar(50)
createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(6)
linkedAt DateTime @default(now()) @updatedAt @map("linked_at") @db.Timestamptz(6)
website Website @relation(fields: [websiteId], references: [id], onDelete: Cascade)
@@unique([websiteId, visitorId, distinctId])
@@index([websiteId, distinctId])
@@index([websiteId, visitorId])
@@map("identity_link")
}

View file

@ -11,7 +11,7 @@ import { secret, uuid, hash } from '@/lib/crypto';
import { COLLECTION_TYPE, EVENT_TYPE } from '@/lib/constants';
import { anyObjectParam, urlOrPathParam } from '@/lib/schema';
import { safeDecodeURI, safeDecodeURIComponent } from '@/lib/url';
import { createSession, saveEvent, saveSessionData } from '@/queries/sql';
import { createSession, saveEvent, saveSessionData, createIdentityLink } from '@/queries/sql';
import { serializeError } from 'serialize-error';
interface Cache {
@ -41,6 +41,7 @@ const schema = z.object({
userAgent: z.string().optional(),
timestamp: z.coerce.number().int().optional(),
id: z.string().optional(),
vid: z.string().max(50).optional(),
})
.refine(
data => {
@ -80,6 +81,7 @@ export async function POST(request: Request) {
tag,
timestamp,
id,
vid: visitorId,
} = payload;
const sourceId = websiteId || pixelId || linkId;
@ -146,6 +148,7 @@ export async function POST(request: Request) {
region,
city,
distinctId: id,
visitorId,
createdAt,
});
}
@ -226,6 +229,7 @@ export async function POST(request: Request) {
// Session
distinctId: id,
visitorId,
browser,
os,
device,
@ -265,6 +269,19 @@ export async function POST(request: Request) {
createdAt,
});
}
// Create identity link when both visitorId and distinctId are present
// Fire-and-forget to avoid adding latency to the tracking endpoint
if (visitorId && id && websiteId) {
createIdentityLink({
websiteId,
visitorId,
distinctId: id,
}).catch(e => {
// eslint-disable-next-line no-console
console.error('Failed to create identity link:', e);
});
}
}
const token = createToken({ websiteId, sessionId, visitId, iat }, secret());

View file

@ -25,6 +25,7 @@ export interface SaveEventArgs {
// Session
distinctId?: string;
visitorId?: string;
browser?: string;
os?: string;
device?: string;
@ -164,6 +165,7 @@ async function clickhouseQuery({
referrerQuery,
referrerDomain,
distinctId,
visitorId,
browser,
os,
device,
@ -220,6 +222,7 @@ async function clickhouseQuery({
event_name: eventName ? eventName?.substring(0, EVENT_NAME_LENGTH) : null,
tag: tag,
distinct_id: distinctId,
visitor_id: visitorId,
created_at: getUTCString(createdAt),
browser,
os,

View file

@ -37,7 +37,7 @@ async function relationalQuery(
`
select
cast(coalesce(sum(t.c), 0) as bigint) as "pageviews",
count(distinct t.session_id) as "visitors",
count(distinct coalesce(t.resolved_identity, t.visitor_id, t.session_id::text)) as "visitors",
count(distinct t.visit_id) as "visits",
coalesce(sum(case when t.c = 1 then 1 else 0 end), 0) as "bounces",
cast(coalesce(sum(${getTimestampDiffSQL('t.min_time', 't.max_time')}), 0) as bigint) as "totaltime"
@ -45,17 +45,23 @@ async function relationalQuery(
select
website_event.session_id,
website_event.visit_id,
session.visitor_id,
il.distinct_id as "resolved_identity",
count(*) as "c",
min(website_event.created_at) as "min_time",
max(website_event.created_at) as "max_time"
from website_event
${cohortQuery}
${joinSessionQuery}
left join session on session.session_id = website_event.session_id
and session.website_id = website_event.website_id
left join identity_link il on il.visitor_id = session.visitor_id
and il.website_id = session.website_id
where website_event.website_id = {{websiteId::uuid}}
and website_event.created_at between {{startDate}} and {{endDate}}
and website_event.event_type != 2
${filterQuery}
group by 1, 2
group by 1, 2, 3, 4
) as t
`,
queryParams,
@ -79,47 +85,55 @@ async function clickhouseQuery(
sql = `
select
sum(t.c) as "pageviews",
uniq(t.session_id) as "visitors",
uniq(coalesce(t.resolved_identity, t.visitor_id, toString(t.session_id))) as "visitors",
uniq(t.visit_id) as "visits",
sum(if(t.c = 1, 1, 0)) as "bounces",
sum(max_time-min_time) as "totaltime"
from (
select
session_id,
visit_id,
we.session_id,
we.visit_id,
we.visitor_id,
il.distinct_id as resolved_identity,
count(*) c,
min(created_at) min_time,
max(created_at) max_time
from website_event
min(we.created_at) min_time,
max(we.created_at) max_time
from website_event we
${cohortQuery}
where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type != 2
left join identity_link final il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery}
group by session_id, visit_id
group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id
) as t;
`;
} else {
sql = `
select
sum(t.c) as "pageviews",
uniq(session_id) as "visitors",
uniq(coalesce(resolved_identity, visitor_id, toString(session_id))) as "visitors",
uniq(visit_id) as "visits",
sumIf(1, t.c = 1) as "bounces",
sum(max_time-min_time) as "totaltime"
from (select
session_id,
visit_id,
sum(views) c,
min(min_time) min_time,
max(max_time) max_time
from website_event_stats_hourly "website_event"
we.session_id,
we.visit_id,
we.visitor_id,
il.distinct_id as resolved_identity,
sum(we.views) c,
min(we.min_time) min_time,
max(we.max_time) max_time
from website_event_stats_hourly we
${cohortQuery}
where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type != 2
left join identity_link final il on il.visitor_id = we.visitor_id
and il.website_id = we.website_id
where we.website_id = {websiteId:UUID}
and we.created_at between {startDate:DateTime64} and {endDate:DateTime64}
and we.event_type != 2
${filterQuery}
group by session_id, visit_id
group by we.session_id, we.visit_id, we.visitor_id, il.distinct_id
) as t;
`;
}

View file

@ -0,0 +1,76 @@
/**
* Identity Stitching - Links anonymous browser sessions to authenticated user identities
*
* Design decisions:
* - One visitor can link to multiple distinct_ids (user logs into different accounts)
* - One distinct_id can link to multiple visitors (user on multiple devices/browsers)
* - Links are additive and never invalidated (preserves historical journey)
* - Uses ReplacingMergeTree in ClickHouse with linked_at for deduplication
* - Upsert pattern ensures idempotency for repeated identify() calls
*
* Edge cases handled:
* - Safari private browsing: visitorId will be undefined, no link created
* - localStorage cleared: new visitorId generated, creates new link
* - Multiple tabs: same visitorId shared via localStorage
*/
import { uuid } from '@/lib/crypto';
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import kafka from '@/lib/kafka';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
export interface CreateIdentityLinkArgs {
websiteId: string;
visitorId: string;
distinctId: string;
}
export async function createIdentityLink(data: CreateIdentityLinkArgs) {
return runQuery({
[PRISMA]: () => relationalQuery(data),
[CLICKHOUSE]: () => clickhouseQuery(data),
});
}
async function relationalQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
const { client } = prisma;
return client.identityLink.upsert({
where: {
websiteId_visitorId_distinctId: {
websiteId,
visitorId,
distinctId,
},
},
update: {
linkedAt: new Date(),
},
create: {
id: uuid(),
websiteId,
visitorId,
distinctId,
},
});
}
async function clickhouseQuery({ websiteId, visitorId, distinctId }: CreateIdentityLinkArgs) {
const { insert, getUTCString } = clickhouse;
const { sendMessage } = kafka;
const now = getUTCString(new Date());
const message = {
website_id: websiteId,
visitor_id: visitorId,
distinct_id: distinctId,
created_at: now,
linked_at: now,
};
if (kafka.enabled) {
await sendMessage('identity_link', message);
} else {
await insert('identity_link', [message]);
}
}

View file

@ -0,0 +1,71 @@
/**
* Resolves all visitor IDs linked to a given distinct_id (authenticated user)
*
* Use cases (for future implementation):
* - User journey reports: aggregate sessions across devices
* - Cohort analysis: include all linked sessions
* - Retroactive attribution: credit conversions to original anonymous session
*
* Note: Uses FINAL keyword in ClickHouse to ensure deduplication from ReplacingMergeTree
*/
import prisma from '@/lib/prisma';
import clickhouse from '@/lib/clickhouse';
import { CLICKHOUSE, PRISMA, runQuery } from '@/lib/db';
export interface GetLinkedVisitorIdsArgs {
websiteId: string;
distinctId: string;
}
export interface LinkedVisitorId {
visitorId: string;
linkedAt: Date;
}
export async function getLinkedVisitorIds(
data: GetLinkedVisitorIdsArgs,
): Promise<LinkedVisitorId[]> {
return runQuery({
[PRISMA]: () => relationalQuery(data),
[CLICKHOUSE]: () => clickhouseQuery(data),
});
}
async function relationalQuery({
websiteId,
distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
const { client } = prisma;
const links = await client.identityLink.findMany({
where: {
websiteId,
distinctId,
},
select: {
visitorId: true,
linkedAt: true,
},
});
return links;
}
async function clickhouseQuery({
websiteId,
distinctId,
}: GetLinkedVisitorIdsArgs): Promise<LinkedVisitorId[]> {
const { rawQuery } = clickhouse;
return rawQuery<LinkedVisitorId[]>(
`
select
visitor_id as visitorId,
linked_at as linkedAt
from identity_link final
where website_id = {websiteId:UUID}
and distinct_id = {distinctId:String}
`,
{ websiteId, distinctId },
);
}

View file

@ -0,0 +1,2 @@
export * from './createIdentityLink';
export * from './getLinkedVisitorIds';

View file

@ -39,3 +39,5 @@ export * from './getValues';
export * from './getWebsiteDateRange';
export * from './getWebsiteStats';
export * from './getWeeklyTraffic';
export * from './identity/createIdentityLink';
export * from './identity/getLinkedVisitorIds';

View file

@ -20,6 +20,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
region,
city,
distinct_id,
visitor_id,
created_at
)
values (
@ -34,6 +35,7 @@ export async function createSession(data: Prisma.SessionCreateInput) {
{{region}},
{{city}},
{{distinctId}},
{{visitorId}},
{{createdAt}}
)
on conflict (session_id) do nothing

View file

@ -29,6 +29,7 @@
const excludeHash = attr(_data + 'exclude-hash') === _true;
const domain = attr(_data + 'domains') || '';
const credentials = attr(_data + 'fetch-credentials') || 'omit';
const identityStitching = attr(_data + 'identity-stitching') !== _false;
const domains = domain.split(',').map(n => n.trim());
const host =
@ -41,6 +42,38 @@
/* Helper functions */
/**
* Identity Stitching: Generates a persistent visitor ID stored in localStorage.
* When combined with identify(), links anonymous sessions to authenticated users.
* Gracefully degrades when localStorage is unavailable (Safari private browsing).
*/
const generateUUID = () =>
'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => {
const r = (Math.random() * 16) | 0;
return (c === 'x' ? r : (r & 0x3) | 0x8).toString(16);
});
const getVisitorId = () => {
if (!identityStitching || !localStorage) return undefined;
try {
const storageKey = 'umami.visitor';
let vid = localStorage.getItem(storageKey);
if (!vid) {
vid = typeof crypto !== 'undefined' && crypto.randomUUID ? crypto.randomUUID() : generateUUID();
localStorage.setItem(storageKey, vid);
}
return vid;
} catch {
// localStorage access throws in Safari private browsing
return undefined;
}
};
const visitorId = getVisitorId();
const normalize = raw => {
if (!raw) return raw;
try {
@ -63,6 +96,7 @@
referrer: currentRef,
tag,
id: identity ? identity : undefined,
vid: visitorId,
});
const hasDoNotTrack = () => {
@ -141,12 +175,21 @@
/* Tracking functions */
const trackingDisabled = () =>
const trackingDisabled = () => {
let storageDisabled = false;
try {
storageDisabled = localStorage && localStorage.getItem('umami.disabled');
} catch {
// localStorage throws in Safari private browsing
}
return (
disabled ||
!website ||
(localStorage && localStorage.getItem('umami.disabled')) ||
storageDisabled ||
(domain && !domains.includes(hostname)) ||
(dnt && hasDoNotTrack());
(dnt && hasDoNotTrack())
);
};
const send = async (payload, type = 'event') => {
if (trackingDisabled()) return;