ultisuite-client/lib/contacts/duplicate-detection.ts
R3D347HR4Y 77f99d8d8a hehe
2026-05-19 00:48:20 +02:00

138 lines
3.9 KiB
TypeScript

import type { FullContact } from "./types"
import { fullContactDisplayName } from "./types"
import { normalizeEmail } from "./find-contact"
/** Which kind of contact data caused two contacts to be flagged as possible duplicates. */
export type DuplicateMatchReason = "email" | "phone" | "name"
/** Two contacts flagged as possible duplicates of each other, plus why they matched. */
export interface DuplicatePair {
/** First contact of the pair. */
contactA: FullContact
/** Second contact of the pair. */
contactB: FullContact
/** The kind of data (e-mail, phone or name) that produced the match. */
reason: DuplicateMatchReason
}
/**
 * Absolute Levenshtein distance cap for name matching: normalized names whose
 * distance is at or below this value are treated as similar, regardless of
 * their length (once they are long enough for fuzzy matching at all).
 */
const MAX_NAME_DISTANCE = 2
/**
 * Min similarity ratio (1 - distance/maxLen) accepted for names whose distance
 * exceeds the absolute cap above.
 */
const MIN_NAME_SIMILARITY = 0.88
/**
 * When the longer of two normalized names is shorter than this, fuzzy matching
 * is skipped and the names only match if they are exactly equal.
 */
const MIN_NAME_LENGTH_FOR_FUZZY = 4
/** Phone values with fewer digits than this are not used for phone matching. */
const MIN_PHONE_DIGITS = 6
/**
 * Computes the Levenshtein edit distance between two strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to turn `a` into `b`. Characters are
 * compared through string indexing (UTF-16 code units), and only two rows of
 * the dynamic-programming table are kept alive at any time.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The edit distance; `0` when the strings are identical.
 */
export function levenshteinDistance(a: string, b: string): number {
  // Trivial cases: identical inputs, or one side empty.
  if (a === b) return 0
  if (a.length === 0) return b.length
  if (b.length === 0) return a.length

  const width = b.length + 1

  // previousRow[col] is the distance between the already-processed prefix of
  // `a` and the first `col` characters of `b`. It starts as 0, 1, 2, ...
  let previousRow: number[] = []
  for (let col = 0; col < width; col++) previousRow.push(col)
  let currentRow: number[] = new Array<number>(width)

  for (let row = 1; row <= a.length; row++) {
    const charA = a[row - 1]
    currentRow[0] = row
    for (let col = 1; col < width; col++) {
      const deletion = previousRow[col] + 1
      const insertion = currentRow[col - 1] + 1
      const substitution = previousRow[col - 1] + (charA === b[col - 1] ? 0 : 1)
      currentRow[col] = Math.min(deletion, insertion, substitution)
    }
    // Recycle the two buffers instead of allocating a new row per iteration.
    const recycled = previousRow
    previousRow = currentRow
    currentRow = recycled
  }

  // After the final swap the completed last row lives in previousRow.
  return previousRow[width - 1]
}
/**
 * Canonicalizes a contact name so that two spellings can be compared.
 *
 * Steps, in order: decompose to NFD and drop combining diacritical marks
 * (U+0300–U+036F), lowercase, replace every character that is not `a-z`,
 * `0-9` or whitespace with a space, then collapse whitespace runs to a single
 * space and trim both ends.
 *
 * Note that letters outside `a-z` that do not decompose to an ASCII base
 * letter (e.g. non-Latin scripts) are replaced by spaces, so such names may
 * normalize to an empty string.
 *
 * @param name - Raw display name.
 * @returns The normalized name, possibly empty.
 */
export function normalizeContactName(name: string): string {
  const withoutDiacritics = name.normalize("NFD").replace(/[\u0300-\u036f]/g, "")
  const asciiOnly = withoutDiacritics.toLowerCase().replace(/[^a-z0-9\s]/g, " ")
  // Splitting on whitespace runs and dropping the empty edge pieces is
  // equivalent to collapsing the runs and trimming.
  const words = asciiOnly.split(/\s+/).filter((word) => word.length > 0)
  return words.join(" ")
}
/**
 * Reduces a phone string to its ASCII digits, dropping every other character
 * (spaces, `+`, dashes, parentheses, letters, ...).
 *
 * @param phone - Raw phone value.
 * @returns The digits in their original order, or `""` when there are none.
 */
export function normalizePhone(phone: string): string {
  const digits = phone.match(/\d/g)
  return digits === null ? "" : digits.join("")
}
/**
 * Decides whether two contact names are close enough to suggest a duplicate.
 *
 * Both names are normalized with `normalizeContactName` first. An empty
 * normalized name never matches; identical normalized names always match.
 * Otherwise, when the longer name has at least `MIN_NAME_LENGTH_FOR_FUZZY`
 * characters, the names match if their Levenshtein distance is at most
 * `MAX_NAME_DISTANCE` or their similarity ratio (1 - distance / longer length)
 * reaches `MIN_NAME_SIMILARITY`.
 *
 * @param a - First raw name.
 * @param b - Second raw name.
 * @returns `true` when the names are considered similar.
 */
export function areNamesSimilar(a: string, b: string): boolean {
  const left = normalizeContactName(a)
  const right = normalizeContactName(b)

  if (left === "" || right === "") return false
  if (left === right) return true

  // Very short names are too ambiguous for fuzzy matching.
  const longest = left.length > right.length ? left.length : right.length
  if (longest < MIN_NAME_LENGTH_FOR_FUZZY) return false

  const edits = levenshteinDistance(left, right)
  return edits <= MAX_NAME_DISTANCE || 1 - edits / longest >= MIN_NAME_SIMILARITY
}
/**
 * Builds an order-independent key for a pair of contact ids: the two ids
 * joined by `|`, with the one that compares lower (string `<`) first.
 *
 * @param idA - Id of one contact.
 * @param idB - Id of the other contact.
 * @returns The same key for `(idA, idB)` and `(idB, idA)`.
 */
export function mergePairKey(idA: string, idB: string): string {
  if (idA < idB) {
    return [idA, idB].join("|")
  }
  return [idB, idA].join("|")
}
/**
 * Returns the contact's e-mail values after passing each through
 * `normalizeEmail`, leaving out any entry whose normalized form is falsy
 * (e.g. an empty string).
 */
function collectEmails(c: FullContact): string[] {
  const normalized: string[] = []
  for (const entry of c.emails) {
    const email = normalizeEmail(entry.value)
    if (email) normalized.push(email)
  }
  return normalized
}
/**
 * Returns the contact's phone values reduced to digits via `normalizePhone`,
 * keeping only those with at least `MIN_PHONE_DIGITS` digits.
 */
function collectPhones(c: FullContact): string[] {
  const usable: string[] = []
  for (const entry of c.phones) {
    const digits = normalizePhone(entry.value)
    if (digits.length >= MIN_PHONE_DIGITS) usable.push(digits)
  }
  return usable
}
/**
 * Determines why two contacts look like duplicates, if they do.
 *
 * Checks are tried in priority order and the first hit wins:
 *  1. `"email"` – the contacts share a normalized e-mail address;
 *  2. `"phone"` – some pair of their phone numbers is identical, differs by
 *     at most one edit, or has a similarity ratio of at least 0.95;
 *  3. `"name"`  – both display names are non-empty and `areNamesSimilar`.
 *
 * @returns The matching reason, or `null` when nothing matches.
 */
function findDuplicateReason(a: FullContact, b: FullContact): DuplicateMatchReason | null {
  // 1. Shared e-mail address.
  const emailsOfA = collectEmails(a)
  const emailsOfB = collectEmails(b)
  if (emailsOfA.some((email) => emailsOfB.includes(email))) return "email"

  // 2. Identical or nearly identical phone number.
  const phonesOfA = collectPhones(a)
  const phonesOfB = collectPhones(b)
  const phonesMatch = (left: string, right: string): boolean => {
    if (left === right) return true
    if (left.length < MIN_PHONE_DIGITS || right.length < MIN_PHONE_DIGITS) return false
    const edits = levenshteinDistance(left, right)
    const longest = Math.max(left.length, right.length)
    return edits <= 1 || 1 - edits / longest >= 0.95
  }
  const sharesPhone = phonesOfA.some((left) => phonesOfB.some((right) => phonesMatch(left, right)))
  if (sharesPhone) return "phone"

  // 3. Similar display names (both must be non-empty).
  const nameA = fullContactDisplayName(a)
  const nameB = fullContactDisplayName(b)
  if (!nameA || !nameB) return null
  return areNamesSimilar(nameA, nameB) ? "name" : null
}
/**
 * Finds pairs of contacts that look like duplicates of each other.
 *
 * Every unordered pair of entries in `contacts` is examined (the number of
 * comparisons grows quadratically with the list length) until `maxResults`
 * pairs have been collected. Pairs are reported in scan order, with
 * `contactA` being the entry that appears earlier in `contacts`.
 *
 * A pair is skipped when:
 *  - both entries carry the same id (the same record listed twice is not a
 *    duplicate of itself);
 *  - its `mergePairKey` is in `ignoredKeys`;
 *  - a pair with the same key has already been reported.
 *
 * @param contacts - Contacts to scan.
 * @param ignoredKeys - Keys (as built by `mergePairKey`) of pairs to leave out.
 * @param maxResults - Maximum number of pairs to return; a value of zero or
 *   less yields an empty result.
 * @returns The detected pairs, each with the reason it was flagged.
 */
export function findDuplicatePairs(
  contacts: FullContact[],
  ignoredKeys: ReadonlySet<string> = new Set(),
  maxResults = 50
): DuplicatePair[] {
  const results: DuplicatePair[] = []
  // The cap is otherwise only checked after a push, which would let one pair
  // through even when the caller asked for none.
  if (maxResults <= 0) return results

  const seen = new Set<string>()
  for (let i = 0; i < contacts.length; i++) {
    const a = contacts[i]
    for (let j = i + 1; j < contacts.length; j++) {
      const b = contacts[j]
      // Same id on both sides means the same record appears twice in the
      // input; reporting it would suggest merging a contact with itself.
      if (a.id === b.id) continue

      const key = mergePairKey(a.id, b.id)
      if (seen.has(key) || ignoredKeys.has(key)) continue

      const reason = findDuplicateReason(a, b)
      if (!reason) continue

      seen.add(key)
      results.push({ contactA: a, contactB: b, reason })
      if (results.length >= maxResults) return results
    }
  }
  return results
}