138 lines
3.9 KiB
TypeScript
138 lines
3.9 KiB
TypeScript
import type { FullContact } from "./types"
|
|
import { fullContactDisplayName } from "./types"
|
|
import { normalizeEmail } from "./find-contact"
|
|
|
|
/** The signal that caused two contacts to be reported as possible duplicates. */
export type DuplicateMatchReason = "email" | "phone" | "name"
|
|
|
|
/**
 * Two contacts flagged as likely duplicates of each other, together with the
 * signal that matched them.
 */
export interface DuplicatePair {
  /** First contact of the pair (the earlier list entry when produced by `findDuplicatePairs`). */
  contactA: FullContact
  /** Second contact of the pair (the later list entry when produced by `findDuplicatePairs`). */
  contactB: FullContact
  /** Why the two contacts were considered duplicates. */
  reason: DuplicateMatchReason
}
|
|
|
|
/**
 * Absolute Levenshtein cap: normalized names at most this many edits apart
 * are treated as similar, whatever their length (once the fuzzy minimum
 * length below is met).
 */
const MAX_NAME_DISTANCE = 2

/**
 * Min similarity ratio (1 - distance / maxLen). Names further apart than
 * MAX_NAME_DISTANCE still match when they reach this ratio.
 */
const MIN_NAME_SIMILARITY = 0.88

/** When the longer normalized name is shorter than this, only exact matches count. */
const MIN_NAME_LENGTH_FOR_FUZZY = 4

/** Phone values with fewer digits than this are ignored when comparing contacts. */
const MIN_PHONE_DIGITS = 6
|
|
|
|
/**
 * Computes the Levenshtein edit distance between two strings: the minimum
 * number of single-unit insertions, deletions and substitutions needed to
 * turn `a` into `b`.
 *
 * Strings are compared by UTF-16 code unit. Only two rows of the dynamic
 * programming table are kept, sized by the shorter input.
 *
 * @param a First string.
 * @param b Second string.
 * @returns The edit distance; 0 when the strings are equal.
 */
export function levenshteinDistance(a: string, b: string): number {
  if (a === b) return 0
  if (a.length === 0) return b.length
  if (b.length === 0) return a.length

  // The distance is symmetric, so lay the shorter string along the row
  // buffers to keep memory proportional to min(a.length, b.length).
  const longer = a.length >= b.length ? a : b
  const shorter = a.length >= b.length ? b : a
  const width = shorter.length + 1

  // prev[j] = distance between the processed prefix of `longer` and the first
  // j units of `shorter`; row 0 is the cost of j insertions.
  let prev = Array.from({ length: width }, (_, j) => j)
  let curr = new Array<number>(width).fill(0)

  for (let i = 1; i <= longer.length; i++) {
    const unit = longer.charCodeAt(i - 1)
    curr[0] = i
    for (let j = 1; j < width; j++) {
      const cost = unit === shorter.charCodeAt(j - 1) ? 0 : 1
      curr[j] = Math.min(
        prev[j] + 1, // deletion
        curr[j - 1] + 1, // insertion
        prev[j - 1] + cost // substitution (free when the units match)
      )
    }
    // Reuse the two buffers instead of allocating a new row per iteration.
    ;[prev, curr] = [curr, prev]
  }
  return prev[shorter.length]
}
|
|
|
|
// Matches everything that is not a letter, combining mark, number or
// whitespace in any script. Built with the RegExp constructor so the Unicode
// property escapes do not depend on the project's TypeScript compile target.
const NON_NAME_CHARS = new RegExp("[^\\p{L}\\p{M}\\p{N}\\s]", "gu")

/**
 * Canonicalizes a contact name for comparison: strips Latin-style diacritics,
 * lowercases, turns punctuation and symbols into spaces, and collapses
 * whitespace.
 *
 * Letters and digits of every script are preserved, so non-Latin names
 * (Cyrillic, Greek, CJK, ...) keep their content instead of normalizing to an
 * empty string.
 *
 * @param name Raw display name.
 * @returns The normalized name; empty when `name` has no letters or digits.
 */
export function normalizeContactName(name: string): string {
  return name
    .normalize("NFD") // split accented letters into base letter + combining mark
    .replace(/[\u0300-\u036f]/g, "") // drop the combining diacritical marks
    .toLowerCase()
    .replace(NON_NAME_CHARS, " ")
    .replace(/\s+/g, " ")
    .trim()
}
|
|
|
|
/**
 * Reduces a phone value to its ASCII digits, removing "+", spaces,
 * punctuation, letters and any other non-digit character.
 *
 * @param phone Raw phone value.
 * @returns The digits in their original order; empty when there are none.
 */
export function normalizePhone(phone: string): string {
  return phone.replace(/\D/g, "")
}
|
|
|
|
/**
 * Decides whether two contact names are close enough to suggest the same
 * person.
 *
 * Both names are normalized first. Empty names never match and identical
 * normalized names always match. Otherwise, once the longer name reaches
 * MIN_NAME_LENGTH_FOR_FUZZY characters, the names match when their edit
 * distance is at most MAX_NAME_DISTANCE or their similarity ratio
 * (1 - distance / maxLen) is at least MIN_NAME_SIMILARITY.
 *
 * @param a First raw name.
 * @param b Second raw name.
 * @returns `true` when the names are considered similar.
 */
export function areNamesSimilar(a: string, b: string): boolean {
  const na = normalizeContactName(a)
  const nb = normalizeContactName(b)
  if (!na || !nb) return false
  if (na === nb) return true

  const maxLen = Math.max(na.length, nb.length)
  // Very short names are too ambiguous for fuzzy matching.
  if (maxLen < MIN_NAME_LENGTH_FOR_FUZZY) return false

  // The edit distance can never be smaller than the length difference, so
  // when even that lower bound fails both thresholds the quadratic distance
  // computation cannot change the answer and is skipped.
  const minDistance = Math.abs(na.length - nb.length)
  if (minDistance > MAX_NAME_DISTANCE && 1 - minDistance / maxLen < MIN_NAME_SIMILARITY) {
    return false
  }

  const distance = levenshteinDistance(na, nb)
  if (distance <= MAX_NAME_DISTANCE) return true

  return 1 - distance / maxLen >= MIN_NAME_SIMILARITY
}
|
|
|
|
/**
 * Builds an order-independent key for a pair of contact ids: the two ids are
 * sorted by string comparison and joined with "|", so `(a, b)` and `(b, a)`
 * produce the same key.
 *
 * @param idA Id of one contact.
 * @param idB Id of the other contact.
 * @returns The key identifying the unordered pair.
 */
export function mergePairKey(idA: string, idB: string): string {
  return idA < idB ? `${idA}|${idB}` : `${idB}|${idA}`
}
|
|
|
|
/**
 * Returns the contact's email values passed through `normalizeEmail`, with
 * entries that normalize to an empty (falsy) value removed.
 */
function collectEmails(c: FullContact): string[] {
  return c.emails.map((e) => normalizeEmail(e.value)).filter(Boolean)
}
|
|
|
|
/**
 * Returns the contact's phone values reduced to digits, keeping only those
 * with at least MIN_PHONE_DIGITS digits so short fragments never take part
 * in matching.
 */
function collectPhones(c: FullContact): string[] {
  return c.phones
    .map((p) => normalizePhone(p.value))
    .filter((digits) => digits.length >= MIN_PHONE_DIGITS)
}
|
|
|
|
/**
 * Determines whether two contacts look like duplicates and, if so, why.
 *
 * Signals are checked in order and the first hit wins:
 *  1. "email" - the contacts share a normalized email address;
 *  2. "phone" - two of their phone numbers are equal, at most one edit
 *     apart, or at least 95% similar;
 *  3. "name"  - their display names are similar per `areNamesSimilar`.
 *
 * @returns The matching signal, or `null` when nothing matches.
 */
function findDuplicateReason(a: FullContact, b: FullContact): DuplicateMatchReason | null {
  // Similarity ratio (1 - distance / maxLen) at which two phones still match.
  const minPhoneSimilarity = 0.95

  const emailsB = new Set(collectEmails(b))
  if (collectEmails(a).some((email) => emailsB.has(email))) return "email"

  // collectPhones already drops values shorter than MIN_PHONE_DIGITS, so
  // every number compared below is long enough for fuzzy matching.
  const phonesA = collectPhones(a)
  const phonesB = collectPhones(b)
  for (const pa of phonesA) {
    for (const pb of phonesB) {
      if (pa === pb) return "phone"

      const dist = levenshteinDistance(pa, pb)
      const maxLen = Math.max(pa.length, pb.length)
      if (dist <= 1 || 1 - dist / maxLen >= minPhoneSimilarity) return "phone"
    }
  }

  const nameA = fullContactDisplayName(a)
  const nameB = fullContactDisplayName(b)
  if (nameA && nameB && areNamesSimilar(nameA, nameB)) return "name"

  return null
}
|
|
|
|
/**
 * Scans every unordered pair of contacts and collects those that look like
 * duplicates.
 *
 * @param contacts Contacts to compare against each other.
 * @param ignoredKeys Pair keys (as built by `mergePairKey`) that are skipped.
 * @param maxResults Maximum number of pairs to return; the scan stops as soon
 *   as it is reached. A value of zero or less yields an empty result.
 * @returns The detected pairs in scan order, each with its match reason.
 */
export function findDuplicatePairs(
  contacts: FullContact[],
  ignoredKeys: ReadonlySet<string> = new Set(),
  maxResults = 50
): DuplicatePair[] {
  const results: DuplicatePair[] = []
  // The cap is only checked after a push, so without this guard a
  // non-positive limit would still let the first match through.
  if (maxResults <= 0) return results

  // Keys already reported; only relevant when the input repeats an id.
  const seen = new Set<string>()

  for (let i = 0; i < contacts.length; i++) {
    const a = contacts[i]
    for (let j = i + 1; j < contacts.length; j++) {
      const b = contacts[j]
      const key = mergePairKey(a.id, b.id)
      if (seen.has(key) || ignoredKeys.has(key)) continue

      const reason = findDuplicateReason(a, b)
      if (reason) {
        seen.add(key)
        results.push({ contactA: a, contactB: b, reason })
        if (results.length >= maxResults) return results
      }
    }
  }

  return results
}
|