Enhance snippet processing in mail MIME body handling
Some checks failed
E2E / Playwright e2e (push) Has been cancelled

- Introduced functions to normalize and analyze snippet text, including checks for legal footers, leading separators, and boilerplate detection.
- Added HTML entity decoding and improved CSS snippet detection.
- Updated the snippet selection logic to better filter and polish previews, ensuring cleaner output for email snippets.
This commit is contained in:
R3D347HR4Y 2026-06-04 10:49:31 +02:00
parent 8a02c10ba3
commit a4b548ca08

View File

@ -213,36 +213,110 @@ export function repairMimeBodies(
} }
} }
/**
 * Matches a run of eight or more identical separator characters
 * (`-`, `_`, `=` or `*`). `splitSnippetSegments` splits each line on it.
 *
 * The regex carries the `g` flag, so `lastIndex` is stateful when it is
 * used with `test()` / `exec()`.
 */
const SEPARATOR_RUN_RE = /-{8,}|_{8,}|={8,}|\*{8,}/g
/**
 * Canonicalises text for phrase matching.
 *
 * Lower-cases the input, turns square brackets, parentheses and middle dots
 * into spaces, collapses every whitespace run into a single space and trims
 * both ends.
 *
 * @param s Raw snippet text.
 * @returns The normalised text.
 */
function normalizeSnippetMatchText(s: string): string {
  const lowered = s.toLowerCase()
  const withoutBrackets = lowered.replace(/[[\]()·]/g, " ")
  const singleSpaced = withoutBrackets.replace(/\s+/g, " ")
  return singleSpaced.trim()
}
/**
 * Tells whether a snippet is essentially a "view this e-mail in your
 * browser" link line (French or English wording).
 *
 * A known phrase must occur in the normalised text, and the line must be
 * dominated by it: either at most 20 normalised characters remain once the
 * first occurrence of the phrase is removed, or the raw input is at most
 * 90 characters long.
 *
 * @param s Snippet text.
 * @returns `true` when the snippet should be treated as a view-in-browser line.
 */
function isViewInBrowserSnippet(s: string): boolean {
  const normalized = normalizeSnippetMatchText(s)
  const knownPhrases = [
    "afficher dans le navigateur",
    "view in browser",
    "voir ce message en ligne",
    "version en ligne",
  ]
  return knownPhrases.some((phrase) => {
    if (!normalized.includes(phrase)) return false
    const remainder = normalized.replace(phrase, "").trim()
    return remainder.length <= 20 || s.length <= 90
  })
}
/**
 * Tells whether a snippet looks like a company legal/address footer.
 *
 * The text must mention a URL (`http` or `www.`). It is then a footer when
 * at least two of the known markers occur in it, or when it starts with
 * `"sas "` and contains `http`. Matching is case-insensitive.
 *
 * NOTE(review): the marker list looks tailored to French senders and to OVH
 * in particular — confirm this is the intended scope.
 *
 * @param s Snippet text.
 * @returns `true` when the snippet should be treated as a legal footer.
 */
function isSnippetLegalFooter(s: string): boolean {
  const text = s.toLowerCase()
  const mentionsHttp = text.includes("http")
  if (!mentionsHttp && !text.includes("www.")) return false

  const markerCount = [
    "sas ",
    "sarl ",
    "rue ",
    " bp ",
    "kellermann",
    "ovh.com",
    "www.ovh",
  ].filter((marker) => text.includes(marker)).length

  return markerCount >= 2 || (text.startsWith("sas ") && mentionsHttp)
}
/**
 * Tells whether the text, after leading whitespace is dropped, opens with
 * at least twelve consecutive separator characters.
 *
 * NOTE(review): the separator set also contains an apostrophe and lists `=`
 * twice — confirm the apostrophe is intentional.
 *
 * @param s Snippet text.
 * @returns `true` when the leading separator run is 12 characters or longer.
 */
function hasLeadingSeparatorRun(s: string): boolean {
  // Array.from iterates by code point, like a for...of over the string.
  const chars = Array.from(s.trimStart())
  const firstOther = chars.findIndex((ch) => !"-_*='=·—".includes(ch))
  const runLength = firstOther === -1 ? chars.length : firstOther
  return runLength >= 12
}
/**
 * Heuristically tells whether a snippet is leaked CSS rather than prose.
 *
 * The check is case-insensitive for keywords and succeeds when the text:
 * - contains a well-known CSS token (`:root`, `color-scheme:`, `@media`,
 *   `@font-face`, `font-family:`);
 * - contains "meta for business" together with an opening brace;
 * - contains one of the comment-like markers `/*//` or `||//`;
 * - contains `{`, `}` and `;` plus a `font-`, `margin:` or `padding:`
 *   declaration; or
 * - starts (after optional whitespace) with a `/*` comment opener.
 *
 * NOTE(review): this function looks for `||//` while `stripSnippetCssTail`
 * cuts at `//||` — confirm both spellings are intended.
 *
 * @param s Snippet text.
 * @returns `true` when the snippet looks like CSS.
 */
function looksLikeCssSnippet(s: string): boolean {
  const lower = s.toLowerCase()

  const cssTokens = [
    ":root",
    "color-scheme:",
    "@media",
    "@font-face",
    "font-family:",
    "/*//",
    "||//",
  ]
  if (cssTokens.some((token) => lower.includes(token))) return true

  if (lower.includes("meta for business") && s.includes("{")) return true

  const hasRuleSyntax = s.includes("{") && s.includes("}") && s.includes(";")
  const hasDeclaration =
    lower.includes("font-") ||
    lower.includes("margin:") ||
    lower.includes("padding:")
  if (hasRuleSyntax && hasDeclaration) return true

  return /^\s*\/\*/.test(s)
}
/**
 * Tells whether the text still contains an HTML character reference
 * (anything `HTML_ENTITY_RE` matches).
 *
 * `HTML_ENTITY_RE` carries the `g` flag, so calling `test()` on it would
 * resume from the `lastIndex` left by a previous match and could miss an
 * entity in the next string. `String.prototype.search` always scans from
 * the start and leaves `lastIndex` untouched.
 *
 * @param s Snippet text.
 * @returns `true` when at least one entity-looking sequence is present.
 */
function hasUndecodedHtmlEntities(s: string): boolean {
  return s.search(HTML_ENTITY_RE) !== -1
}
/**
 * Tells whether a line consists mostly of separator characters
 * (`-`, `_`, `*`, `=`, `·`, `—`, `|`).
 *
 * Strings shorter than 8 UTF-16 code units are never separators. Otherwise
 * the line qualifies when separators make up at least 55% of its code
 * points.
 *
 * @param s Snippet line.
 * @returns `true` when the line is mostly separators.
 */
function isMostlySeparatorLine(s: string): boolean {
  if (s.length < 8) return false
  const separatorCount = (s.match(/[-_*=·—|]/g) ?? []).length
  // Spread counts code points, so astral characters are not counted twice.
  return separatorCount / [...s].length >= 0.55
}
/**
 * Tells whether a candidate snippet is boilerplate that should not be shown
 * as an e-mail preview.
 *
 * The input is trimmed and passed through `stripHtmlTagsForSnippet`; the
 * result is boilerplate when any of the following holds:
 * - it is empty or shorter than 4 characters;
 * - it looks like CSS, is mostly separators, or opens with a separator run;
 * - it is a view-in-browser line or a legal footer;
 * - the raw input still contains an HTML tag or an HTML entity;
 * - it contains a "click here"-style phrase and is shorter than 200 characters;
 * - it starts with an http(s) URL and is shorter than 120 characters;
 * - fewer than 35% of its code points are letters or digits.
 *
 * @param s Candidate snippet (raw or already cleaned).
 * @returns `true` when the snippet should be discarded.
 */
function isSnippetBoilerplate(s: string): boolean {
  const t = stripHtmlTagsForSnippet(s.trim())
  if (!t || t.length < 4) return true
  if (
    looksLikeCssSnippet(t) ||
    isMostlySeparatorLine(t) ||
    hasLeadingSeparatorRun(t)
  ) {
    return true
  }
  if (isViewInBrowserSnippet(t) || isSnippetLegalFooter(t)) return true

  // These two checks run on the raw input: leftover markup means the caller
  // handed over text that was not cleaned.
  if (/<[^>]+>/.test(s)) return true
  if (hasUndecodedHtmlEntities(s)) return true

  const lower = t.toLowerCase()
  const phrases = ["si vous ne visualisez pas", "cliquer ici", "click here"]
  if (phrases.some((p) => lower.includes(p)) && t.length < 200) return true

  const startsWithUrl =
    lower.startsWith("http://") || lower.startsWith("https://")
  if (startsWithUrl && t.length < 120) return true

  const letters = (t.match(/\p{L}|\p{N}/gu) ?? []).length
  return letters / [...t].length < 0.35
}
@ -251,7 +325,7 @@ function pickBestSnippetLine(lines: string[]): string {
let best = "" let best = ""
let bestScore = -1 let bestScore = -1
for (const line of lines) { for (const line of lines) {
const t = line.trim() const t = stripHtmlTagsForSnippet(line.trim())
if (!t || isSnippetBoilerplate(t)) continue if (!t || isSnippetBoilerplate(t)) continue
const letters = (t.match(/\p{L}/gu) ?? []).length const letters = (t.match(/\p{L}/gu) ?? []).length
if (letters < 8) continue if (letters < 8) continue
@ -265,27 +339,72 @@ function pickBestSnippetLine(lines: string[]): string {
return best return best
} }
/**
 * Matches one HTML character reference: a numeric reference (`&#233;`,
 * `&#xe9;`) or a named entity made of a letter followed by 1–8 letters or
 * digits (`&amp;`). Case-insensitive.
 *
 * The numeric alternative is deliberately loose (`&#abc;` also matches);
 * the decoder leaves such sequences untouched.
 *
 * The `g` flag makes `lastIndex` stateful for `test()` / `exec()`.
 */
const HTML_ENTITY_RE =
  /&(?:#x?[0-9a-f]+|[a-z][a-z0-9]{1,8});/gi

/** Named entities decoded when no DOM is available (keys are lower-case). */
const SNIPPET_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["&nbsp;", " "],
  ["&amp;", "&"],
  ["&lt;", "<"],
  ["&gt;", ">"],
  ["&quot;", '"'],
  ["&apos;", "'"],
])

/**
 * Converts the digits of a numeric character reference into a string.
 * NUL, surrogates and values above U+10FFFF become U+FFFD instead of
 * reaching `String.fromCodePoint`, which throws a `RangeError` for values
 * above U+10FFFF.
 */
function decodeNumericEntityForSnippet(digits: string, radix: 10 | 16): string {
  const codePoint = Number.parseInt(digits, radix)
  const isScalarValue =
    Number.isInteger(codePoint) &&
    codePoint > 0 &&
    codePoint <= 0x10ffff &&
    (codePoint < 0xd800 || codePoint > 0xdfff)
  return isScalarValue ? String.fromCodePoint(codePoint) : "\uFFFD"
}

/**
 * Decodes HTML character references in snippet text.
 *
 * When a `document` global exists, each entity is decoded by a scratch
 * `<textarea>`. Otherwise a small built-in table of named entities plus
 * decimal and hexadecimal numeric references is used; unknown entities are
 * left as they are.
 *
 * Decoding is repeated up to four times so that multiply-encoded text such
 * as `&amp;lt;` ends up fully decoded; the loop stops early once a pass
 * changes nothing.
 *
 * @param s Text that may contain HTML entities.
 * @returns The decoded text. Never throws on malformed numeric references.
 */
function decodeHtmlEntitiesForSnippet(s: string): string {
  // One scratch element per call is enough; it is overwritten per entity.
  const scratch =
    typeof document !== "undefined" ? document.createElement("textarea") : null

  const decodeOne = (entity: string): string => {
    if (scratch) {
      scratch.innerHTML = entity
      return scratch.value
    }
    const named = SNIPPET_NAMED_ENTITIES.get(entity.toLowerCase())
    if (named !== undefined) return named
    const dec = /^&#(\d+);$/.exec(entity)
    if (dec?.[1] !== undefined) return decodeNumericEntityForSnippet(dec[1], 10)
    const hex = /^&#x([0-9a-f]+);$/i.exec(entity)
    if (hex?.[1] !== undefined) return decodeNumericEntityForSnippet(hex[1], 16)
    return entity
  }

  let out = s
  for (let pass = 0; pass < 4; pass++) {
    const next = out.replace(HTML_ENTITY_RE, decodeOne)
    if (next === out) break
    out = next
  }
  return out
}
/**
 * Cuts off a trailing CSS/comment fragment from snippet text.
 *
 * The markers `/*//`, `/*` and `//||` are tried in that order. For the
 * first marker that occurs in the text and is preceded by at least 12
 * characters of trimmed text, that leading text is returned with any
 * trailing spaces, slashes, asterisks, hyphens, underscores and pipes
 * removed. When no marker qualifies, the input is returned unchanged.
 *
 * @param s Snippet text.
 * @returns The text without its CSS tail, or `s` itself.
 */
function stripSnippetCssTail(s: string): string {
  const usableHead = ["/*//", "/*", "//||"]
    .map((marker) => s.indexOf(marker))
    .filter((cut) => cut >= 0)
    .map((cut) => s.slice(0, cut).trim())
    .find((head) => head.length >= 12)

  if (usableHead === undefined) return s
  return usableHead.replace(/[ /*\-_|]+$/g, "")
}
/**
 * Breaks snippet text into candidate segments.
 *
 * CRLF line endings are normalised to LF, the text is split into lines, and
 * each line is further split on separator runs (`SEPARATOR_RUN_RE`). Every
 * piece is trimmed and cleaned with `stripHtmlTagsForSnippet`; pieces that
 * end up empty are dropped.
 *
 * @param s Snippet text.
 * @returns The non-empty cleaned segments, in document order.
 */
function splitSnippetSegments(s: string): string[] {
  const segments: string[] = []
  const lines = s.replace(/\r\n/g, "\n").split("\n")
  for (const line of lines) {
    for (const piece of line.split(SEPARATOR_RUN_RE)) {
      const cleaned = stripHtmlTagsForSnippet(piece.trim())
      if (cleaned) segments.push(cleaned)
    }
  }
  return segments
}
/**
 * Reduces an HTML fragment to single-line plain text for snippet use.
 *
 * Steps, in order:
 * 1. drop `<style>…</style>` elements together with their content;
 * 2. replace every remaining tag with a space;
 * 3. decode HTML entities (`decodeHtmlEntitiesForSnippet`);
 * 4. cut off a trailing CSS/comment fragment (`stripSnippetCssTail`);
 * 5. collapse whitespace runs (including newlines) to one space and trim.
 *
 * @param s HTML or plain text.
 * @returns The cleaned, whitespace-normalised text.
 */
function stripHtmlTagsForSnippet(s: string): string {
  const withoutTags = s
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]+>/g, " ")
  const decoded = decodeHtmlEntitiesForSnippet(withoutTags)
  const withoutCssTail = stripSnippetCssTail(decoded)
  return withoutCssTail.replace(/\s+/g, " ").trim()
}
function polishSnippetPreview(snippet: string): string { function polishSnippetPreview(snippet: string): string {
const cleaned = stripHtmlTagsForSnippet(snippet) const cleaned = stripHtmlTagsForSnippet(snippet)
const lines = cleaned.replace(/\r\n/g, "\n").split("\n") const best = pickBestSnippetLine(splitSnippetSegments(cleaned))
const best = pickBestSnippetLine(lines)
if (best) return best.length > 200 ? best.slice(0, 200) : best if (best) return best.length > 200 ? best.slice(0, 200) : best
if (isSnippetBoilerplate(cleaned)) return "" if (isSnippetBoilerplate(cleaned)) return ""
return cleaned.length > 200 ? cleaned.slice(0, 200) : cleaned return cleaned.length > 200 ? cleaned.slice(0, 200) : cleaned