Enhance snippet processing in mail MIME body handling

- Introduced functions to normalize and analyze snippet text, including checks for legal footers, leading separators, and boilerplate detection.
- Added HTML entity decoding and improved CSS snippet detection.
- Updated the snippet selection logic to better filter and polish previews, ensuring cleaner output for email snippets.
2026-06-04 10:49:31 +02:00 · 2026-06-04 10:49:31 +02:00 · a4b548ca08
commit a4b548ca08
parent 8a02c10ba3
1 changed files with 145 additions and 26 deletions
--- a/lib/mail-mime-body.ts
+++ b/lib/mail-mime-body.ts
@ -213,36 +213,110 @@ export function repairMimeBodies(
  }
 }

+/** Matches a run of eight or more identical `-`, `_`, `=` or `*` characters (global flag set). */
+const SEPARATOR_RUN_RE = /-{8,}|_{8,}|={8,}|\*{8,}/g
+
+/**
+ * Normalizes text for phrase matching: lowercases it, replaces `[`, `]`, `(`,
+ * `)` and `·` with spaces, collapses each whitespace run to a single space,
+ * and trims both ends.
+ */
+function normalizeSnippetMatchText(s: string): string {
+  return s
+    .toLowerCase()
+    .replace(/[[\]()·]/g, " ")
+    .replace(/\s+/g, " ")
+    .trim()
+}
+
+/**
+ * Detects "view in browser" style lines (French and English phrasings).
+ *
+ * Returns true when the normalized text contains one of the phrases and either
+ * at most 20 characters remain once the first occurrence of that phrase is
+ * removed, or the original string `s` is at most 90 characters long.
+ */
+function isViewInBrowserSnippet(s: string): boolean {
+  const norm = normalizeSnippetMatchText(s)
+  const phrases = [
+    "afficher dans le navigateur",
+    "view in browser",
+    "voir ce message en ligne",
+    "version en ligne",
+  ]
+  for (const phrase of phrases) {
+    if (!norm.includes(phrase)) continue
+    // A string pattern makes `replace` drop only the first occurrence.
+    const rest = norm.replace(phrase, "").trim()
+    if (rest.length <= 20 || s.length <= 90) return true
+  }
+  return false
+}
+
+/**
+ * Heuristic for company/legal footer lines.
+ *
+ * Text without "http" or "www." (case-insensitive) is never a footer. Otherwise
+ * the text is a footer when at least two of the marker substrings occur in it,
+ * or when it starts with "sas " and contains "http".
+ */
+function isSnippetLegalFooter(s: string): boolean {
+  const lower = s.toLowerCase()
+  if (!lower.includes("http") && !lower.includes("www.")) return false
+  const markers = [
+    "sas ",
+    "sarl ",
+    "rue ",
+    " bp ",
+    "kellermann",
+    "ovh.com",
+    "www.ovh",
+  ]
+  // Each marker counts once, however many times it appears in the text.
+  let hits = 0
+  for (const m of markers) {
+    if (lower.includes(m)) hits++
+  }
+  if (hits >= 2) return true
+  return lower.startsWith("sas ") && lower.includes("http")
+}
+
+/**
+ * Returns true when `s`, after leading whitespace is removed, starts with at
+ * least 12 consecutive characters taken from the separator set below (the
+ * characters may be mixed; they need not be identical).
+ */
+function hasLeadingSeparatorRun(s: string): boolean {
+  const trimmed = s.trimStart()
+  let run = 0
+  for (const ch of trimmed) {
+    // NOTE(review): the set includes an apostrophe and lists `=` twice —
+    // possibly a typo; confirm whether `'` is meant to count as a separator.
+    if ("-_*='=·—".includes(ch)) run++
+    else break
+  }
+  return run >= 12
+}
+
+/**
+ * Heuristic for text that is CSS (or CSS-comment residue) rather than prose.
+ * Returns true as soon as any one of the substring or pattern checks below
+ * matches; most checks are case-insensitive via `lower`.
+ */
 function looksLikeCssSnippet(s: string): boolean {
  const lower = s.toLowerCase()
  return (
    lower.includes(":root") ||
    lower.includes("color-scheme:") ||
    lower.includes("@media") ||
-    (s.includes("{") && s.includes("}") && s.split(";").length >= 3) ||
+    lower.includes("@font-face") ||
+    lower.includes("font-family:") ||
+    // NOTE(review): this clause can never decide the result — any text
+    // containing ":root" already returned true on the first check above.
+    (lower.includes("facebook") && lower.includes(":root")) ||
+    (lower.includes("meta for business") && s.includes("{")) ||
+    lower.includes("/*//") ||
+    lower.includes("||//") ||
+    // Braces plus a semicolon only count when a common CSS property is present.
+    (s.includes("{") &&
+      s.includes("}") &&
+      s.includes(";") &&
+      (lower.includes("font-") ||
+        lower.includes("margin:") ||
+        lower.includes("padding:"))) ||
    /^\s*\/\*/.test(s)
  )
 }

+/**
+ * Reports whether `s` contains at least one HTML character reference as
+ * matched by `HTML_ENTITY_RE`.
+ *
+ * @param s - Text to inspect.
+ * @returns `true` when an entity-like sequence occurs anywhere in `s`.
+ */
+function hasUndecodedHtmlEntities(s: string): boolean {
+  // HTML_ENTITY_RE carries the `g` flag, so `.test()` would resume scanning
+  // from the `lastIndex` left behind by a previous successful call and could
+  // miss an entity in the next string. `search` always scans from index 0 and
+  // leaves `lastIndex` untouched.
+  return s.search(HTML_ENTITY_RE) !== -1
+}
+
+/**
+ * Returns true when `s` is at least 8 UTF-16 units long and separator
+ * characters (`-`, `_`, `*`, `=`, `·`, `—`, `|`) make up at least 55% of its
+ * code points.
+ */
 function isMostlySeparatorLine(s: string): boolean {
  if (s.length < 8) return false
-  const sep = (s.match(/[-_*=·—]/g) ?? []).length
-  return sep / s.length >= 0.6
+  const sep = (s.match(/[-_*=·—|]/g) ?? []).length
+  return sep / [...s].length >= 0.55
 }

+/**
+ * Decides whether a candidate snippet should be discarded as boilerplate.
+ *
+ * The text is first trimmed and passed through `stripHtmlTagsForSnippet`. It is
+ * boilerplate when the result is empty or shorter than 4 characters, when any
+ * of the CSS / separator / view-in-browser / legal-footer heuristics match,
+ * when the raw input `s` still contains a tag or an HTML entity, when a
+ * "click here" style phrase occurs in text shorter than 200 characters, when
+ * the text starts with an http(s) URL and is shorter than 120 characters, or
+ * when fewer than 35% of its code points are letters or digits.
+ */
 function isSnippetBoilerplate(s: string): boolean {
  const t = stripHtmlTagsForSnippet(s.trim())
  if (!t || t.length < 4) return true
-  if (looksLikeCssSnippet(t) || isMostlySeparatorLine(t)) return true
+  if (looksLikeCssSnippet(t) || isMostlySeparatorLine(t) || hasLeadingSeparatorRun(t))
+    return true
+  if (isViewInBrowserSnippet(t) || isSnippetLegalFooter(t)) return true
  if (/<[^>]+>/.test(s)) return true
+  // Checked against the raw input `s`, not the stripped text `t`.
+  if (hasUndecodedHtmlEntities(s)) return true
  const lower = t.toLowerCase()
-  const phrases = [
-    "afficher dans le navigateur",
-    "view in browser",
-    "si vous ne visualisez pas",
-    "cliquer ici",
-  ]
-  if (phrases.some((p) => lower.includes(p)) && t.length < 160) return true
+  const phrases = ["si vous ne visualisez pas", "cliquer ici", "click here"]
+  if (phrases.some((p) => lower.includes(p)) && t.length < 200) return true
+  if (
+    (lower.startsWith("http://") || lower.startsWith("https://")) &&
+    t.length < 120
+  )
+    return true
  const letters = (t.match(/\p{L}|\p{N}/gu) ?? []).length
  return letters / [...t].length < 0.35
 }
@ -251,7 +325,7 @@ function pickBestSnippetLine(lines: string[]): string {
  let best = ""
  let bestScore = -1
  for (const line of lines) {
-    const t = line.trim()
+    const t = stripHtmlTagsForSnippet(line.trim())
    if (!t || isSnippetBoilerplate(t)) continue
    const letters = (t.match(/\p{L}/gu) ?? []).length
    if (letters < 8) continue
@ -265,27 +339,72 @@ function pickBestSnippetLine(lines: string[]): string {
  return best
 }

+/**
+ * Matches HTML character references: numeric (`&#123;`, `&#x1f;`) and named
+ * ones whose name is a letter followed by 1–8 letters or digits (`&amp;`).
+ * Case-insensitive. The `g` flag makes `.test()` / `.exec()` on this shared
+ * object stateful through `lastIndex`.
+ */
+const HTML_ENTITY_RE =
+  /&(?:#x?[0-9a-f]+|[a-z][a-z0-9]{1,8});/gi
+
+/**
+ * Decodes HTML character references in snippet text.
+ *
+ * When a DOM is available each entity is decoded through a detached
+ * `<textarea>`. Otherwise a small built-in table (`&nbsp;`, `&amp;`, `&lt;`,
+ * `&gt;`, `&quot;`, `&apos;`) plus decimal and hexadecimal numeric references
+ * are handled, and unknown named entities are left as they are. Decoding is
+ * repeated up to four times so nested encodings such as `&amp;amp;` unwrap,
+ * stopping early once a pass changes nothing.
+ *
+ * @param s - Text that may contain HTML entities.
+ * @returns The text with recognized entities replaced by their characters.
+ */
+function decodeHtmlEntitiesForSnippet(s: string): string {
+  // Numeric references in mail bodies are untrusted: String.fromCodePoint
+  // throws a RangeError above U+10FFFF. Out-of-range values, zero and
+  // surrogates become U+FFFD, as the HTML parsing rules prescribe.
+  const fromNumericReference = (digits: string, radix: number): string => {
+    const codePoint = Number.parseInt(digits, radix)
+    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff
+    if (
+      !Number.isInteger(codePoint) ||
+      codePoint <= 0 ||
+      codePoint > 0x10ffff ||
+      isSurrogate
+    ) {
+      return "\uFFFD"
+    }
+    return String.fromCodePoint(codePoint)
+  }
+
+  let out = s
+  for (let i = 0; i < 4; i++) {
+    const next = out.replace(HTML_ENTITY_RE, (entity) => {
+      if (typeof document !== "undefined") {
+        // A textarea's value is plain text, so no markup is ever interpreted.
+        const el = document.createElement("textarea")
+        el.innerHTML = entity
+        return el.value
+      }
+      const lower = entity.toLowerCase()
+      if (lower === "&nbsp;") return " "
+      if (lower === "&amp;") return "&"
+      if (lower === "&lt;") return "<"
+      if (lower === "&gt;") return ">"
+      if (lower === "&quot;") return '"'
+      if (lower === "&apos;") return "'"
+      const dec = entity.match(/^&#(\d+);$/i)
+      if (dec) return fromNumericReference(dec[1], 10)
+      const hex = entity.match(/^&#x([0-9a-f]+);$/i)
+      if (hex) return fromNumericReference(hex[1], 16)
+      return entity
+    })
+    if (next === out) break
+    out = next
+  }
+  return out
+}
+
+/**
+ * Cuts a trailing CSS/comment remnant off a snippet.
+ *
+ * The markers are tried in order. For the first one that is present and is
+ * preceded by at least 12 characters of trimmed text, that leading text is
+ * returned with trailing spaces, `/`, `*`, `-`, `_` and `|` removed. When no
+ * marker qualifies, `s` is returned unchanged.
+ */
+function stripSnippetCssTail(s: string): string {
+  for (const marker of ["/*//", "/*", "//||"]) {
+    const idx = s.indexOf(marker)
+    if (idx >= 0) {
+      const head = s.slice(0, idx).trim()
+      // A shorter head is not returned; the next marker is tried instead.
+      if (head.length >= 12) return head.replace(/[ /*\-_|]+$/g, "")
+    }
+  }
+  return s
+}
+
+/**
+ * Splits snippet text into candidate segments: CRLF is normalized to LF, the
+ * text is split into lines, each line is split on `SEPARATOR_RUN_RE`, every
+ * piece is trimmed and passed through `stripHtmlTagsForSnippet`, and pieces
+ * that end up empty are dropped.
+ */
+function splitSnippetSegments(s: string): string[] {
+  return s
+    .replace(/\r\n/g, "\n")
+    .split("\n")
+    .flatMap((line) =>
+      line
+        .split(SEPARATOR_RUN_RE)
+        .map((p) => stripHtmlTagsForSnippet(p.trim()))
+        .filter(Boolean)
+    )
+}
+
+/**
+ * Reduces HTML-ish text to plain snippet text: `<style>` elements and any
+ * remaining tags are replaced by a space, HTML entities are decoded, a
+ * trailing CSS remnant is cut off, and finally every whitespace run is
+ * collapsed to one space and the result trimmed. Because newlines are
+ * whitespace, the returned string never contains a line break.
+ */
 function stripHtmlTagsForSnippet(s: string): string {
-  const stripped = s
+  let stripped = s
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]+>/g, " ")
-    .replace(/&nbsp;/gi, " ")
-    .replace(/&amp;/gi, "&")
-    .replace(/&lt;/gi, "<")
-    .replace(/&gt;/gi, ">")
-    .replace(/&quot;/gi, '"')
-    .replace(/&#(\d+);/g, (_, code) =>
-      String.fromCodePoint(Number.parseInt(code, 10))
-    )
-    .replace(/\s+/g, " ")
-    .trim()
-  return stripped
+  stripped = decodeHtmlEntitiesForSnippet(stripped)
+  stripped = stripSnippetCssTail(stripped)
+  return stripped.replace(/\s+/g, " ").trim()
 }

 function polishSnippetPreview(snippet: string): string {
  const cleaned = stripHtmlTagsForSnippet(snippet)
-  const lines = cleaned.replace(/\r\n/g, "\n").split("\n")
-  const best = pickBestSnippetLine(lines)
+  const best = pickBestSnippetLine(splitSnippetSegments(cleaned))
  if (best) return best.length > 200 ? best.slice(0, 200) : best
  if (isSnippetBoilerplate(cleaned)) return ""
  return cleaned.length > 200 ? cleaned.slice(0, 200) : cleaned