Enhance snippet processing in mail MIME body handling
Some checks failed
E2E / Playwright e2e (push) Has been cancelled
Some checks failed
E2E / Playwright e2e (push) Has been cancelled
- Introduced functions to normalize and analyze snippet text, including checks for legal footers, leading separators, and boilerplate detection.
- Added HTML entity decoding and improved CSS snippet detection.
- Updated the snippet selection logic to better filter and polish previews, ensuring cleaner output for email snippets.
This commit is contained in:
parent
8a02c10ba3
commit
a4b548ca08
@ -213,36 +213,110 @@ export function repairMimeBodies(
|
||||
}
|
||||
}
|
||||
|
||||
// Matches a run of eight or more identical separator characters: "-", "_", "=" or "*".
// splitSnippetSegments uses it to cut a line into segments at each such run.
const SEPARATOR_RUN_RE = /-{8,}|_{8,}|={8,}|\*{8,}/g
|
||||
|
||||
/**
 * Normalizes text for phrase matching.
 *
 * Lower-cases the input, replaces square brackets, parentheses and middle dots
 * with spaces, collapses every whitespace run to a single space and trims.
 *
 * @param s Raw snippet text.
 * @returns The normalized text.
 */
function normalizeSnippetMatchText(s: string): string {
  return s
    .toLowerCase()
    .replace(/[[\]()·]/g, " ")
    .replace(/\s+/g, " ")
    .trim()
}
|
||||
|
||||
/**
 * Detects "view this e-mail in your browser" style lines (French and English).
 *
 * The text is normalized with normalizeSnippetMatchText first. A known phrase
 * counts as a hit when either the normalized text left after removing its
 * first occurrence is at most 20 characters long, or the raw input is at most
 * 90 characters long.
 *
 * @param s Candidate snippet text.
 * @returns true when the text looks like a "view in browser" line.
 */
function isViewInBrowserSnippet(s: string): boolean {
  const norm = normalizeSnippetMatchText(s)
  const phrases = [
    "afficher dans le navigateur",
    "view in browser",
    "voir ce message en ligne",
    "version en ligne",
  ]
  return phrases.some((phrase) => {
    if (!norm.includes(phrase)) return false
    // Only the first occurrence is removed before measuring what is left.
    const rest = norm.replace(phrase, "").trim()
    return rest.length <= 20 || s.length <= 90
  })
}
|
||||
|
||||
/**
 * Detects a company legal footer (address / legal-form line carrying a link).
 *
 * The text must contain "http" or "www." (case-insensitive). It is then a
 * footer when at least two of the known markers occur in it, or when it
 * starts with "sas " and contains "http".
 *
 * @param s Candidate snippet text.
 * @returns true when the text looks like a legal footer.
 */
function isSnippetLegalFooter(s: string): boolean {
  const lower = s.toLowerCase()
  if (!lower.includes("http") && !lower.includes("www.")) return false

  // Plain substring markers; matching is not word-boundary aware.
  const markers = ["sas ", "sarl ", "rue ", " bp ", "kellermann", "ovh.com", "www.ovh"]
  const hits = markers.filter((marker) => lower.includes(marker)).length
  if (hits >= 2) return true

  return lower.startsWith("sas ") && lower.includes("http")
}
|
||||
|
||||
/**
 * Tells whether the text, after skipping leading whitespace, opens with at
 * least 12 consecutive separator characters.
 *
 * @param s Candidate snippet text.
 * @returns true when the leading separator run is 12 characters or longer.
 */
function hasLeadingSeparatorRun(s: string): boolean {
  const separators = "-_*='=·—"
  let run = 0
  // for...of walks code points, so each character is counted once.
  for (const ch of s.trimStart()) {
    if (!separators.includes(ch)) break
    run++
  }
  return run >= 12
}
|
||||
|
||||
/**
 * Heuristically detects stylesheet text that leaked into a snippet.
 *
 * Returns true when any of the following holds:
 * - a CSS-only token is present (":root", "color-scheme:", "@media",
 *   "@font-face", "font-family:", "/*//" or "||//", case-insensitive);
 * - the text has both braces and at least two semicolons;
 * - the text mentions "meta for business" and has an opening brace;
 * - the text has both braces, a semicolon and a "font-", "margin:" or
 *   "padding:" declaration;
 * - the text starts (after optional whitespace) with a block-comment opener.
 *
 * @param s Candidate snippet text.
 * @returns true when the text looks like CSS.
 */
function looksLikeCssSnippet(s: string): boolean {
  const lower = s.toLowerCase()
  const hasBraces = s.includes("{") && s.includes("}")

  const cssTokens = [
    ":root",
    "color-scheme:",
    "@media",
    "@font-face",
    "font-family:",
    "/*//",
    "||//",
  ]
  if (cssTokens.some((token) => lower.includes(token))) return true

  // split(";") yielding 3+ parts means at least two semicolons.
  if (hasBraces && s.split(";").length >= 3) return true

  if (lower.includes("meta for business") && s.includes("{")) return true

  const hasBoxOrFontDeclaration =
    lower.includes("font-") || lower.includes("margin:") || lower.includes("padding:")
  if (hasBraces && s.includes(";") && hasBoxOrFontDeclaration) return true

  return /^\s*\/\*/.test(s)
}
|
||||
|
||||
/**
 * Tells whether the text still contains an HTML character reference
 * (anything HTML_ENTITY_RE matches).
 *
 * HTML_ENTITY_RE carries the `g` flag, so calling `.test()` on it would
 * advance its `lastIndex` and make the next call start mid-string, producing
 * false negatives. `String.prototype.search` always scans from index 0 and
 * leaves `lastIndex` untouched, so the result depends only on the input.
 *
 * @param s Text to inspect.
 * @returns true when at least one character reference is present.
 */
function hasUndecodedHtmlEntities(s: string): boolean {
  return s.search(HTML_ENTITY_RE) !== -1
}
|
||||
|
||||
/**
 * Tells whether a line is made up mostly of separator characters.
 *
 * Lines shorter than 8 UTF-16 units are never separator lines. Otherwise the
 * line qualifies when separator characters ("-", "_", "*", "=", "·", "—",
 * "|") make up at least 55% of its code points.
 *
 * @param s Candidate line.
 * @returns true when the separator ratio is 0.55 or higher.
 */
function isMostlySeparatorLine(s: string): boolean {
  if (s.length < 8) return false
  const separatorCount = (s.match(/[-_*=·—|]/g) ?? []).length
  // Spread counts code points so astral characters are not double-counted.
  return separatorCount / [...s].length >= 0.55
}
|
||||
|
||||
/**
 * Decides whether a snippet candidate is boilerplate that should not be shown
 * as a preview.
 *
 * The input is trimmed and passed through stripHtmlTagsForSnippet; the checks
 * then run in order and the first hit returns true:
 * 1. the cleaned text is empty or shorter than 4 characters;
 * 2. it looks like CSS, is mostly separators, or opens with a separator run;
 * 3. it is a "view in browser" line or a legal footer;
 * 4. the raw input still contains an HTML tag or an HTML character reference;
 * 5. it contains a "click here" style phrase and is shorter than 200 characters;
 * 6. it starts with an http(s) URL and is shorter than 120 characters;
 * 7. fewer than 35% of its code points are letters or digits.
 *
 * @param s Raw snippet candidate.
 * @returns true when the candidate is boilerplate.
 */
function isSnippetBoilerplate(s: string): boolean {
  const t = stripHtmlTagsForSnippet(s.trim())
  if (!t || t.length < 4) return true
  if (looksLikeCssSnippet(t) || isMostlySeparatorLine(t) || hasLeadingSeparatorRun(t)) {
    return true
  }
  if (isViewInBrowserSnippet(t) || isSnippetLegalFooter(t)) return true

  // These two checks deliberately look at the raw input, not the cleaned text.
  if (/<[^>]+>/.test(s)) return true
  if (hasUndecodedHtmlEntities(s)) return true

  const lower = t.toLowerCase()
  const phrases = ["si vous ne visualisez pas", "cliquer ici", "click here"]
  if (phrases.some((p) => lower.includes(p)) && t.length < 200) return true

  const startsWithUrl = lower.startsWith("http://") || lower.startsWith("https://")
  if (startsWithUrl && t.length < 120) return true

  // t is non-empty here, so the division is safe.
  const letters = (t.match(/\p{L}|\p{N}/gu) ?? []).length
  return letters / [...t].length < 0.35
}
|
||||
@ -251,7 +325,7 @@ function pickBestSnippetLine(lines: string[]): string {
|
||||
let best = ""
|
||||
let bestScore = -1
|
||||
for (const line of lines) {
|
||||
const t = line.trim()
|
||||
const t = stripHtmlTagsForSnippet(line.trim())
|
||||
if (!t || isSnippetBoilerplate(t)) continue
|
||||
const letters = (t.match(/\p{L}/gu) ?? []).length
|
||||
if (letters < 8) continue
|
||||
@ -265,27 +339,72 @@ function pickBestSnippetLine(lines: string[]): string {
|
||||
return best
|
||||
}
|
||||
|
||||
/**
 * Matches one HTML character reference: numeric (`&#233;`, `&#xE9;`) or named
 * (a letter followed by 1-8 letters/digits), case-insensitive.
 *
 * The `g` flag is needed for `replace` to rewrite every reference, but it
 * also makes `test`/`exec` on this shared object stateful through `lastIndex`.
 */
const HTML_ENTITY_RE = /&(?:#x?[0-9a-f]+|[a-z][a-z0-9]{1,8});/gi

/**
 * Decodes HTML character references in snippet text.
 *
 * Decoding is repeated up to four times, stopping as soon as a pass changes
 * nothing, so multiply-encoded text such as `&amp;lt;` ends up as `<`.
 * When a DOM is available each reference is decoded through a detached
 * `<textarea>`. Otherwise a small built-in table plus decimal/hexadecimal
 * numeric references are handled and anything unknown is left as is.
 *
 * @param s Text that may contain character references.
 * @returns The text with recognized references replaced.
 */
function decodeHtmlEntitiesForSnippet(s: string): string {
  // Named references understood without a DOM, keyed by lower-cased text.
  const named = new Map<string, string>([
    ["&nbsp;", " "],
    ["&amp;", "&"],
    ["&lt;", "<"],
    ["&gt;", ">"],
    ["&quot;", '"'],
    ["&apos;", "'"],
  ])

  // String.fromCodePoint throws a RangeError above U+10FFFF; keep the
  // original reference text instead of crashing on hostile input.
  const fromNumeric = (digits: string, radix: number, fallback: string): string => {
    const codePoint = Number.parseInt(digits, radix)
    return Number.isInteger(codePoint) && codePoint <= 0x10ffff
      ? String.fromCodePoint(codePoint)
      : fallback
  }

  const decodeOne = (entity: string): string => {
    if (typeof document !== "undefined") {
      const el = document.createElement("textarea")
      el.innerHTML = entity
      return el.value
    }

    const known = named.get(entity.toLowerCase())
    if (known !== undefined) return known

    const decDigits = /^&#(\d+);$/.exec(entity)?.[1]
    if (decDigits !== undefined) return fromNumeric(decDigits, 10, entity)

    const hexDigits = /^&#x([0-9a-f]+);$/i.exec(entity)?.[1]
    if (hexDigits !== undefined) return fromNumeric(hexDigits, 16, entity)

    return entity
  }

  let out = s
  for (let pass = 0; pass < 4; pass++) {
    const next = out.replace(HTML_ENTITY_RE, decodeOne)
    if (next === out) break
    out = next
  }
  return out
}
|
||||
|
||||
/**
 * Cuts off a trailing CSS/comment fragment from snippet text.
 *
 * The markers "/*//", "/*" and "//||" are tried in that order. For the first
 * marker whose preceding text (trimmed) is at least 12 characters long, that
 * preceding text is returned with trailing spaces, slashes, asterisks,
 * dashes, underscores and pipes removed. If no marker qualifies the input is
 * returned unchanged.
 *
 * @param s Snippet text.
 * @returns The text without the CSS tail, or the input itself.
 */
function stripSnippetCssTail(s: string): string {
  for (const marker of ["/*//", "/*", "//||"]) {
    const idx = s.indexOf(marker)
    if (idx < 0) continue

    const head = s.slice(0, idx).trim()
    // Too little real text before the marker: leave it for other filters.
    if (head.length < 12) continue

    return head.replace(/[ /*\-_|]+$/g, "")
  }
  return s
}
|
||||
|
||||
/**
 * Splits snippet text into candidate segments.
 *
 * CRLF line endings are normalized to LF, the text is split into lines, and
 * each line is split again at every match of SEPARATOR_RUN_RE. Each piece is
 * trimmed and cleaned with stripHtmlTagsForSnippet; pieces that end up empty
 * are dropped.
 *
 * @param s Snippet text, possibly multi-line.
 * @returns The non-empty cleaned segments in their original order.
 */
function splitSnippetSegments(s: string): string[] {
  const lines = s.replace(/\r\n/g, "\n").split("\n")
  return lines.flatMap((line) =>
    line
      .split(SEPARATOR_RUN_RE)
      .map((piece) => stripHtmlTagsForSnippet(piece.trim()))
      .filter(Boolean)
  )
}
|
||||
|
||||
/**
 * Reduces an HTML-ish fragment to plain single-line text for a snippet.
 *
 * Steps, in order: drop `<style>` elements with their content, replace every
 * remaining tag with a space, decode the common named references and decimal
 * numeric references, collapse whitespace, run decodeHtmlEntitiesForSnippet
 * for anything still encoded, cut a trailing CSS fragment with
 * stripSnippetCssTail, then collapse whitespace and trim once more.
 *
 * @param s Raw snippet text, possibly containing HTML.
 * @returns The cleaned plain text.
 */
function stripHtmlTagsForSnippet(s: string): string {
  let stripped = s
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;/gi, " ")
    .replace(/&amp;/gi, "&")
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&quot;/gi, '"')
    .replace(/&#(\d+);/g, (match, code: string) => {
      // String.fromCodePoint throws above U+10FFFF; keep such references as is.
      const codePoint = Number.parseInt(code, 10)
      return Number.isInteger(codePoint) && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : match
    })
    .replace(/\s+/g, " ")
    .trim()

  stripped = decodeHtmlEntitiesForSnippet(stripped)
  stripped = stripSnippetCssTail(stripped)
  return stripped.replace(/\s+/g, " ").trim()
}
|
||||
|
||||
function polishSnippetPreview(snippet: string): string {
|
||||
const cleaned = stripHtmlTagsForSnippet(snippet)
|
||||
const lines = cleaned.replace(/\r\n/g, "\n").split("\n")
|
||||
const best = pickBestSnippetLine(lines)
|
||||
const best = pickBestSnippetLine(splitSnippetSegments(cleaned))
|
||||
if (best) return best.length > 200 ? best.slice(0, 200) : best
|
||||
if (isSnippetBoilerplate(cleaned)) return ""
|
||||
return cleaned.length > 200 ? cleaned.slice(0, 200) : cleaned
|
||||
|
||||
Loading…
Reference in New Issue
Block a user