From 69bde44b94906658c6ae26858ced5009db23e606 Mon Sep 17 00:00:00 2001 From: R3D347HR4Y Date: Thu, 4 Jun 2026 10:49:22 +0200 Subject: [PATCH] Refactor snippet processing and enhance boilerplate detection - Improved the `RepairSnippetWithBodies` function to streamline snippet rebuilding logic and reduce redundancy. - Introduced new utility functions for stripping CSS noise and decoding HTML entities in snippets. - Enhanced boilerplate detection to better identify low-quality snippets, including legal footers and view-in-browser prompts. - Added comprehensive tests for new functionality and edge cases in snippet processing. --- internal/mail/imap/body_repair.go | 23 ++-- internal/mail/imap/snippet.go | 193 +++++++++++++++++++++++++---- internal/mail/imap/snippet_test.go | 83 +++++++++++++ 3 files changed, 259 insertions(+), 40 deletions(-) diff --git a/internal/mail/imap/body_repair.go b/internal/mail/imap/body_repair.go index 54c90f8..c670f2d 100644 --- a/internal/mail/imap/body_repair.go +++ b/internal/mail/imap/body_repair.go @@ -51,12 +51,12 @@ func RepairSnippet(snippet string) string { func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string { snippet = stripSnippetMarkup(snippet) if decoded := decodeBareQuotedPrintableIfNeeded(snippet); decoded != snippet { - snippet = decoded + snippet = stripSnippetMarkup(decoded) } if decoded := decodeBareBase64IfNeeded(snippet); decoded != snippet { - snippet = decoded + snippet = stripSnippetMarkup(decoded) } - snippet = stripPlainTextPreheaderPadding(snippet) + snippet = stripSnippetMarkup(stripPlainTextPreheaderPadding(snippet)) if looksLikeRawMIME(snippet) { t, h, ok := parseEmbeddedMIME([]byte(snippet)) if ok { @@ -65,16 +65,13 @@ func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string { } bodyText, bodyHTML = RepairStoredBodies(bodyText, bodyHTML) if bodyText != "" || bodyHTML != "" { - if rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200); rebuilt != "" { - if 
SnippetLooksLowQuality(snippet) || snippet == "" { - return rebuilt - } - if SnippetLooksLowQuality(rebuilt) { - return snippet - } - if snippetLineScore(rebuilt) > snippetLineScore(snippet) { - return rebuilt - } + rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200) + storedBad := snippet == "" || SnippetLooksLowQuality(snippet) || isSnippetBoilerplate(snippet) + if rebuilt != "" && (storedBad || snippetLineScore(rebuilt) > snippetLineScore(snippet)) { + return rebuilt + } + if storedBad { + return rebuilt } } if snippet == "" { diff --git a/internal/mail/imap/snippet.go b/internal/mail/imap/snippet.go index d6445f4..7516da4 100644 --- a/internal/mail/imap/snippet.go +++ b/internal/mail/imap/snippet.go @@ -11,7 +11,12 @@ import ( "github.com/ultisuite/ulti-backend/internal/mail/sanitize" ) -var snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`) +var ( + snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`) + snippetHTMLEntityRE = regexp.MustCompile(`(?i)(?:&#x?[0-9a-f]+;|&[a-z][a-z0-9]{1,8};)`) + snippetStyleBlockRE = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`) + snippetSeparatorRunRE = regexp.MustCompile(`-{8,}|_{8,}|={8,}|\*{8,}`) +) var snippetSkipTags = map[string]bool{ "script": true, "style": true, "head": true, "noscript": true, @@ -28,13 +33,21 @@ var snippetBlockTags = map[string]bool{ func SnippetFromBodies(text, html string, maxLen int) string { candidates := snippetCandidates(text, html) best := pickBestSnippetLine(candidates) + if best == "" && strings.TrimSpace(html) != "" { + flat := stripSnippetMarkup(stripHTMLForSnippet(stripStyleBlocksFromHTML(html))) + best = pickBestSnippetLine(splitSnippetSegments(flat)) + } if best == "" { return "" } return truncate(stripSnippetMarkup(best), maxLen) } -// stripSnippetMarkup removes HTML tags and entities from preview text. 
+func stripStyleBlocksFromHTML(html string) string { + return snippetStyleBlockRE.ReplaceAllString(html, " ") +} + +// stripSnippetMarkup removes HTML tags, entities, and CSS noise from preview text. func stripSnippetMarkup(s string) string { s = strings.TrimSpace(s) if s == "" { @@ -43,10 +56,44 @@ func stripSnippetMarkup(s string) string { if snippetHTMLTagRE.MatchString(s) { s = snippetHTMLTagRE.ReplaceAllString(s, " ") } - s = stdhtml.UnescapeString(s) + s = unescapeSnippetEntities(s) + s = stripSnippetCSSTail(s) return strings.Join(strings.Fields(s), " ") } +func unescapeSnippetEntities(s string) string { + for i := 0; i < 4; i++ { + prev := s + s = stdhtml.UnescapeString(s) + s = snippetHTMLEntityRE.ReplaceAllStringFunc(s, func(entity string) string { + return stdhtml.UnescapeString(entity) + }) + if s == prev { + break + } + } + return s +} + +// stripSnippetCSSTail removes trailing CSS comment junk often leaked into stored snippets. +func stripSnippetCSSTail(s string) string { + s = strings.TrimSpace(s) + for _, marker := range []string{"/*//", "/*", "//||"} { + if idx := strings.Index(s, marker); idx >= 0 { + head := strings.TrimSpace(s[:idx]) + if len(head) >= 12 { + s = head + break + } + } + } + return strings.TrimRight(s, " /*-|_") +} + +func hasUndecodedHTMLEntities(s string) bool { + return snippetHTMLEntityRE.MatchString(s) +} + func snippetCandidates(text, html string) []string { var out []string text = strings.TrimSpace(stripPlainTextPreheaderPadding(text)) @@ -64,11 +111,13 @@ func splitSnippetSegments(s string) []string { }) var segments []string for _, line := range raw { - line = stripSnippetMarkup(line) - if line == "" { - continue + for _, part := range snippetSeparatorRunRE.Split(line, -1) { + part = stripSnippetMarkup(part) + if part == "" { + continue + } + segments = append(segments, part) } - segments = append(segments, line) } return segments } @@ -79,6 +128,7 @@ func htmlSnippetCandidates(raw string) []string { return nil } raw = 
sanitize.StripHiddenEmailHTML(raw) + raw = stripStyleBlocksFromHTML(raw) doc, err := html.Parse(strings.NewReader(raw)) if err != nil { if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" { @@ -164,6 +214,9 @@ func pickBestSnippetLine(candidates []string) string { } func snippetLineScore(s string) int { + if isSnippetBoilerplate(s) { + return 0 + } letters := 0 for _, r := range s { if unicode.IsLetter(r) { @@ -180,6 +233,9 @@ func snippetLineScore(s string) int { if len(s) >= 280 { score += 10 } + if strings.Contains(strings.ToLower(s), "http://") || strings.Contains(strings.ToLower(s), "https://") { + score -= 30 + } return score } @@ -188,18 +244,17 @@ func isSnippetBoilerplate(s string) bool { if s == "" || len(s) < 4 { return true } - lower := strings.ToLower(s) if looksLikeCSSSnippet(s) { return true } - if isMostlySeparatorLine(s) { + if isMostlySeparatorLine(s) || hasLeadingSeparatorRun(s) { return true } + if isViewInBrowserSnippet(s) || isSnippetLegalFooter(s) { + return true + } + lower := strings.ToLower(s) boilerplate := []string{ - "afficher dans le navigateur", - "view in browser", - "view this email in your browser", - "voir ce message en ligne", "si vous ne visualisez pas", "si vous n'arrivez pas à lire", "si vous n'arrivez pas a lire", @@ -214,39 +269,116 @@ func isSnippetBoilerplate(s string) bool { "gérer vos préférences", } for _, phrase := range boilerplate { - if strings.Contains(lower, phrase) && len(s) < 160 { + if strings.Contains(lower, phrase) && len(s) < 200 { return true } } if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") { - return len(s) < 100 - } - if strings.HasPrefix(s, "[") && strings.HasSuffix(s, "]") && len(s) < 80 { - return true + return len(s) < 120 } letterRatio := snippetLetterRatio(s) return letterRatio < 0.35 } +func normalizeSnippetMatchText(s string) string { + s = strings.ToLower(s) + s = strings.NewReplacer("[", " ", "]", " ", "(", " ", ")", " ", "·", " ").Replace(s) + 
return strings.Join(strings.Fields(s), " ") +} + +func isViewInBrowserSnippet(s string) bool { + norm := normalizeSnippetMatchText(s) + phrases := []string{ + "afficher dans le navigateur", + "view in browser", + "view this email in your browser", + "voir ce message en ligne", + "voir la version en ligne", + "version en ligne", + } + for _, phrase := range phrases { + if !strings.Contains(norm, phrase) { + continue + } + rest := strings.TrimSpace(strings.ReplaceAll(norm, phrase, "")) + if len(rest) <= 20 || len(s) <= 90 { + return true + } + } + return false +} + +func isSnippetLegalFooter(s string) bool { + lower := strings.ToLower(s) + if !strings.Contains(lower, "http") && !strings.Contains(lower, "www.") { + return false + } + markers := []string{ + "sas ", "sarl ", " sa ", " sas.", "rue ", " bp ", "rcs ", + "kellermann", "ovh.com", "www.ovh", + } + hits := 0 + for _, m := range markers { + if strings.Contains(lower, m) { + hits++ + } + } + if hits >= 2 { + return true + } + if strings.Contains(lower, "https://") && (strings.Contains(lower, "rue ") || strings.Contains(lower, " bp ")) { + return true + } + return strings.HasPrefix(lower, "sas ") && strings.Contains(lower, "http") +} + +func hasLeadingSeparatorRun(s string) bool { + trimmed := strings.TrimSpace(s) + if len(trimmed) < 12 { + return false + } + run := 0 + for _, r := range trimmed { + switch r { + case '-', '_', '*', '=', '·', '—': + run++ + default: + goto done + } + } +done: + return run >= 12 +} + func looksLikeCSSSnippet(s string) bool { lower := strings.ToLower(s) if strings.Contains(lower, ":root") || strings.Contains(lower, "color-scheme:") || strings.Contains(lower, "@media") || + strings.Contains(lower, "@font-face") || + strings.Contains(lower, "font-family:") || strings.Contains(lower, "