Refactor snippet processing and enhance boilerplate detection

- Improved the `RepairSnippetWithBodies` function to streamline snippet rebuilding logic and reduce redundancy.
- Introduced new utility functions for stripping CSS noise and decoding HTML entities in snippets.
- Enhanced boilerplate detection to better identify low-quality snippets, including legal footers and view-in-browser prompts.
- Added comprehensive tests for new functionality and edge cases in snippet processing.
This commit is contained in:
R3D347HR4Y 2026-06-04 10:49:22 +02:00
parent 25d3ac4cd9
commit 69bde44b94
3 changed files with 259 additions and 40 deletions

View File

@ -51,12 +51,12 @@ func RepairSnippet(snippet string) string {
func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string { func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string {
snippet = stripSnippetMarkup(snippet) snippet = stripSnippetMarkup(snippet)
if decoded := decodeBareQuotedPrintableIfNeeded(snippet); decoded != snippet { if decoded := decodeBareQuotedPrintableIfNeeded(snippet); decoded != snippet {
snippet = decoded snippet = stripSnippetMarkup(decoded)
} }
if decoded := decodeBareBase64IfNeeded(snippet); decoded != snippet { if decoded := decodeBareBase64IfNeeded(snippet); decoded != snippet {
snippet = decoded snippet = stripSnippetMarkup(decoded)
} }
snippet = stripPlainTextPreheaderPadding(snippet) snippet = stripSnippetMarkup(stripPlainTextPreheaderPadding(snippet))
if looksLikeRawMIME(snippet) { if looksLikeRawMIME(snippet) {
t, h, ok := parseEmbeddedMIME([]byte(snippet)) t, h, ok := parseEmbeddedMIME([]byte(snippet))
if ok { if ok {
@ -65,18 +65,15 @@ func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string {
} }
bodyText, bodyHTML = RepairStoredBodies(bodyText, bodyHTML) bodyText, bodyHTML = RepairStoredBodies(bodyText, bodyHTML)
if bodyText != "" || bodyHTML != "" { if bodyText != "" || bodyHTML != "" {
if rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200); rebuilt != "" { rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200)
if SnippetLooksLowQuality(snippet) || snippet == "" { storedBad := snippet == "" || SnippetLooksLowQuality(snippet) || isSnippetBoilerplate(snippet)
if rebuilt != "" && (storedBad || snippetLineScore(rebuilt) > snippetLineScore(snippet)) {
return rebuilt return rebuilt
} }
if SnippetLooksLowQuality(rebuilt) { if storedBad {
return snippet
}
if snippetLineScore(rebuilt) > snippetLineScore(snippet) {
return rebuilt return rebuilt
} }
} }
}
if snippet == "" { if snippet == "" {
return "" return ""
} }

View File

@ -11,7 +11,12 @@ import (
"github.com/ultisuite/ulti-backend/internal/mail/sanitize" "github.com/ultisuite/ulti-backend/internal/mail/sanitize"
) )
var snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`) var (
snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`)
snippetHTMLEntityRE = regexp.MustCompile(`(?i)(?:&#x?[0-9a-f]+;|&[a-z][a-z0-9]{1,8};)`)
snippetStyleBlockRE = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
snippetSeparatorRunRE = regexp.MustCompile(`-{8,}|_{8,}|={8,}|\*{8,}`)
)
var snippetSkipTags = map[string]bool{ var snippetSkipTags = map[string]bool{
"script": true, "style": true, "head": true, "noscript": true, "script": true, "style": true, "head": true, "noscript": true,
@ -28,13 +33,21 @@ var snippetBlockTags = map[string]bool{
func SnippetFromBodies(text, html string, maxLen int) string { func SnippetFromBodies(text, html string, maxLen int) string {
candidates := snippetCandidates(text, html) candidates := snippetCandidates(text, html)
best := pickBestSnippetLine(candidates) best := pickBestSnippetLine(candidates)
if best == "" && strings.TrimSpace(html) != "" {
flat := stripSnippetMarkup(stripHTMLForSnippet(stripStyleBlocksFromHTML(html)))
best = pickBestSnippetLine(splitSnippetSegments(flat))
}
if best == "" { if best == "" {
return "" return ""
} }
return truncate(stripSnippetMarkup(best), maxLen) return truncate(stripSnippetMarkup(best), maxLen)
} }
// stripSnippetMarkup removes HTML tags and entities from preview text. func stripStyleBlocksFromHTML(html string) string {
return snippetStyleBlockRE.ReplaceAllString(html, " ")
}
// stripSnippetMarkup removes HTML tags, entities, and CSS noise from preview text.
func stripSnippetMarkup(s string) string { func stripSnippetMarkup(s string) string {
s = strings.TrimSpace(s) s = strings.TrimSpace(s)
if s == "" { if s == "" {
@ -43,10 +56,44 @@ func stripSnippetMarkup(s string) string {
if snippetHTMLTagRE.MatchString(s) { if snippetHTMLTagRE.MatchString(s) {
s = snippetHTMLTagRE.ReplaceAllString(s, " ") s = snippetHTMLTagRE.ReplaceAllString(s, " ")
} }
s = stdhtml.UnescapeString(s) s = unescapeSnippetEntities(s)
s = stripSnippetCSSTail(s)
return strings.Join(strings.Fields(s), " ") return strings.Join(strings.Fields(s), " ")
} }
// unescapeSnippetEntities decodes HTML entities in s, repeating the decode so
// that multiply-escaped input (for example "&amp;#39;") is resolved as well.
//
// Each round unescapes the whole string and then unescapes every span that
// still matches snippetHTMLEntityRE. The loop ends as soon as a round leaves
// the text unchanged and never runs more than four rounds.
func unescapeSnippetEntities(s string) string {
	const maxRounds = 4
	for round := 0; round < maxRounds; round++ {
		before := s
		decoded := stdhtml.UnescapeString(s)
		s = snippetHTMLEntityRE.ReplaceAllStringFunc(decoded, stdhtml.UnescapeString)
		if s == before {
			// Fixed point reached: nothing left to decode.
			return s
		}
	}
	return s
}
// stripSnippetCSSTail cuts leaked CSS-comment junk off the end of a stored snippet.
//
// The markers "/*//", "/*" and "//||" are tried in that order. The text is cut
// at the first occurrence of the first marker that leaves at least 12 bytes of
// trimmed text in front of it; a marker with a shorter head is skipped and the
// next marker is tried. Finally, trailing spaces and the characters '/', '*',
// '-', '|' and '_' are trimmed from the result.
func stripSnippetCSSTail(s string) string {
	const minHeadLen = 12
	cleaned := strings.TrimSpace(s)
	markers := [...]string{"/*//", "/*", "//||"}
	for i := 0; i < len(markers); i++ {
		pos := strings.Index(cleaned, markers[i])
		if pos < 0 {
			continue
		}
		kept := strings.TrimSpace(cleaned[:pos])
		if len(kept) < minHeadLen {
			// Too little real text before the marker; leave the snippet alone.
			continue
		}
		cleaned = kept
		break
	}
	return strings.TrimRight(cleaned, " /*-|_")
}
// hasUndecodedHTMLEntities reports whether s contains at least one span that
// matches snippetHTMLEntityRE, i.e. an HTML entity that is still encoded.
func hasUndecodedHTMLEntities(s string) bool {
	return snippetHTMLEntityRE.FindStringIndex(s) != nil
}
func snippetCandidates(text, html string) []string { func snippetCandidates(text, html string) []string {
var out []string var out []string
text = strings.TrimSpace(stripPlainTextPreheaderPadding(text)) text = strings.TrimSpace(stripPlainTextPreheaderPadding(text))
@ -64,11 +111,13 @@ func splitSnippetSegments(s string) []string {
}) })
var segments []string var segments []string
for _, line := range raw { for _, line := range raw {
line = stripSnippetMarkup(line) for _, part := range snippetSeparatorRunRE.Split(line, -1) {
if line == "" { part = stripSnippetMarkup(part)
if part == "" {
continue continue
} }
segments = append(segments, line) segments = append(segments, part)
}
} }
return segments return segments
} }
@ -79,6 +128,7 @@ func htmlSnippetCandidates(raw string) []string {
return nil return nil
} }
raw = sanitize.StripHiddenEmailHTML(raw) raw = sanitize.StripHiddenEmailHTML(raw)
raw = stripStyleBlocksFromHTML(raw)
doc, err := html.Parse(strings.NewReader(raw)) doc, err := html.Parse(strings.NewReader(raw))
if err != nil { if err != nil {
if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" { if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" {
@ -164,6 +214,9 @@ func pickBestSnippetLine(candidates []string) string {
} }
func snippetLineScore(s string) int { func snippetLineScore(s string) int {
if isSnippetBoilerplate(s) {
return 0
}
letters := 0 letters := 0
for _, r := range s { for _, r := range s {
if unicode.IsLetter(r) { if unicode.IsLetter(r) {
@ -180,6 +233,9 @@ func snippetLineScore(s string) int {
if len(s) >= 280 { if len(s) >= 280 {
score += 10 score += 10
} }
if strings.Contains(strings.ToLower(s), "http://") || strings.Contains(strings.ToLower(s), "https://") {
score -= 30
}
return score return score
} }
@ -188,18 +244,17 @@ func isSnippetBoilerplate(s string) bool {
if s == "" || len(s) < 4 { if s == "" || len(s) < 4 {
return true return true
} }
lower := strings.ToLower(s)
if looksLikeCSSSnippet(s) { if looksLikeCSSSnippet(s) {
return true return true
} }
if isMostlySeparatorLine(s) { if isMostlySeparatorLine(s) || hasLeadingSeparatorRun(s) {
return true return true
} }
if isViewInBrowserSnippet(s) || isSnippetLegalFooter(s) {
return true
}
lower := strings.ToLower(s)
boilerplate := []string{ boilerplate := []string{
"afficher dans le navigateur",
"view in browser",
"view this email in your browser",
"voir ce message en ligne",
"si vous ne visualisez pas", "si vous ne visualisez pas",
"si vous n'arrivez pas à lire", "si vous n'arrivez pas à lire",
"si vous n'arrivez pas a lire", "si vous n'arrivez pas a lire",
@ -214,40 +269,117 @@ func isSnippetBoilerplate(s string) bool {
"gérer vos préférences", "gérer vos préférences",
} }
for _, phrase := range boilerplate { for _, phrase := range boilerplate {
if strings.Contains(lower, phrase) && len(s) < 160 { if strings.Contains(lower, phrase) && len(s) < 200 {
return true return true
} }
} }
if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") { if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
return len(s) < 100 return len(s) < 120
}
if strings.HasPrefix(s, "[") && strings.HasSuffix(s, "]") && len(s) < 80 {
return true
} }
letterRatio := snippetLetterRatio(s) letterRatio := snippetLetterRatio(s)
return letterRatio < 0.35 return letterRatio < 0.35
} }
// normalizeSnippetMatchText prepares s for phrase matching. It lower-cases the
// text, treats '[', ']', '(', ')' and '·' as word breaks alongside whitespace,
// and joins the remaining words with single spaces.
func normalizeSnippetMatchText(s string) string {
	isBreak := func(r rune) bool {
		switch r {
		case '[', ']', '(', ')', '·':
			return true
		}
		return unicode.IsSpace(r)
	}
	words := strings.FieldsFunc(strings.ToLower(s), isBreak)
	return strings.Join(words, " ")
}
// isViewInBrowserSnippet reports whether s is essentially a "view this email
// in your browser" prompt (English or French).
//
// A snippet qualifies when its normalized text contains one of the known
// prompts and either the original snippet is at most 90 bytes long, or at most
// 20 bytes of normalized text remain once every occurrence of that prompt is
// removed.
func isViewInBrowserSnippet(s string) bool {
	normalized := normalizeSnippetMatchText(s)
	shortSnippet := len(s) <= 90
	prompts := [...]string{
		"afficher dans le navigateur",
		"view in browser",
		"view this email in your browser",
		"voir ce message en ligne",
		"voir la version en ligne",
		"version en ligne",
	}
	for i := range prompts {
		if !strings.Contains(normalized, prompts[i]) {
			continue
		}
		if shortSnippet {
			return true
		}
		leftover := strings.TrimSpace(strings.ReplaceAll(normalized, prompts[i], ""))
		if len(leftover) <= 20 {
			return true
		}
	}
	return false
}
// isSnippetLegalFooter reports whether s looks like a company legal footer
// (company form, street address, registration number) rather than content.
//
// Matching is done by plain substring checks on the lower-cased snippet. The
// snippet must mention "http" or "www."; it is then treated as a footer when
// at least two footer markers are present, when an "https://" link appears
// together with "rue " or " bp ", or when it starts with "sas " and contains
// "http".
func isSnippetLegalFooter(s string) bool {
	text := strings.ToLower(s)
	has := func(sub string) bool { return strings.Contains(text, sub) }

	// Without any link there is nothing to classify as a footer.
	if !(has("http") || has("www.")) {
		return false
	}

	matched := 0
	for _, marker := range [...]string{
		"sas ", "sarl ", " sa ", " sas.", "rue ", " bp ", "rcs ",
		"kellermann", "ovh.com", "www.ovh",
	} {
		if has(marker) {
			matched++
		}
	}

	switch {
	case matched >= 2:
		return true
	case has("https://") && (has("rue ") || has(" bp ")):
		return true
	default:
		return strings.HasPrefix(text, "sas ") && has("http")
	}
}
// hasLeadingSeparatorRun reports whether s, after trimming surrounding
// whitespace, begins with an unbroken run of at least 12 separator runes
// ('-', '_', '*', '=', '·' or '—').
func hasLeadingSeparatorRun(s string) bool {
	const minRun = 12
	body := strings.TrimSpace(s)
	// A run of minRun runes needs at least minRun bytes, so shorter input can
	// be rejected without scanning.
	if len(body) < minRun {
		return false
	}
	count := 0
	for _, r := range body {
		if !strings.ContainsRune("-_*=·—", r) {
			break
		}
		count++
	}
	return count >= minRun
}
func looksLikeCSSSnippet(s string) bool { func looksLikeCSSSnippet(s string) bool {
lower := strings.ToLower(s) lower := strings.ToLower(s)
if strings.Contains(lower, ":root") || if strings.Contains(lower, ":root") ||
strings.Contains(lower, "color-scheme:") || strings.Contains(lower, "color-scheme:") ||
strings.Contains(lower, "@media") || strings.Contains(lower, "@media") ||
strings.Contains(lower, "@font-face") ||
strings.Contains(lower, "font-family:") ||
strings.Contains(lower, "<!--") { strings.Contains(lower, "<!--") {
return true return true
} }
if strings.Contains(lower, "facebook") && strings.Contains(lower, ":root") {
return true
}
if strings.Contains(lower, "meta for business") && strings.Contains(s, "{") {
return true
}
if strings.HasPrefix(strings.TrimSpace(s), "/*") { if strings.HasPrefix(strings.TrimSpace(s), "/*") {
return true return true
} }
semis := strings.Count(s, ";") if strings.Contains(s, "/*//") || strings.Contains(s, "||//") || strings.Contains(s, "//||") {
braces := strings.Count(s, "{") + strings.Count(s, "}")
if braces >= 2 && semis >= 2 {
return true return true
} }
if strings.Contains(s, "{") && strings.Contains(s, "}") && semis := strings.Count(s, ";")
(strings.Contains(lower, "font-") || strings.Contains(lower, "margin:") || strings.Contains(lower, "padding:")) { braces := strings.Count(s, "{") + strings.Count(s, "}")
if braces >= 1 && semis >= 1 && strings.Contains(s, "{") && strings.Contains(s, "}") {
if strings.Contains(lower, "font-") || strings.Contains(lower, "margin:") ||
strings.Contains(lower, "padding:") || strings.Contains(lower, "font-family") {
return true return true
} }
}
return false return false
} }
@ -258,11 +390,15 @@ func isMostlySeparatorLine(s string) bool {
sep := 0 sep := 0
for _, r := range s { for _, r := range s {
switch r { switch r {
case '-', '_', '*', '=', '·', '—': case '-', '_', '*', '=', '·', '—', '|':
sep++ sep++
} }
} }
return float64(sep)/float64(len(s)) >= 0.6 runes := len([]rune(s))
if runes == 0 {
return false
}
return float64(sep)/float64(runes) >= 0.55
} }
func snippetLetterRatio(s string) float64 { func snippetLetterRatio(s string) float64 {
@ -284,5 +420,8 @@ func SnippetLooksLowQuality(snippet string) bool {
if snippet == "" { if snippet == "" {
return true return true
} }
return isSnippetBoilerplate(snippet) || looksLikeCSSSnippet(snippet) || snippetHTMLTagRE.MatchString(snippet) return isSnippetBoilerplate(snippet) ||
looksLikeCSSSnippet(snippet) ||
snippetHTMLTagRE.MatchString(snippet) ||
hasUndecodedHTMLEntities(snippet)
} }

View File

@ -32,6 +32,18 @@ func TestSnippetFromBodies_skipsViewInBrowser(t *testing.T) {
} }
} }
// TestSnippetFromBodies_skipsViewInBrowserBracketedStored checks that a stored
// bracketed "view in browser" prompt is dropped and the repaired snippet
// carries text from the HTML body instead.
func TestSnippetFromBodies_skipsViewInBrowserBracketedStored(t *testing.T) {
	const (
		storedSnippet = "[ Afficher dans le navigateur ]..."
		bodyHTML      = `<html><body><p>Contenu utile sur le webinar matériaux.</p></body></html>`
	)
	repaired := RepairSnippetWithBodies(storedSnippet, "", bodyHTML)
	switch {
	case strings.Contains(strings.ToLower(repaired), "afficher dans le navigateur"):
		t.Fatalf("snippet = %q", repaired)
	case !strings.Contains(repaired, "webinar"):
		t.Fatalf("snippet = %q, want body content", repaired)
	}
}
func TestSnippetFromBodies_skipsSeparatorLine(t *testing.T) { func TestSnippetFromBodies_skipsSeparatorLine(t *testing.T) {
text := "----------------------------------------------------------------\nUn festival rétro au Château de Tilloloy arrive cet été." text := "----------------------------------------------------------------\nUn festival rétro au Château de Tilloloy arrive cet été."
got := SnippetFromBodies(text, "", 200) got := SnippetFromBodies(text, "", 200)
@ -43,6 +55,42 @@ func TestSnippetFromBodies_skipsSeparatorLine(t *testing.T) {
} }
} }
// TestSnippetFromBodies_skipsSeparatorOnlyLine checks that a stored snippet
// made of a separator run is replaced by text from the HTML body.
func TestSnippetFromBodies_skipsSeparatorOnlyLine(t *testing.T) {
	const (
		storedSnippet = "--------------------------------------- * U..."
		bodyHTML      = `<html><body><p>Un festival rétro au Château de Tilloloy arrive cet été.</p></body></html>`
	)
	repaired := RepairSnippetWithBodies(storedSnippet, "", bodyHTML)
	if hasSeparator := strings.Contains(repaired, "---"); hasSeparator {
		t.Fatalf("snippet = %q, want no separator line", repaired)
	}
	if hasContent := strings.Contains(repaired, "festival"); !hasContent {
		t.Fatalf("snippet = %q", repaired)
	}
}
// TestSnippetFromBodies_skipsOVHFooter checks that a stored legal-footer
// snippet (company name, URL and postal address) is replaced by text from the
// HTML body.
func TestSnippetFromBodies_skipsOVHFooter(t *testing.T) {
	const (
		storedSnippet = "SAS OVH - https://www.ovh.com/ 2 rue Kellermann BP 80157..."
		bodyHTML      = `<html><body><p>Votre facture OVH est disponible dans l'espace client.</p></body></html>`
	)
	repaired := RepairSnippetWithBodies(storedSnippet, "", bodyHTML)
	switch {
	case strings.Contains(repaired, "Kellermann"):
		t.Fatalf("snippet = %q, want no legal footer", repaired)
	case !strings.Contains(repaired, "facture"):
		t.Fatalf("snippet = %q", repaired)
	}
}
// TestSnippetFromBodies_skipsFontFaceCSS checks that a stored snippet holding a
// leaked @font-face rule is replaced by text from the HTML body.
func TestSnippetFromBodies_skipsFontFaceCSS(t *testing.T) {
	const (
		storedSnippet = "@font-face { font-family: 'Playfair Display'; font-style: normal;..."
		bodyHTML      = `<html><body><p>Découvrez notre collection printemps.</p></body></html>`
	)
	repaired := RepairSnippetWithBodies(storedSnippet, "", bodyHTML)
	leakedCSS := strings.Contains(repaired, "@font-face")
	if leakedCSS {
		t.Fatalf("snippet = %q", repaired)
	}
	fromBody := strings.Contains(repaired, "collection")
	if !fromBody {
		t.Fatalf("snippet = %q", repaired)
	}
}
func TestSnippetFromBodies_stripsHTMLTags(t *testing.T) { func TestSnippetFromBodies_stripsHTMLTags(t *testing.T) {
text := "<b>Bonjour</b> Eliott, votre <strong>commande</strong> est prête." text := "<b>Bonjour</b> Eliott, votre <strong>commande</strong> est prête."
got := SnippetFromBodies(text, "", 200) got := SnippetFromBodies(text, "", 200)
@ -65,6 +113,26 @@ func TestRepairSnippetWithBodies_stripsStoredHTMLTags(t *testing.T) {
} }
} }
// TestStripSnippetMarkup_decodesEntitiesAndCSSTail checks that stripSnippetMarkup
// decodes a numeric apostrophe entity and removes a trailing CSS-comment tail.
func TestStripSnippetMarkup_decodesEntitiesAndCSSTail(t *testing.T) {
	const input = "Victoria vient d&#39;activer son compte sur passbolt /*//||//..."
	cleaned := stripSnippetMarkup(input)

	// No encoded form of the apostrophe may survive.
	for _, encoded := range []string{"&#39;", "&apos;"} {
		if strings.Contains(cleaned, encoded) {
			t.Fatalf("snippet = %q, want decoded apostrophe", cleaned)
		}
	}
	if !strings.Contains(cleaned, "d'activer") {
		t.Fatalf("snippet = %q, want apostrophe", cleaned)
	}
	// Neither fragment of the CSS junk may remain.
	for _, junk := range []string{"/*", "||//"} {
		if strings.Contains(cleaned, junk) {
			t.Fatalf("snippet = %q, want CSS tail removed", cleaned)
		}
	}
}
// TestSnippetLooksLowQuality_encodedEntities checks that a snippet still
// containing an encoded HTML entity is reported as low quality.
func TestSnippetLooksLowQuality_encodedEntities(t *testing.T) {
	const encoded = "Hello d&#39;activer"
	if SnippetLooksLowQuality(encoded) {
		return
	}
	t.Fatal("expected encoded entities to be low quality")
}
func TestRepairSnippetWithBodies_replacesCSSPreview(t *testing.T) { func TestRepairSnippetWithBodies_replacesCSSPreview(t *testing.T) {
stored := "FacebookMeta for Business :root { color-scheme: light dark;" stored := "FacebookMeta for Business :root { color-scheme: light dark;"
html := `<html><body><p>Inclure automatiquement des informations plus détaillées sur le compte.</p></body></html>` html := `<html><body><p>Inclure automatiquement des informations plus détaillées sur le compte.</p></body></html>`
@ -76,3 +144,18 @@ func TestRepairSnippetWithBodies_replacesCSSPreview(t *testing.T) {
t.Fatalf("snippet = %q, want rebuilt from html", got) t.Fatalf("snippet = %q, want rebuilt from html", got)
} }
} }
// TestIsSnippetBoilerplate_userReportedCases checks that each listed stored
// snippet (separator run, view-in-browser prompt, legal footer, leaked CSS) is
// classified as boilerplate.
func TestIsSnippetBoilerplate_userReportedCases(t *testing.T) {
	reported := [...]string{
		"--------------------------------------- * U...",
		"[ Afficher dans le navigateur ]...",
		"SAS OVH - https://www.ovh.com/ 2 rue Kellermann BP 80157...",
		"FacebookMeta for Business :root { Color-scheme: light dark;...",
		"@font-face { font-family: 'Playfair Display'; font-style: normal;...",
	}
	for i := 0; i < len(reported); i++ {
		snippet := reported[i]
		if isSnippetBoilerplate(snippet) {
			continue
		}
		t.Fatalf("expected boilerplate for %q", snippet)
	}
}