package imap import ( stdhtml "html" "regexp" "strings" "unicode" "golang.org/x/net/html" "github.com/ultisuite/ulti-backend/internal/mail/sanitize" ) var ( snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`) snippetHTMLEntityRE = regexp.MustCompile(`(?i)(?:&#x?[0-9a-f]+;|&[a-z][a-z0-9]{1,8};)`) snippetStyleBlockRE = regexp.MustCompile(`(?is)]*>.*?`) snippetSeparatorRunRE = regexp.MustCompile(`-{8,}|_{8,}|={8,}|\*{8,}`) ) var snippetSkipTags = map[string]bool{ "script": true, "style": true, "head": true, "noscript": true, "meta": true, "link": true, "title": true, "svg": true, } var snippetBlockTags = map[string]bool{ "p": true, "li": true, "td": true, "th": true, "h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true, "div": true, "span": true, "a": true, } // SnippetFromBodies builds a short list-preview from plain and HTML bodies. func SnippetFromBodies(text, html string, maxLen int) string { candidates := snippetCandidates(text, html) best := pickBestSnippetLine(candidates) if best == "" && strings.TrimSpace(html) != "" { flat := stripSnippetMarkup(stripHTMLForSnippet(stripStyleBlocksFromHTML(html))) best = pickBestSnippetLine(splitSnippetSegments(flat)) } if best == "" { return "" } return truncate(stripSnippetMarkup(best), maxLen) } func stripStyleBlocksFromHTML(html string) string { return snippetStyleBlockRE.ReplaceAllString(html, " ") } // stripSnippetMarkup removes HTML tags, entities, and CSS noise from preview text. 
func stripSnippetMarkup(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	// Replace tags with a space so words on either side do not get glued together.
	if snippetHTMLTagRE.MatchString(s) {
		s = snippetHTMLTagRE.ReplaceAllString(s, " ")
	}
	s = unescapeSnippetEntities(s)
	s = stripSnippetCSSTail(s)
	// Collapse every whitespace run to a single space.
	return strings.Join(strings.Fields(s), " ")
}

// unescapeSnippetEntities decodes HTML entities in s. It repeats the decoding
// for at most four passes so multiply-escaped text (e.g. "&amp;amp;") is
// unwound, and stops early as soon as a pass leaves s unchanged.
func unescapeSnippetEntities(s string) string {
	for i := 0; i < 4; i++ {
		prev := s
		s = stdhtml.UnescapeString(s)
		s = snippetHTMLEntityRE.ReplaceAllStringFunc(s, func(entity string) string {
			return stdhtml.UnescapeString(entity)
		})
		if s == prev {
			break
		}
	}
	return s
}

// stripSnippetCSSTail removes trailing CSS comment junk often leaked into stored snippets.
//
// The markers are tried in order; s is cut at the first marker whose preceding
// text, once trimmed, is at least 12 bytes long. Whether or not a cut happened,
// trailing spaces and the characters "/*-|_" are trimmed from the result.
func stripSnippetCSSTail(s string) string {
	s = strings.TrimSpace(s)
	for _, marker := range []string{"/*//", "/*", "//||"} {
		if idx := strings.Index(s, marker); idx >= 0 {
			head := strings.TrimSpace(s[:idx])
			// Only cut when enough text remains in front of the marker.
			if len(head) >= 12 {
				s = head
				break
			}
		}
	}
	return strings.TrimRight(s, " /*-|_")
}

// hasUndecodedHTMLEntities reports whether s contains text matching
// snippetHTMLEntityRE.
func hasUndecodedHTMLEntities(s string) bool {
	return snippetHTMLEntityRE.MatchString(s)
}

// snippetCandidates collects candidate preview lines: first the segments of
// the plain-text body (after stripPlainTextPreheaderPadding and trimming),
// then the candidates extracted from the HTML body.
func snippetCandidates(text, html string) []string {
	var out []string
	text = strings.TrimSpace(stripPlainTextPreheaderPadding(text))
	if text != "" {
		out = append(out, splitSnippetSegments(text)...)
	}
	out = append(out, htmlSnippetCandidates(html)...)
	return out
}

// splitSnippetSegments splits s into lines, splits each line again on
// separator runs (snippetSeparatorRunRE), cleans every piece with
// stripSnippetMarkup, and returns the pieces that are not empty.
func splitSnippetSegments(s string) []string {
	s = strings.ReplaceAll(s, "\r\n", "\n")
	// FieldsFunc drops empty lines.
	raw := strings.FieldsFunc(s, func(r rune) bool { return r == '\n' })
	var segments []string
	for _, line := range raw {
		for _, part := range snippetSeparatorRunRE.Split(line, -1) {
			part = stripSnippetMarkup(part)
			if part == "" {
				continue
			}
			segments = append(segments, part)
		}
	}
	return segments
}

// htmlSnippetCandidates extracts de-duplicated candidate strings from an HTML
// body. It returns nil for blank input. The input is first passed through
// sanitize.StripHiddenEmailHTML and stripStyleBlocksFromHTML, then parsed.
// If parsing fails, it falls back to segments of the stripHTMLForSnippet output.
func htmlSnippetCandidates(raw string) []string {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil
	}
	raw = sanitize.StripHiddenEmailHTML(raw)
	raw = stripStyleBlocksFromHTML(raw)
	doc, err := html.Parse(strings.NewReader(raw))
	if err != nil {
		if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" {
			return splitSnippetSegments(flat)
		}
		return nil
	}
	seen := make(map[string]struct{})
	var candidates []string
	// add cleans s and appends it unless it is empty or already recorded.
	add := func(s string) {
		s = stripSnippetMarkup(s)
		if s == "" {
			return
		}
		if _, ok := seen[s]; ok {
			return
		}
		seen[s] = struct{}{}
		candidates = append(candidates, s)
	}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			tag := strings.ToLower(n.Data)
			// Skipped elements are pruned together with their whole subtree.
			if snippetSkipTags[tag] {
				return
			}
			// A block element contributes its full text as one candidate and
			// is not descended into any further.
			if snippetBlockTags[tag] {
				add(textFromHTMLSubtree(n))
				return
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	// No block element produced a candidate: use the text of the whole document.
	if len(candidates) == 0 {
		add(textFromHTMLSubtree(doc))
	}
	return candidates
}

// textFromHTMLSubtree joins the trimmed, non-empty text nodes found under n
// with single spaces, ignoring subtrees rooted at an element listed in
// snippetSkipTags.
func textFromHTMLSubtree(n *html.Node) string {
	var buf strings.Builder
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && snippetSkipTags[strings.ToLower(node.Data)] {
			return
		}
		if node.Type == html.TextNode {
			t := strings.TrimSpace(node.Data)
			if t != "" {
				if buf.Len() > 0 {
					buf.WriteRune(' ')
				}
				buf.WriteString(t)
			}
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(n)
	return strings.TrimSpace(buf.String())
}

// pickBestSnippetLine returns the highest-scoring candidate after cleaning it
// with stripSnippetMarkup, skipping empty and boilerplate candidates. On equal
// scores the earliest candidate is kept. It returns "" when none qualifies.
func pickBestSnippetLine(candidates []string) string {
	var best string
	bestScore := -1
	for _, c := range candidates {
		c = stripSnippetMarkup(c)
		if c == "" ||
			isSnippetBoilerplate(c) {
			continue
		}
		score := snippetLineScore(c)
		if score > bestScore {
			bestScore = score
			best = c
		}
	}
	return best
}

// snippetLineScore rates s as preview text. Boilerplate and strings with fewer
// than 8 letters score 0. Otherwise the score is 4 points per letter, plus 40
// when the byte length is strictly between 40 and 280, plus 10 when it is 280
// or more, minus 30 when s contains "http://" or "https://" in any case.
func snippetLineScore(s string) int {
	if isSnippetBoilerplate(s) {
		return 0
	}
	letters := 0
	for _, r := range s {
		if unicode.IsLetter(r) {
			letters++
		}
	}
	if letters < 8 {
		return 0
	}
	score := letters * 4
	if len(s) > 40 && len(s) < 280 {
		score += 40
	}
	if len(s) >= 280 {
		score += 10
	}
	if strings.Contains(strings.ToLower(s), "http://") || strings.Contains(strings.ToLower(s), "https://") {
		score -= 30
	}
	return score
}

// isSnippetBoilerplate reports whether s should be rejected as preview text:
// shorter than 4 bytes after trimming, CSS-looking, a separator line, a
// "view in browser" notice, a legal footer, a known boilerplate phrase in a
// string under 200 bytes, a URL-leading string under 120 bytes, or text whose
// letter ratio (snippetLetterRatio) is below 0.35.
func isSnippetBoilerplate(s string) bool {
	s = strings.TrimSpace(s)
	if s == "" || len(s) < 4 {
		return true
	}
	if looksLikeCSSSnippet(s) {
		return true
	}
	if isMostlySeparatorLine(s) || hasLeadingSeparatorRun(s) {
		return true
	}
	if isViewInBrowserSnippet(s) || isSnippetLegalFooter(s) {
		return true
	}
	lower := strings.ToLower(s)
	// French and English phrases matched against the lowercased text.
	boilerplate := []string{
		"si vous ne visualisez pas",
		"si vous n'arrivez pas à lire",
		"si vous n'arrivez pas a lire",
		"problems viewing this email",
		"having trouble viewing",
		"cliquer ici",
		"click here",
		"unsubscribe",
		"se désabonner",
		"se desabonner",
		"manage your preferences",
		"gérer vos préférences",
	}
	for _, phrase := range boilerplate {
		if strings.Contains(lower, phrase) && len(s) < 200 {
			return true
		}
	}
	// For a string starting with a URL the length alone decides.
	if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
		return len(s) < 120
	}
	letterRatio := snippetLetterRatio(s)
	return letterRatio < 0.35
}

// normalizeSnippetMatchText lowercases s, replaces brackets, parentheses and
// the middle dot with spaces, and collapses whitespace runs to single spaces.
func normalizeSnippetMatchText(s string) string {
	s = strings.ToLower(s)
	s = strings.NewReplacer("[", " ", "]", " ", "(", " ", ")", " ", "·", " ").Replace(s)
	return strings.Join(strings.Fields(s), " ")
}

// isViewInBrowserSnippet reports whether s is essentially a "view in browser"
// notice: its normalized form contains one of the known phrases and either at
// most 20 bytes remain once that phrase is removed, or s is at most 90 bytes.
func isViewInBrowserSnippet(s string) bool {
	norm := normalizeSnippetMatchText(s)
	phrases := []string{
		"afficher dans le navigateur",
		"view in browser",
		"view this email in your browser",
		"voir ce message en ligne",
		"voir la version en ligne",
		"version en ligne",
	}
	for _, phrase := range phrases {
		if !strings.Contains(norm, phrase) {
			continue
		}
		rest := strings.TrimSpace(strings.ReplaceAll(norm, phrase, ""))
		if len(rest) <= 20 || len(s) <= 90 {
			return true
		}
	}
	return false
}

// isSnippetLegalFooter reports whether s looks like a company/legal footer.
// Only text containing "http" or "www." is considered. It matches when at
// least two markers occur, when "https://" appears together with "rue " or
// " bp ", or when the text starts with "sas " (all on the lowercased text).
func isSnippetLegalFooter(s string) bool {
	lower := strings.ToLower(s)
	if !strings.Contains(lower, "http") && !strings.Contains(lower, "www.") {
		return false
	}
	markers := []string{
		"sas ",
		"sarl ",
		" sa ",
		" sas.",
		"rue ",
		" bp ",
		"rcs ",
		"kellermann",
		"ovh.com",
		"www.ovh",
	}
	hits := 0
	for _, m := range markers {
		if strings.Contains(lower, m) {
			hits++
		}
	}
	if hits >= 2 {
		return true
	}
	if strings.Contains(lower, "https://") && (strings.Contains(lower, "rue ") || strings.Contains(lower, " bp ")) {
		return true
	}
	return strings.HasPrefix(lower, "sas ") && strings.Contains(lower, "http")
}

// hasLeadingSeparatorRun reports whether the trimmed s starts with a run of
// at least 12 separator characters ('-', '_', '*', '=', '·', '—').
func hasLeadingSeparatorRun(s string) bool {
	trimmed := strings.TrimSpace(s)
	// Byte-length pre-check; the run below is counted in runes.
	if len(trimmed) < 12 {
		return false
	}
	run := 0
	for _, r := range trimmed {
		switch r {
		case '-', '_', '*', '=', '·', '—':
			run++
		default:
			// First non-separator rune ends the leading run.
			goto done
		}
	}
done:
	return run >= 12
}

func looksLikeCSSSnippet(s string) bool {
	lower := strings.ToLower(s)
	if strings.Contains(lower, ":root") ||
		strings.Contains(lower, "color-scheme:") ||
		strings.Contains(lower, "@media") ||
		strings.Contains(lower, "@font-face") ||
		strings.Contains(lower, "font-family:") ||
		strings.Contains(lower, "