package imap
import (
stdhtml "html"
"regexp"
"strings"
"unicode"
"golang.org/x/net/html"
"github.com/ultisuite/ulti-backend/internal/mail/sanitize"
)
// Precompiled patterns used while cleaning snippet text. They are compiled once
// at package initialisation; MustCompile panics on an invalid pattern.
var (
	// snippetHTMLTagRE matches anything shaped like an HTML tag, including
	// tags whose attributes span several lines.
	snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`)
	// snippetHTMLEntityRE matches numeric character references (decimal such
	// as "&#39;" or hex such as "&#x27;") and named ones such as "&amp;",
	// case-insensitively.
	snippetHTMLEntityRE = regexp.MustCompile(`(?i)(?:&#x?[0-9a-f]+;|&[a-z][a-z0-9]{1,8};)`)
	// snippetStyleBlockRE matches a complete <style>...</style> element,
	// contents included. The body match is lazy so consecutive style elements
	// are matched one at a time and the text between them is preserved.
	snippetStyleBlockRE = regexp.MustCompile(`(?is)<style\b[^>]*>.*?</style\s*>`)
	// snippetSeparatorRunRE matches a run of eight or more identical
	// separator characters: '-', '_', '=' or '*'.
	snippetSeparatorRunRE = regexp.MustCompile(`-{8,}|_{8,}|={8,}|\*{8,}`)
)
// snippetSkipTags lists the elements whose entire subtree is ignored when
// text is collected for a snippet.
var snippetSkipTags = map[string]bool{
	"head":     true,
	"link":     true,
	"meta":     true,
	"noscript": true,
	"script":   true,
	"style":    true,
	"svg":      true,
	"title":    true,
}
// snippetBlockTags lists the elements whose complete text content is taken as
// one snippet candidate by htmlSnippetCandidates. Despite the name, the set
// also contains the inline elements "span" and "a".
var snippetBlockTags = map[string]bool{
	// Containers and text blocks.
	"div": true,
	"p":   true,
	"li":  true,
	// Table cells.
	"td": true,
	"th": true,
	// Headings.
	"h1": true,
	"h2": true,
	"h3": true,
	"h4": true,
	"h5": true,
	"h6": true,
	// Inline elements.
	"a":    true,
	"span": true,
}
// SnippetFromBodies builds a short list-preview from plain and HTML bodies.
func SnippetFromBodies(text, html string, maxLen int) string {
candidates := snippetCandidates(text, html)
best := pickBestSnippetLine(candidates)
if best == "" && strings.TrimSpace(html) != "" {
flat := stripSnippetMarkup(stripHTMLForSnippet(stripStyleBlocksFromHTML(html)))
best = pickBestSnippetLine(splitSnippetSegments(flat))
}
if best == "" {
return ""
}
return truncate(stripSnippetMarkup(best), maxLen)
}
// stripStyleBlocksFromHTML returns html with every match of
// snippetStyleBlockRE replaced by a single space.
func stripStyleBlocksFromHTML(html string) string {
	const filler = " "
	return snippetStyleBlockRE.ReplaceAllString(html, filler)
}
// stripSnippetMarkup turns a raw candidate into clean single-line preview text.
//
// In order: text matched by snippetHTMLTagRE is replaced with spaces, HTML
// entities are decoded, trailing CSS-comment junk is cut off, and every run of
// whitespace is collapsed to a single space. Blank input yields "".
func stripSnippetMarkup(s string) string {
	cleaned := strings.TrimSpace(s)
	if cleaned == "" {
		return ""
	}
	// ReplaceAllString hands back its input unchanged when nothing matches,
	// so no separate match test is needed before replacing.
	cleaned = snippetHTMLTagRE.ReplaceAllString(cleaned, " ")
	cleaned = stripSnippetCSSTail(unescapeSnippetEntities(cleaned))

	words := strings.Fields(cleaned)
	return strings.Join(words, " ")
}
// unescapeSnippetEntities decodes HTML entities in s, repeating the decode so
// that multiply-escaped text (for example "&amp;amp;") is unwrapped as well.
//
// Each pass unescapes the whole string and then unescapes every remaining
// match of snippetHTMLEntityRE individually. At most four passes run, and the
// loop stops early as soon as a pass leaves the text unchanged.
func unescapeSnippetEntities(s string) string {
	const maxPasses = 4
	for pass := 0; pass < maxPasses; pass++ {
		decoded := stdhtml.UnescapeString(s)
		decoded = snippetHTMLEntityRE.ReplaceAllStringFunc(decoded, stdhtml.UnescapeString)
		if decoded == s {
			break
		}
		s = decoded
	}
	return s
}
// stripSnippetCSSTail drops CSS-comment debris that tends to trail stored
// snippets.
//
// The markers "/*//", "/*" and "//||" are tried in that order. The text is cut
// at the first occurrence of the first marker that leaves at least 12 bytes of
// trimmed text in front of it; a marker with a shorter head is ignored. After
// that, any trailing run of ' ', '/', '*', '-', '|' or '_' is removed.
func stripSnippetCSSTail(s string) string {
	const minHeadLen = 12
	cssMarkers := [...]string{"/*//", "/*", "//||"}

	out := strings.TrimSpace(s)
	for i := range cssMarkers {
		pos := strings.Index(out, cssMarkers[i])
		if pos < 0 {
			continue
		}
		kept := strings.TrimSpace(out[:pos])
		if len(kept) < minHeadLen {
			continue
		}
		// Only the first qualifying marker is applied.
		out = kept
		break
	}
	return strings.TrimRight(out, " /*-|_")
}
// hasUndecodedHTMLEntities reports whether s contains at least one match of
// snippetHTMLEntityRE, i.e. text that is still shaped like an HTML entity.
func hasUndecodedHTMLEntities(s string) bool {
	return snippetHTMLEntityRE.FindStringIndex(s) != nil
}
// snippetCandidates collects preview candidates from both bodies. Segments of
// the plain-text body come first (after stripPlainTextPreheaderPadding and
// trimming), followed by the candidates extracted from the HTML body.
func snippetCandidates(text, html string) []string {
	var fromText []string
	if plain := strings.TrimSpace(stripPlainTextPreheaderPadding(text)); plain != "" {
		fromText = splitSnippetSegments(plain)
	}
	return append(fromText, htmlSnippetCandidates(html)...)
}
// splitSnippetSegments breaks s into cleaned candidate segments.
//
// The text is split into lines (CRLF is treated as LF), each line is split
// again wherever snippetSeparatorRunRE matches, and every piece is passed
// through stripSnippetMarkup. Pieces that end up empty are dropped; the
// remaining ones are returned in their original order.
func splitSnippetSegments(s string) []string {
	var out []string
	normalized := strings.ReplaceAll(s, "\r\n", "\n")
	for _, line := range strings.Split(normalized, "\n") {
		// An empty line yields a single empty piece, which is discarded below.
		for _, piece := range snippetSeparatorRunRE.Split(line, -1) {
			if cleaned := stripSnippetMarkup(piece); cleaned != "" {
				out = append(out, cleaned)
			}
		}
	}
	return out
}
func htmlSnippetCandidates(raw string) []string {
raw = strings.TrimSpace(raw)
if raw == "" {
return nil
}
raw = sanitize.StripHiddenEmailHTML(raw)
raw = stripStyleBlocksFromHTML(raw)
doc, err := html.Parse(strings.NewReader(raw))
if err != nil {
if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" {
return splitSnippetSegments(flat)
}
return nil
}
seen := make(map[string]struct{})
var candidates []string
add := func(s string) {
s = stripSnippetMarkup(s)
if s == "" {
return
}
if _, ok := seen[s]; ok {
return
}
seen[s] = struct{}{}
candidates = append(candidates, s)
}
var walk func(*html.Node)
walk = func(n *html.Node) {
if n.Type == html.ElementNode {
tag := strings.ToLower(n.Data)
if snippetSkipTags[tag] {
return
}
if snippetBlockTags[tag] {
add(textFromHTMLSubtree(n))
return
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
walk(c)
}
}
walk(doc)
if len(candidates) == 0 {
add(textFromHTMLSubtree(doc))
}
return candidates
}
// textFromHTMLSubtree concatenates the text nodes beneath n (n included) in
// document order. Each text node is trimmed, blank ones are skipped, and the
// remaining pieces are joined with single spaces. Elements listed in
// snippetSkipTags are ignored together with everything below them.
func textFromHTMLSubtree(n *html.Node) string {
	var text strings.Builder

	var collect func(*html.Node)
	collect = func(cur *html.Node) {
		switch cur.Type {
		case html.ElementNode:
			if snippetSkipTags[strings.ToLower(cur.Data)] {
				return
			}
		case html.TextNode:
			if piece := strings.TrimSpace(cur.Data); piece != "" {
				if text.Len() > 0 {
					text.WriteByte(' ')
				}
				text.WriteString(piece)
			}
		}
		for child := cur.FirstChild; child != nil; child = child.NextSibling {
			collect(child)
		}
	}
	collect(n)

	return strings.TrimSpace(text.String())
}
// pickBestSnippetLine returns the highest-scoring candidate after cleaning each
// one with stripSnippetMarkup. Candidates that are empty or classified as
// boilerplate are skipped. On equal scores the earliest candidate wins. The
// result is "" when nothing qualifies.
func pickBestSnippetLine(candidates []string) string {
	winner, top := "", -1
	for i := range candidates {
		line := stripSnippetMarkup(candidates[i])
		if line == "" || isSnippetBoilerplate(line) {
			continue
		}
		if sc := snippetLineScore(line); sc > top {
			winner, top = line, sc
		}
	}
	return winner
}
// snippetLineScore rates how suitable s is as a preview line; higher is better.
//
// Boilerplate lines and lines with fewer than 8 letters score 0. Otherwise the
// score is 4 points per letter, plus a length bonus (40 for byte lengths
// between 41 and 279, 10 for 280 and above), minus 30 when the line contains
// an "http://" or "https://" URL (matched case-insensitively).
func snippetLineScore(s string) int {
	const minLetters = 8

	if isSnippetBoilerplate(s) {
		return 0
	}

	var letterCount int
	for _, r := range s {
		if unicode.IsLetter(r) {
			letterCount++
		}
	}
	if letterCount < minLetters {
		return 0
	}

	total := 4 * letterCount
	switch n := len(s); {
	case n >= 280:
		total += 10
	case n > 40:
		total += 40
	}

	lowered := strings.ToLower(s)
	if strings.Contains(lowered, "http://") || strings.Contains(lowered, "https://") {
		total -= 30
	}
	return total
}
// isSnippetBoilerplate reports whether s is unsuitable as a preview line.
//
// After trimming, a line is boilerplate when any of these holds, checked in
// order:
//   - it is shorter than 4 bytes;
//   - it looks like CSS, is mostly separators, or starts with a long
//     separator run;
//   - it is a "view in browser" line or a legal footer;
//   - it is shorter than 200 bytes and contains a known newsletter phrase
//     (unsubscribe, click here, ... in English or French);
//   - it starts with an http(s) URL and is shorter than 120 bytes (a longer
//     URL-led line is accepted without further checks);
//   - its letter ratio is below 0.35.
func isSnippetBoilerplate(s string) bool {
	s = strings.TrimSpace(s)
	switch {
	case len(s) < 4: // also covers the empty string
		return true
	case looksLikeCSSSnippet(s):
		return true
	case isMostlySeparatorLine(s), hasLeadingSeparatorRun(s):
		return true
	case isViewInBrowserSnippet(s), isSnippetLegalFooter(s):
		return true
	}

	lowered := strings.ToLower(s)
	if len(s) < 200 {
		knownPhrases := [...]string{
			"si vous ne visualisez pas",
			"si vous n'arrivez pas à lire",
			"si vous n'arrivez pas a lire",
			"problems viewing this email",
			"having trouble viewing",
			"cliquer ici",
			"click here",
			"unsubscribe",
			"se désabonner",
			"se desabonner",
			"manage your preferences",
			"gérer vos préférences",
		}
		for i := range knownPhrases {
			if strings.Contains(lowered, knownPhrases[i]) {
				return true
			}
		}
	}

	startsWithURL := strings.HasPrefix(lowered, "http://") || strings.HasPrefix(lowered, "https://")
	if startsWithURL {
		return len(s) < 120
	}
	return snippetLetterRatio(s) < 0.35
}
// normalizeSnippetMatchText prepares s for phrase matching: the text is
// lower-cased, the characters '[', ']', '(', ')' and '·' are treated like
// whitespace, and the remaining words are joined with single spaces.
func normalizeSnippetMatchText(s string) string {
	isBreak := func(r rune) bool {
		switch r {
		case '[', ']', '(', ')', '·':
			return true
		}
		return unicode.IsSpace(r)
	}
	words := strings.FieldsFunc(strings.ToLower(s), isBreak)
	return strings.Join(words, " ")
}
// isViewInBrowserSnippet reports whether s is essentially a "view this email
// in your browser" link line (English or French).
//
// The normalized text must contain one of the known phrases. It then counts as
// such a line when s itself is at most 90 bytes long, or when at most 20 bytes
// of normalized text remain after removing that phrase.
func isViewInBrowserSnippet(s string) bool {
	normalized := normalizeSnippetMatchText(s)
	shortLine := len(s) <= 90

	for _, link := range [...]string{
		"afficher dans le navigateur",
		"view in browser",
		"view this email in your browser",
		"voir ce message en ligne",
		"voir la version en ligne",
		"version en ligne",
	} {
		if !strings.Contains(normalized, link) {
			continue
		}
		remainder := strings.TrimSpace(strings.ReplaceAll(normalized, link, ""))
		if shortLine || len(remainder) <= 20 {
			return true
		}
	}
	return false
}
// isSnippetLegalFooter reports whether s looks like a company legal or address
// footer. Matching is case-insensitive.
//
// The line must mention a web address ("http" or "www."). It is then a footer
// when it contains at least two of the company/address markers, or an
// "https://" link together with "rue " or " bp ", or when it starts with
// "sas " and contains "http".
func isSnippetLegalFooter(s string) bool {
	lowered := strings.ToLower(s)
	has := func(sub string) bool { return strings.Contains(lowered, sub) }

	if !(has("http") || has("www.")) {
		return false
	}

	matched := 0
	for _, marker := range [...]string{
		"sas ", "sarl ", " sa ", " sas.", "rue ", " bp ", "rcs ",
		"kellermann", "ovh.com", "www.ovh",
	} {
		if has(marker) {
			matched++
		}
	}

	switch {
	case matched >= 2:
		return true
	case has("https://") && (has("rue ") || has(" bp ")):
		return true
	default:
		return strings.HasPrefix(lowered, "sas ") && has("http")
	}
}
// hasLeadingSeparatorRun reports whether the trimmed text begins with at least
// 12 consecutive separator characters. Any mix of '-', '_', '*', '=', '·' and
// '—' counts towards the run. Text shorter than 12 bytes never qualifies.
func hasLeadingSeparatorRun(s string) bool {
	const (
		minRun     = 12
		separators = "-_*=·—"
	)

	body := strings.TrimSpace(s)
	if len(body) < minRun {
		return false
	}

	count := 0
	for _, r := range body {
		if !strings.ContainsRune(separators, r) {
			break
		}
		count++
	}
	return count >= minRun
}
func looksLikeCSSSnippet(s string) bool {
lower := strings.ToLower(s)
if strings.Contains(lower, ":root") ||
strings.Contains(lower, "color-scheme:") ||
strings.Contains(lower, "@media") ||
strings.Contains(lower, "@font-face") ||
strings.Contains(lower, "font-family:") ||
strings.Contains(lower, "