- Improved the `RepairSnippetWithBodies` function to streamline snippet rebuilding logic and reduce redundancy. - Introduced new utility functions for stripping CSS noise and decoding HTML entities in snippets. - Enhanced boilerplate detection to better identify low-quality snippets, including legal footers and view-in-browser prompts. - Added comprehensive tests for new functionality and edge cases in snippet processing.
428 lines
10 KiB
Go
428 lines
10 KiB
Go
package imap
|
|
|
|
import (
|
|
stdhtml "html"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/ultisuite/ulti-backend/internal/mail/sanitize"
|
|
)
|
|
|
|
// Precompiled patterns shared by the snippet helpers in this file.
var (
	// snippetHTMLTagRE matches a '<', any run of characters other than '>'
	// (newlines included), and the closing '>'.
	snippetHTMLTagRE = regexp.MustCompile(`(?is)<[^>]*>`)

	// snippetHTMLEntityRE matches numeric references such as "&#39;" or
	// "&#x27;" and short named references such as "&amp;", case-insensitively.
	snippetHTMLEntityRE = regexp.MustCompile(`(?i)(?:&#x?[0-9a-f]+;|&[a-z][a-z0-9]{1,8};)`)

	// snippetStyleBlockRE matches a whole <style ...>...</style> element,
	// non-greedily and across line breaks.
	snippetStyleBlockRE = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)

	// snippetSeparatorRunRE matches a run of eight or more identical
	// '-', '_', '=' or '*' characters.
	snippetSeparatorRunRE = regexp.MustCompile(`-{8,}|_{8,}|={8,}|\*{8,}`)
)
|
|
|
|
// snippetSkipTags names the HTML elements whose entire subtree is ignored
// when text is collected for a preview.
var snippetSkipTags = map[string]bool{
	"head":     true,
	"link":     true,
	"meta":     true,
	"noscript": true,
	"script":   true,
	"style":    true,
	"svg":      true,
	"title":    true,
}
|
|
|
|
// snippetBlockTags names the HTML elements whose collected subtree text is
// emitted as one snippet candidate; the HTML walk does not descend further
// once it has emitted such an element.
var snippetBlockTags = map[string]bool{
	// Paragraph, list and table cells.
	"p":  true,
	"li": true,
	"td": true,
	"th": true,
	// Headings.
	"h1": true,
	"h2": true,
	"h3": true,
	"h4": true,
	"h5": true,
	"h6": true,
	// Generic containers and links.
	"div":  true,
	"span": true,
	"a":    true,
}
|
|
|
|
// SnippetFromBodies builds a short list-preview from plain and HTML bodies.
|
|
func SnippetFromBodies(text, html string, maxLen int) string {
|
|
candidates := snippetCandidates(text, html)
|
|
best := pickBestSnippetLine(candidates)
|
|
if best == "" && strings.TrimSpace(html) != "" {
|
|
flat := stripSnippetMarkup(stripHTMLForSnippet(stripStyleBlocksFromHTML(html)))
|
|
best = pickBestSnippetLine(splitSnippetSegments(flat))
|
|
}
|
|
if best == "" {
|
|
return ""
|
|
}
|
|
return truncate(stripSnippetMarkup(best), maxLen)
|
|
}
|
|
|
|
func stripStyleBlocksFromHTML(html string) string {
|
|
return snippetStyleBlockRE.ReplaceAllString(html, " ")
|
|
}
|
|
|
|
// stripSnippetMarkup removes HTML tags, entities, and CSS noise from preview text.
|
|
func stripSnippetMarkup(s string) string {
|
|
s = strings.TrimSpace(s)
|
|
if s == "" {
|
|
return ""
|
|
}
|
|
if snippetHTMLTagRE.MatchString(s) {
|
|
s = snippetHTMLTagRE.ReplaceAllString(s, " ")
|
|
}
|
|
s = unescapeSnippetEntities(s)
|
|
s = stripSnippetCSSTail(s)
|
|
return strings.Join(strings.Fields(s), " ")
|
|
}
|
|
|
|
func unescapeSnippetEntities(s string) string {
|
|
for i := 0; i < 4; i++ {
|
|
prev := s
|
|
s = stdhtml.UnescapeString(s)
|
|
s = snippetHTMLEntityRE.ReplaceAllStringFunc(s, func(entity string) string {
|
|
return stdhtml.UnescapeString(entity)
|
|
})
|
|
if s == prev {
|
|
break
|
|
}
|
|
}
|
|
return s
|
|
}
|
|
|
|
// stripSnippetCSSTail removes trailing CSS comment junk often leaked into
// stored snippets.
//
// The text is cut at the earliest CSS-noise marker ("/*//", "/*" or "//||")
// that still leaves at least 12 bytes of trimmed text in front of it; markers
// that would leave a shorter head are ignored so that short snippets are not
// wiped out. Cutting at the earliest qualifying marker (rather than the first
// marker in a fixed priority order) guarantees no other qualifying marker is
// left behind inside the kept text. Finally, trailing spaces and the
// characters '/', '*', '-', '|' and '_' are trimmed.
func stripSnippetCSSTail(s string) string {
	const minHeadLen = 12

	s = strings.TrimSpace(s)

	cut := -1
	for _, marker := range []string{"/*//", "/*", "//||"} {
		idx := strings.Index(s, marker)
		if idx < 0 {
			continue
		}
		if len(strings.TrimSpace(s[:idx])) < minHeadLen {
			continue
		}
		if cut < 0 || idx < cut {
			cut = idx
		}
	}
	if cut >= 0 {
		s = strings.TrimSpace(s[:cut])
	}

	return strings.TrimRight(s, " /*-|_")
}
|
|
|
|
func hasUndecodedHTMLEntities(s string) bool {
|
|
return snippetHTMLEntityRE.MatchString(s)
|
|
}
|
|
|
|
func snippetCandidates(text, html string) []string {
|
|
var out []string
|
|
text = strings.TrimSpace(stripPlainTextPreheaderPadding(text))
|
|
if text != "" {
|
|
out = append(out, splitSnippetSegments(text)...)
|
|
}
|
|
out = append(out, htmlSnippetCandidates(html)...)
|
|
return out
|
|
}
|
|
|
|
func splitSnippetSegments(s string) []string {
|
|
s = strings.ReplaceAll(s, "\r\n", "\n")
|
|
raw := strings.FieldsFunc(s, func(r rune) bool {
|
|
return r == '\n'
|
|
})
|
|
var segments []string
|
|
for _, line := range raw {
|
|
for _, part := range snippetSeparatorRunRE.Split(line, -1) {
|
|
part = stripSnippetMarkup(part)
|
|
if part == "" {
|
|
continue
|
|
}
|
|
segments = append(segments, part)
|
|
}
|
|
}
|
|
return segments
|
|
}
|
|
|
|
func htmlSnippetCandidates(raw string) []string {
|
|
raw = strings.TrimSpace(raw)
|
|
if raw == "" {
|
|
return nil
|
|
}
|
|
raw = sanitize.StripHiddenEmailHTML(raw)
|
|
raw = stripStyleBlocksFromHTML(raw)
|
|
doc, err := html.Parse(strings.NewReader(raw))
|
|
if err != nil {
|
|
if flat := strings.TrimSpace(stripHTMLForSnippet(raw)); flat != "" {
|
|
return splitSnippetSegments(flat)
|
|
}
|
|
return nil
|
|
}
|
|
seen := make(map[string]struct{})
|
|
var candidates []string
|
|
add := func(s string) {
|
|
s = stripSnippetMarkup(s)
|
|
if s == "" {
|
|
return
|
|
}
|
|
if _, ok := seen[s]; ok {
|
|
return
|
|
}
|
|
seen[s] = struct{}{}
|
|
candidates = append(candidates, s)
|
|
}
|
|
var walk func(*html.Node)
|
|
walk = func(n *html.Node) {
|
|
if n.Type == html.ElementNode {
|
|
tag := strings.ToLower(n.Data)
|
|
if snippetSkipTags[tag] {
|
|
return
|
|
}
|
|
if snippetBlockTags[tag] {
|
|
add(textFromHTMLSubtree(n))
|
|
return
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(doc)
|
|
if len(candidates) == 0 {
|
|
add(textFromHTMLSubtree(doc))
|
|
}
|
|
return candidates
|
|
}
|
|
|
|
func textFromHTMLSubtree(n *html.Node) string {
|
|
var buf strings.Builder
|
|
var walk func(*html.Node)
|
|
walk = func(node *html.Node) {
|
|
if node.Type == html.ElementNode && snippetSkipTags[strings.ToLower(node.Data)] {
|
|
return
|
|
}
|
|
if node.Type == html.TextNode {
|
|
t := strings.TrimSpace(node.Data)
|
|
if t != "" {
|
|
if buf.Len() > 0 {
|
|
buf.WriteRune(' ')
|
|
}
|
|
buf.WriteString(t)
|
|
}
|
|
}
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(n)
|
|
return strings.TrimSpace(buf.String())
|
|
}
|
|
|
|
func pickBestSnippetLine(candidates []string) string {
|
|
var best string
|
|
bestScore := -1
|
|
for _, c := range candidates {
|
|
c = stripSnippetMarkup(c)
|
|
if c == "" || isSnippetBoilerplate(c) {
|
|
continue
|
|
}
|
|
score := snippetLineScore(c)
|
|
if score > bestScore {
|
|
bestScore = score
|
|
best = c
|
|
}
|
|
}
|
|
return best
|
|
}
|
|
|
|
func snippetLineScore(s string) int {
|
|
if isSnippetBoilerplate(s) {
|
|
return 0
|
|
}
|
|
letters := 0
|
|
for _, r := range s {
|
|
if unicode.IsLetter(r) {
|
|
letters++
|
|
}
|
|
}
|
|
if letters < 8 {
|
|
return 0
|
|
}
|
|
score := letters * 4
|
|
if len(s) > 40 && len(s) < 280 {
|
|
score += 40
|
|
}
|
|
if len(s) >= 280 {
|
|
score += 10
|
|
}
|
|
if strings.Contains(strings.ToLower(s), "http://") || strings.Contains(strings.ToLower(s), "https://") {
|
|
score -= 30
|
|
}
|
|
return score
|
|
}
|
|
|
|
func isSnippetBoilerplate(s string) bool {
|
|
s = strings.TrimSpace(s)
|
|
if s == "" || len(s) < 4 {
|
|
return true
|
|
}
|
|
if looksLikeCSSSnippet(s) {
|
|
return true
|
|
}
|
|
if isMostlySeparatorLine(s) || hasLeadingSeparatorRun(s) {
|
|
return true
|
|
}
|
|
if isViewInBrowserSnippet(s) || isSnippetLegalFooter(s) {
|
|
return true
|
|
}
|
|
lower := strings.ToLower(s)
|
|
boilerplate := []string{
|
|
"si vous ne visualisez pas",
|
|
"si vous n'arrivez pas à lire",
|
|
"si vous n'arrivez pas a lire",
|
|
"problems viewing this email",
|
|
"having trouble viewing",
|
|
"cliquer ici",
|
|
"click here",
|
|
"unsubscribe",
|
|
"se désabonner",
|
|
"se desabonner",
|
|
"manage your preferences",
|
|
"gérer vos préférences",
|
|
}
|
|
for _, phrase := range boilerplate {
|
|
if strings.Contains(lower, phrase) && len(s) < 200 {
|
|
return true
|
|
}
|
|
}
|
|
if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
|
|
return len(s) < 120
|
|
}
|
|
letterRatio := snippetLetterRatio(s)
|
|
return letterRatio < 0.35
|
|
}
|
|
|
|
// normalizeSnippetMatchText prepares s for phrase matching: it lower-cases the
// text, turns square brackets, parentheses and middle dots into spaces, and
// collapses every whitespace run to a single space.
func normalizeSnippetMatchText(s string) string {
	blanked := strings.Map(func(r rune) rune {
		switch r {
		case '[', ']', '(', ')', '·':
			return ' '
		}
		return r
	}, strings.ToLower(s))

	return strings.Join(strings.Fields(blanked), " ")
}
|
|
|
|
func isViewInBrowserSnippet(s string) bool {
|
|
norm := normalizeSnippetMatchText(s)
|
|
phrases := []string{
|
|
"afficher dans le navigateur",
|
|
"view in browser",
|
|
"view this email in your browser",
|
|
"voir ce message en ligne",
|
|
"voir la version en ligne",
|
|
"version en ligne",
|
|
}
|
|
for _, phrase := range phrases {
|
|
if !strings.Contains(norm, phrase) {
|
|
continue
|
|
}
|
|
rest := strings.TrimSpace(strings.ReplaceAll(norm, phrase, ""))
|
|
if len(rest) <= 20 || len(s) <= 90 {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isSnippetLegalFooter reports whether s looks like a company legal footer
// (company form, street address, registration number) rather than message
// content.
//
// Only lines containing "http" or "www." are considered. Such a line is a
// footer when it contains at least two footer markers, when it combines an
// https:// URL with a street ("rue ") or PO box (" bp ") marker, or when it
// starts with "sas " and contains "http".
//
// The markers "sas ", "sarl ", "rue " and "rcs " must start at a word boundary
// (not directly preceded by a letter); otherwise everyday words such as "true"
// or "canvas" would count as "rue " / "sas " and ordinary sentences with a
// link would be misclassified as footers.
func isSnippetLegalFooter(s string) bool {
	lower := strings.ToLower(s)
	if !strings.Contains(lower, "http") && !strings.Contains(lower, "www.") {
		return false
	}

	// hasWordStart reports whether marker occurs in lower at a position that
	// is not immediately preceded by a letter.
	hasWordStart := func(marker string) bool {
		for from := 0; from <= len(lower)-len(marker); {
			rel := strings.Index(lower[from:], marker)
			if rel < 0 {
				return false
			}
			at := from + rel
			before := lower[:at]
			// TrimRightFunc leaves the string untouched exactly when its last
			// rune is not a letter (or the string is empty).
			if len(strings.TrimRightFunc(before, unicode.IsLetter)) == len(before) {
				return true
			}
			from = at + 1
		}
		return false
	}

	hits := 0
	for _, marker := range []string{"sas ", "sarl ", "rue ", "rcs "} {
		if hasWordStart(marker) {
			hits++
		}
	}
	// These markers are already delimited by a leading space or are distinctive
	// enough to be matched as plain substrings.
	for _, marker := range []string{" sa ", " sas.", " bp ", "kellermann", "ovh.com", "www.ovh"} {
		if strings.Contains(lower, marker) {
			hits++
		}
	}
	if hits >= 2 {
		return true
	}

	if strings.Contains(lower, "https://") && (hasWordStart("rue ") || strings.Contains(lower, " bp ")) {
		return true
	}

	return strings.HasPrefix(lower, "sas ") && strings.Contains(lower, "http")
}
|
|
|
|
// hasLeadingSeparatorRun reports whether s, once trimmed, begins with at least
// twelve consecutive separator characters ('-', '_', '*', '=', '·' or '—').
func hasLeadingSeparatorRun(s string) bool {
	const (
		minRun     = 12
		separators = "-_*=·—"
	)

	body := strings.TrimSpace(s)
	if len(body) < minRun {
		return false
	}

	count := 0
	for _, r := range body {
		if !strings.ContainsRune(separators, r) {
			break
		}
		count++
	}
	return count >= minRun
}
|
|
|
|
// looksLikeCSSSnippet reports whether s appears to be leaked CSS or markup
// noise rather than readable text.
//
// It returns true when s (case-insensitively) contains a telltale CSS/markup
// token, mentions "meta for business" together with an opening brace, starts
// with a CSS comment, contains one of the comment-junk sequences, or looks
// like a rule block (both braces and a semicolon) using a font-, margin: or
// padding: property.
//
// A former "facebook" + ":root" check was removed because ":root" alone
// already returns true, making that branch unreachable; the redundant brace
// count and duplicate "font-family" test were dropped for the same reason.
// Results are unchanged.
func looksLikeCSSSnippet(s string) bool {
	lower := strings.ToLower(s)

	for _, token := range []string{
		":root",
		"color-scheme:",
		"@media",
		"@font-face",
		"font-family:",
		"<!--",
	} {
		if strings.Contains(lower, token) {
			return true
		}
	}

	if strings.Contains(lower, "meta for business") && strings.Contains(s, "{") {
		return true
	}

	if strings.HasPrefix(strings.TrimSpace(s), "/*") {
		return true
	}

	for _, junk := range []string{"/*//", "||//", "//||"} {
		if strings.Contains(s, junk) {
			return true
		}
	}

	// A rule block: "{ ... ; ... }" with a typical layout/typography property.
	if strings.Contains(s, "{") && strings.Contains(s, "}") && strings.Contains(s, ";") {
		for _, prop := range []string{"font-", "margin:", "padding:"} {
			if strings.Contains(lower, prop) {
				return true
			}
		}
	}

	return false
}
|
|
|
|
// isMostlySeparatorLine reports whether at least 55% of the characters in s
// are separator characters ('-', '_', '*', '=', '·', '—' or '|'). Strings
// shorter than 8 bytes are never considered separator lines.
func isMostlySeparatorLine(s string) bool {
	if len(s) < 8 {
		return false
	}

	const separators = "-_*=·—|"

	seps, total := 0, 0
	for _, r := range s {
		total++
		if strings.ContainsRune(separators, r) {
			seps++
		}
	}

	// total is at least 1 here because s holds at least 8 bytes.
	return float64(seps)/float64(total) >= 0.55
}
|
|
|
|
// snippetLetterRatio returns the fraction of characters in s that are letters
// or numbers, as a value between 0 and 1. An empty string yields 0.
func snippetLetterRatio(s string) float64 {
	alnum, total := 0, 0
	for _, r := range s {
		total++
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			alnum++
		}
	}
	if total == 0 {
		return 0
	}
	return float64(alnum) / float64(total)
}
|
|
|
|
// SnippetLooksLowQuality reports whether a stored snippet should be recomputed from bodies.
|
|
func SnippetLooksLowQuality(snippet string) bool {
|
|
snippet = strings.TrimSpace(snippet)
|
|
if snippet == "" {
|
|
return true
|
|
}
|
|
return isSnippetBoilerplate(snippet) ||
|
|
looksLikeCSSSnippet(snippet) ||
|
|
snippetHTMLTagRE.MatchString(snippet) ||
|
|
hasUndecodedHTMLEntities(snippet)
|
|
}
|