ultisuite-backend/internal/mail/sanitize/preheader.go
2026-06-04 00:12:11 +02:00

148 lines
3.8 KiB
Go

package sanitize
import (
"bytes"
"regexp"
"strings"
"golang.org/x/net/html"
)
// StripHiddenEmailHTML parses raw as HTML, drops every element flagged by
// shouldStripHiddenElement (invisible preheader / preview blocks common in
// marketing mail) and returns the re-rendered document.
// It must run before bluemonday, which strips display:none styles and would
// otherwise expose the padding text. If parsing or rendering fails, the
// regex-based fallback is applied to raw instead.
func StripHiddenEmailHTML(raw string) string {
	if len(raw) == 0 {
		return raw
	}

	root, parseErr := html.Parse(strings.NewReader(raw))
	if parseErr != nil {
		return stripHiddenEmailHTMLRegex(raw)
	}

	// Depth-first scan with an explicit stack. A flagged element is recorded
	// and its subtree is skipped, so no recorded node sits inside another.
	var hidden []*html.Node
	pending := []*html.Node{root}
	for len(pending) > 0 {
		last := len(pending) - 1
		node := pending[last]
		pending = pending[:last]

		if node.Type == html.ElementNode && shouldStripHiddenElement(node) {
			hidden = append(hidden, node)
			continue
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			pending = append(pending, child)
		}
	}

	// Detach after the scan so sibling links stay intact while traversing.
	for _, node := range hidden {
		if parent := node.Parent; parent != nil {
			parent.RemoveChild(node)
		}
	}

	var out bytes.Buffer
	if renderErr := html.Render(&out, root); renderErr != nil {
		return stripHiddenEmailHTMLRegex(raw)
	}
	return out.String()
}
// shouldStripHiddenElement reports whether n is an element that should be
// dropped as hidden content: it carries the hidden attribute,
// aria-hidden="true", a known preheader class name, or an inline style that
// hides it.
func shouldStripHiddenElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	// hidden is a boolean attribute: the bare form <div hidden> carries an
	// empty value, so test for presence rather than for a non-empty value.
	for _, a := range n.Attr {
		if strings.EqualFold(a.Key, "hidden") {
			return true
		}
	}
	if strings.EqualFold(attrVal(n, "aria-hidden"), "true") {
		return true
	}
	class := strings.ToLower(attrVal(n, "class"))
	for _, marker := range []string{"mcnpreviewtext", "preheader", "preview-text"} {
		if strings.Contains(class, marker) {
			return true
		}
	}
	hides, zeroFont := hiddenInlineStyle(attrVal(n, "style"))
	if hides {
		return true
	}
	// A zero font-size only marks hidden text when the element has no real
	// child elements; otherwise it is treated as a layout wrapper.
	return zeroFont && !hasSignificantChildElements(n)
}

// hiddenInlineStyle inspects the declarations of an inline style attribute.
// hides is true when a declaration hides the element outright (display:none,
// mso-hide:all, visibility:hidden, a zero max-height or a zero opacity).
// zeroFont is true when font-size is set to zero. Each value is compared as a
// whole, so opacity:0.5 or font-size:0.9em do not count as zero.
func hiddenInlineStyle(style string) (hides, zeroFont bool) {
	for _, decl := range strings.Split(strings.ToLower(style), ";") {
		colon := strings.IndexByte(decl, ':')
		if colon < 0 {
			continue
		}
		prop := strings.TrimSpace(decl[:colon])
		val := decl[colon+1:]
		// Ignore a trailing !important flag.
		if bang := strings.IndexByte(val, '!'); bang >= 0 {
			val = val[:bang]
		}
		val = strings.TrimSpace(val)

		switch prop {
		case "display":
			hides = hides || val == "none"
		case "mso-hide":
			hides = hides || val == "all"
		case "visibility":
			hides = hides || val == "hidden"
		case "max-height", "opacity":
			hides = hides || isZeroCSSValue(val)
		case "font-size":
			zeroFont = zeroFont || isZeroCSSValue(val)
		}
	}
	return hides, zeroFont
}

// isZeroCSSValue reports whether v (already lower-cased and trimmed) is a
// numeric zero with an optional sign and an optional unit suffix, such as
// "0", "0px", "0.0em" or "0%".
func isZeroCSSValue(v string) bool {
	// Drop the unit, then the sign, leaving only the numeric part.
	num := strings.TrimRightFunc(v, func(r rune) bool {
		return r == '%' || (r >= 'a' && r <= 'z')
	})
	num = strings.TrimLeft(num, "+-")
	if num == "" || strings.Count(num, ".") > 1 {
		return false
	}
	sawZero := false
	for _, r := range num {
		switch r {
		case '0':
			sawZero = true
		case '.':
			// decimal point is allowed
		default:
			return false
		}
	}
	return sawZero
}
// hasSignificantChildElements reports whether n has at least one direct child
// element other than the trivial void elements br, wbr and hr. A font-size:0
// parent that holds real child elements is a layout wrapper (inline-block
// whitespace collapse) rather than hidden preheader text.
func hasSignificantChildElements(n *html.Node) bool {
	child := n.FirstChild
	for child != nil {
		if child.Type == html.ElementNode {
			switch strings.ToLower(child.Data) {
			case "br", "wbr", "hr":
				// Trivial void element: keep looking.
			default:
				return true
			}
		}
		child = child.NextSibling
	}
	return false
}
// attrVal returns the value of the first attribute of n whose key equals key,
// compared case-insensitively. It returns "" when no attribute matches, so a
// missing attribute and one with an empty value look the same to the caller.
func attrVal(n *html.Node, key string) string {
	for i := 0; i < len(n.Attr); i++ {
		attr := n.Attr[i]
		if !strings.EqualFold(attr.Key, key) {
			continue
		}
		return attr.Val
	}
	return ""
}
// hiddenEmailHTMLPatterns holds the fallback patterns, compiled once at
// package initialisation rather than on every call.
var hiddenEmailHTMLPatterns = []*regexp.Regexp{
	// <span> whose double-quoted class attribute contains mcnPreviewText.
	regexp.MustCompile(`(?is)<span[^>]*class="[^"]*mcnPreviewText[^"]*"[^>]*>.*?</span>`),
	// <div> whose double-quoted style attribute contains display:none.
	regexp.MustCompile(`(?is)<div[^>]*style="[^"]*display\s*:\s*none[^"]*"[^>]*>.*?</div>`),
}

// stripHiddenEmailHTMLRegex is the regex-based fallback used when the HTML
// cannot be parsed or rendered. It deletes each match of the patterns above,
// case-insensitively and across newlines, and returns the remaining text.
// Matching is non-greedy up to the first closing tag, so nested elements of
// the same name are not balanced.
func stripHiddenEmailHTMLRegex(raw string) string {
	out := raw
	for _, re := range hiddenEmailHTMLPatterns {
		out = re.ReplaceAllString(out, "")
	}
	return out
}
// isInvisiblePaddingRune reports whether r is one of the code points this
// package treats as invisible padding: U+034F, U+200B through U+200F, U+FEFF,
// U+00A0 and U+2007.
func isInvisiblePaddingRune(r rune) bool {
	switch r {
	case '\u034f', '\u200b', '\u200c', '\u200d', '\u200e', '\u200f', '\ufeff', '\u00a0', '\u2007':
		return true
	default:
		return false
	}
}

// StripInvisibleTextRuns removes repeated invisible Unicode padding from plain text.
// Zero-width padding runes are deleted. No-break space (U+00A0) and figure
// space (U+2007) become a regular space so the words they separate are not
// joined together. Whitespace inside each line is then collapsed to single
// spaces and the whole result is trimmed.
// Line breaks are preserved so reply quotes stay splittable in the UI.
func StripInvisibleTextRuns(s string) string {
	if s == "" {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for _, r := range s {
		switch {
		case r == '\u00a0' || r == '\u2007':
			// These occupy visible width between words; keep a separator
			// and let the per-line collapse below fold runs of them.
			b.WriteByte(' ')
		case isInvisiblePaddingRune(r):
			// Zero-width padding: drop.
		default:
			b.WriteRune(r)
		}
	}
	lines := strings.Split(b.String(), "\n")
	for i, line := range lines {
		lines[i] = strings.Join(strings.Fields(line), " ")
	}
	return strings.TrimSpace(strings.Join(lines, "\n"))
}