- Improved the `RepairSnippetWithBodies` function to streamline snippet rebuilding logic and reduce redundancy. - Introduced new utility functions for stripping CSS noise and decoding HTML entities in snippets. - Enhanced boilerplate detection to better identify low-quality snippets, including legal footers and view-in-browser prompts. - Added comprehensive tests for new functionality and edge cases in snippet processing.
197 lines
4.8 KiB
Go
197 lines
4.8 KiB
Go
package imap
|
|
|
|
import (
	"io"
	"mime/quotedprintable"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/ultisuite/ulti-backend/internal/mail/sanitize"
)
|
|
|
|
// minBareBase64Len is the shortest input, in bytes, that the bare-base64
// helpers in this file will consider: decodeBareBase64IfNeeded leaves shorter
// (trimmed) strings untouched and isLikelyBase64 rejects shorter candidates.
const minBareBase64Len = 24
|
|
|
|
// RepairStoredBodies fixes bodies stored as raw MIME, quoted-printable, or base64.
|
|
func RepairStoredBodies(text, html string) (string, string) {
|
|
text = repairLegacyCharsetString(text)
|
|
html = repairLegacyCharsetString(html)
|
|
text, html = repairRawMIME(text, html)
|
|
text = decodeBareQuotedPrintableIfNeeded(text)
|
|
html = decodeBareQuotedPrintableIfNeeded(html)
|
|
text = decodeBareBase64IfNeeded(text)
|
|
html = decodeBareBase64IfNeeded(html)
|
|
text = stripPlainTextPreheaderPadding(text)
|
|
return text, html
|
|
}
|
|
|
|
func repairRawMIME(text, html string) (string, string) {
|
|
if !looksLikeRawMIME(text) && !looksLikeRawMIME(html) {
|
|
return text, html
|
|
}
|
|
raw := text
|
|
if raw == "" {
|
|
raw = html
|
|
}
|
|
t, h := parseBody([]byte(raw))
|
|
if t == "" && h == "" {
|
|
return text, html
|
|
}
|
|
if looksLikeRawMIME(t) || looksLikeRawMIME(h) {
|
|
return text, html
|
|
}
|
|
return t, h
|
|
}
|
|
|
|
// RepairSnippet fixes list/search previews stored as undecoded base64 or raw MIME.
|
|
func RepairSnippet(snippet string) string {
|
|
return RepairSnippetWithBodies(snippet, "", "")
|
|
}
|
|
|
|
// RepairSnippetWithBodies decodes a stored snippet and optionally rebuilds from bodies.
|
|
func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string {
|
|
snippet = stripSnippetMarkup(snippet)
|
|
if decoded := decodeBareQuotedPrintableIfNeeded(snippet); decoded != snippet {
|
|
snippet = stripSnippetMarkup(decoded)
|
|
}
|
|
if decoded := decodeBareBase64IfNeeded(snippet); decoded != snippet {
|
|
snippet = stripSnippetMarkup(decoded)
|
|
}
|
|
snippet = stripSnippetMarkup(stripPlainTextPreheaderPadding(snippet))
|
|
if looksLikeRawMIME(snippet) {
|
|
t, h, ok := parseEmbeddedMIME([]byte(snippet))
|
|
if ok {
|
|
return SnippetFromBodies(t, h, 200)
|
|
}
|
|
}
|
|
bodyText, bodyHTML = RepairStoredBodies(bodyText, bodyHTML)
|
|
if bodyText != "" || bodyHTML != "" {
|
|
rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200)
|
|
storedBad := snippet == "" || SnippetLooksLowQuality(snippet) || isSnippetBoilerplate(snippet)
|
|
if rebuilt != "" && (storedBad || snippetLineScore(rebuilt) > snippetLineScore(snippet)) {
|
|
return rebuilt
|
|
}
|
|
if storedBad {
|
|
return rebuilt
|
|
}
|
|
}
|
|
if snippet == "" {
|
|
return ""
|
|
}
|
|
if SnippetLooksLowQuality(snippet) {
|
|
return ""
|
|
}
|
|
return stripSnippetMarkup(snippet)
|
|
}
|
|
|
|
func stripPlainTextPreheaderPadding(text string) string {
|
|
return sanitize.StripInvisibleTextRuns(text)
|
|
}
|
|
|
|
func stripHTMLForSnippet(html string) string {
|
|
if html == "" {
|
|
return ""
|
|
}
|
|
html = sanitize.StripHiddenEmailHTML(html)
|
|
var b strings.Builder
|
|
inTag := false
|
|
for _, r := range html {
|
|
switch {
|
|
case r == '<':
|
|
inTag = true
|
|
case r == '>':
|
|
inTag = false
|
|
case !inTag && r != '\r':
|
|
if r == '\n' {
|
|
if b.Len() > 0 && b.String()[b.Len()-1] != ' ' {
|
|
b.WriteRune(' ')
|
|
}
|
|
} else if !unicode.IsControl(r) {
|
|
b.WriteRune(r)
|
|
}
|
|
}
|
|
}
|
|
return sanitize.StripInvisibleTextRuns(strings.Join(strings.Fields(b.String()), " "))
|
|
}
|
|
|
|
func decodeBareQuotedPrintableIfNeeded(s string) string {
|
|
if s == "" || !looksLikeQuotedPrintable(s) {
|
|
return s
|
|
}
|
|
decoded, err := io.ReadAll(quotedprintable.NewReader(strings.NewReader(s)))
|
|
if err != nil || len(decoded) == 0 || !isMostlyReadableText(decoded) {
|
|
return s
|
|
}
|
|
return decodeBodyBytesToUTF8(decoded, "")
|
|
}
|
|
|
|
func looksLikeQuotedPrintable(s string) bool {
|
|
if strings.Contains(s, "=\r\n") || strings.Contains(s, "=\n") {
|
|
return true
|
|
}
|
|
if strings.Contains(s, "=3D") || strings.Contains(s, "=C3=") || strings.Contains(s, "=E2=") {
|
|
return true
|
|
}
|
|
return len(qpHexSeqRE.FindAllString(s, -1)) >= 3
|
|
}
|
|
|
|
func decodeBareBase64IfNeeded(s string) string {
|
|
if s == "" {
|
|
return s
|
|
}
|
|
trimmed := strings.TrimSpace(s)
|
|
if len(trimmed) < minBareBase64Len {
|
|
return s
|
|
}
|
|
clean := stripBase64Whitespace(trimmed)
|
|
if !isLikelyBase64(clean) {
|
|
return s
|
|
}
|
|
decoded, err := decodeBase64Body([]byte(clean))
|
|
if err != nil || len(decoded) == 0 || !isMostlyReadableText(decoded) {
|
|
return s
|
|
}
|
|
return decodeBodyBytesToUTF8(decoded, "")
|
|
}
|
|
|
|
// stripBase64Whitespace returns s with every carriage return, line feed,
// space and tab removed. All other runes are kept in their original order.
func stripBase64Whitespace(s string) string {
	dropFolding := func(r rune) rune {
		if r == '\r' || r == '\n' || r == ' ' || r == '\t' {
			return -1 // a negative result tells strings.Map to drop the rune
		}
		return r
	}
	return strings.Map(dropFolding, s)
}
|
|
|
|
func isLikelyBase64(s string) bool {
|
|
if len(s) < minBareBase64Len || len(s)%4 != 0 {
|
|
return false
|
|
}
|
|
for _, r := range s {
|
|
switch {
|
|
case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '+', r == '/', r == '=':
|
|
continue
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
return strings.Contains(s, "=") || len(s) >= 32
|
|
}
|
|
|
|
// isMostlyReadableText reports whether at least 85% of b looks like
// human-readable text rather than binary data. An empty slice is never
// readable.
//
// Valid UTF-8 input is judged per rune: tab, CR, LF and every graphic rune
// (unicode.IsGraphic) count as readable. Judging per rune matters for
// non-Latin scripts, whose characters take two or more bytes each: a
// byte-by-byte count treats every continuation byte (0x80-0xBF) as unreadable
// and would reject, for example, Cyrillic or CJK text outright.
//
// Input that is not valid UTF-8 is judged per byte instead, which suits
// legacy single-byte charsets: tab, CR, LF, printable ASCII and bytes >= 0xC0
// count as readable.
func isMostlyReadableText(b []byte) bool {
	if len(b) == 0 {
		return false
	}

	const minReadableRatio = 0.85

	if !utf8.Valid(b) {
		readable := 0
		for _, c := range b {
			if c == '\n' || c == '\r' || c == '\t' || (c >= 32 && c < 127) || c >= 0xc0 {
				readable++
			}
		}
		return float64(readable)/float64(len(b)) >= minReadableRatio
	}

	readable, total := 0, 0
	for rest := b; len(rest) > 0; {
		r, size := utf8.DecodeRune(rest)
		rest = rest[size:]
		total++
		if r == '\n' || r == '\r' || r == '\t' || unicode.IsGraphic(r) {
			readable++
		}
	}
	return float64(readable)/float64(total) >= minReadableRatio
}
|