ultisuite-backend/internal/mail/imap/body_repair.go
R3D347HR4Y e6a04fdd31
Some checks failed
CI / Go tests (push) Has been cancelled
CI / Integration tests (push) Has been cancelled
CI / DB migrations (push) Has been cancelled
feat(mail): implement UTF-8 mojibake repair functionality
- Added repairUTF8Mojibake function to fix UTF-8 text misread as Latin-1, addressing common encoding issues in email bodies.
- Enhanced RepairStoredBodies and RepairSnippetWithBodies functions to utilize the new mojibake repair logic.
- Introduced unit tests for mojibake repair functionality to ensure accurate text restoration.
- Updated charset handling in repairLegacyCharsetString to incorporate mojibake repair, improving overall text processing reliability.
2026-06-18 11:11:36 +02:00

199 lines
4.9 KiB
Go

package imap
import (
	"io"
	"mime/quotedprintable"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/ultisuite/ulti-backend/internal/mail/sanitize"
)
// minBareBase64Len is the shortest input, in bytes, that
// decodeBareBase64IfNeeded and isLikelyBase64 treat as a base64 candidate;
// anything shorter is returned untouched / rejected.
const minBareBase64Len = 24
// RepairStoredBodies runs the stored plain-text and HTML bodies through the
// repair passes of this package and returns the repaired pair.
//
// The passes run in a fixed order: legacy charset repair, raw MIME
// re-parsing, bare quoted-printable decoding, bare base64 decoding,
// preheader padding removal (plain text only) and finally UTF-8 mojibake
// repair. Each pass is expected to hand its input back unchanged when it
// does not apply.
func RepairStoredBodies(text, html string) (string, string) {
	text, html = repairLegacyCharsetString(text), repairLegacyCharsetString(html)

	// A body that still holds a complete MIME entity is parsed again.
	text, html = repairRawMIME(text, html)

	// Undo transfer encodings that were stored without being decoded.
	text, html = decodeBareQuotedPrintableIfNeeded(text), decodeBareQuotedPrintableIfNeeded(html)
	text, html = decodeBareBase64IfNeeded(text), decodeBareBase64IfNeeded(html)

	// Only the plain-text body gets the padding strip.
	text = stripPlainTextPreheaderPadding(text)

	return repairUTF8Mojibake(text), repairUTF8Mojibake(html)
}
// repairRawMIME re-parses a body that was stored as a raw MIME entity and
// returns the text and HTML parts extracted from it.
//
// The body handed to parseBody is the one that actually looks like raw MIME
// (text wins when both do). Picking the source by emptiness instead would
// parse an ordinary, non-empty text body and leave a raw-MIME html body
// unrepaired.
//
// The inputs are returned unchanged when neither looks like raw MIME, when
// parsing yields nothing, or when the parsed parts still look like raw MIME.
// A body that did not look like raw MIME is kept whenever the parsed message
// has no replacement for it, so a healthy body is never discarded.
func repairRawMIME(text, html string) (string, string) {
	textIsMIME := looksLikeRawMIME(text)
	htmlIsMIME := looksLikeRawMIME(html)
	if !textIsMIME && !htmlIsMIME {
		return text, html
	}

	// Parse the body that triggered the repair, not merely the non-empty one.
	raw := text
	if !textIsMIME {
		raw = html
	}

	t, h := parseBody([]byte(raw))
	if t == "" && h == "" {
		return text, html
	}
	// Parsing did not get us out of the MIME envelope; keep what we had.
	if looksLikeRawMIME(t) || looksLikeRawMIME(h) {
		return text, html
	}

	// Do not drop a good sibling body just because the parsed entity lacks
	// that part.
	if t == "" && !textIsMIME {
		t = text
	}
	if h == "" && !htmlIsMIME {
		h = html
	}
	return t, h
}
// RepairSnippet fixes list/search previews stored as undecoded base64 or raw MIME.
// It is shorthand for RepairSnippetWithBodies(snippet, "", ""), i.e. the
// snippet is repaired on its own with no message bodies to rebuild it from.
func RepairSnippet(snippet string) string {
return RepairSnippetWithBodies(snippet, "", "")
}
// RepairSnippetWithBodies decodes a stored snippet and optionally rebuilds from bodies.
func RepairSnippetWithBodies(snippet, bodyText, bodyHTML string) string {
snippet = repairUTF8Mojibake(stripSnippetMarkup(snippet))
if decoded := decodeBareQuotedPrintableIfNeeded(snippet); decoded != snippet {
snippet = stripSnippetMarkup(decoded)
}
if decoded := decodeBareBase64IfNeeded(snippet); decoded != snippet {
snippet = stripSnippetMarkup(decoded)
}
snippet = stripSnippetMarkup(stripPlainTextPreheaderPadding(snippet))
if looksLikeRawMIME(snippet) {
t, h, ok := parseEmbeddedMIME([]byte(snippet))
if ok {
return SnippetFromBodies(t, h, 200)
}
}
bodyText, bodyHTML = RepairStoredBodies(bodyText, bodyHTML)
if bodyText != "" || bodyHTML != "" {
rebuilt := SnippetFromBodies(bodyText, bodyHTML, 200)
storedBad := snippet == "" || SnippetLooksLowQuality(snippet) || isSnippetBoilerplate(snippet)
if rebuilt != "" && (storedBad || snippetLineScore(rebuilt) > snippetLineScore(snippet)) {
return rebuilt
}
if storedBad {
return rebuilt
}
}
if snippet == "" {
return ""
}
if SnippetLooksLowQuality(snippet) {
return ""
}
return stripSnippetMarkup(snippet)
}
// stripPlainTextPreheaderPadding returns text after passing it through
// sanitize.StripInvisibleTextRuns.
// NOTE(review): going by the names, this removes the runs of invisible
// characters that mails append after their preheader — confirm against the
// sanitize package.
func stripPlainTextPreheaderPadding(text string) string {
return sanitize.StripInvisibleTextRuns(text)
}
// stripHTMLForSnippet reduces an HTML body to a single line of plain text
// suitable for a preview.
//
// The input first goes through sanitize.StripHiddenEmailHTML. Everything
// between '<' and '>' is then dropped, carriage returns and other control
// characters are removed, newlines become spaces, whitespace is collapsed to
// single spaces, and the result is passed through
// sanitize.StripInvisibleTextRuns. An empty input yields "".
func stripHTMLForSnippet(html string) string {
	if html == "" {
		return ""
	}

	var out strings.Builder
	insideTag := false
	for _, r := range sanitize.StripHiddenEmailHTML(html) {
		if r == '<' {
			insideTag = true
			continue
		}
		if r == '>' {
			insideTag = false
			continue
		}
		if insideTag || r == '\r' {
			continue
		}
		if r == '\n' {
			// A line break separates words; avoid stacking spaces.
			if sofar := out.String(); sofar != "" && !strings.HasSuffix(sofar, " ") {
				out.WriteByte(' ')
			}
			continue
		}
		if !unicode.IsControl(r) {
			out.WriteRune(r)
		}
	}

	collapsed := strings.Join(strings.Fields(out.String()), " ")
	return sanitize.StripInvisibleTextRuns(collapsed)
}
// decodeBareQuotedPrintableIfNeeded decodes s when it appears to be
// quoted-printable text that was stored without being decoded.
//
// s is returned unchanged when it is empty, does not look quoted-printable,
// fails to decode, decodes to nothing, or decodes to bytes that
// isMostlyReadableText rejects. Otherwise the decoded bytes are converted with
// decodeBodyBytesToUTF8, passing "" as its second argument.
func decodeBareQuotedPrintableIfNeeded(s string) string {
	if s == "" {
		return s
	}
	if !looksLikeQuotedPrintable(s) {
		return s
	}

	qp := quotedprintable.NewReader(strings.NewReader(s))
	payload, err := io.ReadAll(qp)
	if err != nil {
		return s
	}
	// Guard against false positives: the result has to look like text.
	if len(payload) == 0 || !isMostlyReadableText(payload) {
		return s
	}
	return decodeBodyBytesToUTF8(payload, "")
}
// looksLikeQuotedPrintable reports whether s carries tell-tale signs of
// quoted-printable encoding.
//
// Any one of these substrings is enough: a soft line break ("=" directly
// before CRLF or LF), an encoded equals sign ("=3D"), or an escape starting
// with the byte C3 or E2 followed by another escape ("=C3=", "=E2="). Failing
// that, s qualifies when qpHexSeqRE matches at least three times.
func looksLikeQuotedPrintable(s string) bool {
	markers := [...]string{"=\r\n", "=\n", "=3D", "=C3=", "=E2="}
	for _, marker := range markers {
		if strings.Contains(s, marker) {
			return true
		}
	}
	escapes := qpHexSeqRE.FindAllString(s, -1)
	return len(escapes) >= 3
}
// decodeBareBase64IfNeeded decodes s when it appears to be a base64 payload
// that was stored without being decoded.
//
// s is returned unchanged when it is empty, when its trimmed form is shorter
// than minBareBase64Len, when it fails isLikelyBase64 after whitespace is
// removed, when decoding fails or yields nothing, or when the decoded bytes
// are rejected by isMostlyReadableText. Otherwise the decoded bytes are
// converted with decodeBodyBytesToUTF8, passing "" as its second argument.
func decodeBareBase64IfNeeded(s string) string {
	if s == "" {
		return s
	}
	candidate := strings.TrimSpace(s)
	if len(candidate) < minBareBase64Len {
		return s
	}

	// Line wrapping inside the payload must not defeat the alphabet check.
	compact := stripBase64Whitespace(candidate)
	if !isLikelyBase64(compact) {
		return s
	}

	payload, err := decodeBase64Body([]byte(compact))
	if err != nil {
		return s
	}
	// Guard against false positives: the result has to look like text.
	if len(payload) == 0 || !isMostlyReadableText(payload) {
		return s
	}
	return decodeBodyBytesToUTF8(payload, "")
}
// stripBase64Whitespace returns s with every carriage return, line feed,
// space and tab removed. All other characters, including other Unicode
// whitespace, are kept.
func stripBase64Whitespace(s string) string {
	dropLayout := func(r rune) rune {
		if r == '\r' || r == '\n' || r == ' ' || r == '\t' {
			return -1 // a negative rune tells strings.Map to drop the character
		}
		return r
	}
	return strings.Map(dropLayout, s)
}
// isLikelyBase64 reports whether s is plausibly a standard base64 payload.
//
// s must be at least minBareBase64Len bytes long, have a length that is a
// multiple of four, and consist solely of characters from the standard base64
// alphabet plus '='. On top of that it must either contain padding or be at
// least 32 bytes long, which keeps short all-alphanumeric words from
// qualifying.
func isLikelyBase64(s string) bool {
	const alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"abcdefghijklmnopqrstuvwxyz" +
		"0123456789+/="

	size := len(s)
	if size < minBareBase64Len || size%4 != 0 {
		return false
	}
	outsideAlphabet := func(r rune) bool {
		return !strings.ContainsRune(alphabet, r)
	}
	if strings.IndexFunc(s, outsideAlphabet) >= 0 {
		return false
	}
	if strings.Contains(s, "=") {
		return true
	}
	return size >= 32
}
// isMostlyReadableText reports whether at least 85% of the bytes in b look
// like text. It is used to reject decode results that are really binary data.
//
// A byte counts as readable when it is:
//   - a tab, line feed, carriage return or printable ASCII character;
//   - part of a well-formed multi-byte UTF-8 sequence (every byte of the
//     sequence counts, not only the lead byte);
//   - an isolated byte >= 0xC0, which is how accented letters appear in
//     single-byte legacy charsets such as Latin-1.
//
// Counting whole UTF-8 sequences matters: if continuation bytes (0x80-0xBF)
// were treated as unreadable, valid text in scripts such as Cyrillic, Greek
// or CJK could never reach the threshold. An empty slice is not readable.
func isMostlyReadableText(b []byte) bool {
	if len(b) == 0 {
		return false
	}
	readable := 0
	for i := 0; i < len(b); {
		c := b[i]
		if c < utf8.RuneSelf {
			if c == '\n' || c == '\r' || c == '\t' || (c >= 32 && c < 127) {
				readable++
			}
			i++
			continue
		}
		r, size := utf8.DecodeRune(b[i:])
		if r == utf8.RuneError && size == 1 {
			// Not valid UTF-8 at this position: fall back to the single-byte
			// heuristic so legacy 8-bit text is still recognised.
			if c >= 0xc0 {
				readable++
			}
			i++
			continue
		}
		readable += size
		i += size
	}
	return float64(readable)/float64(len(b)) >= 0.85
}