ultisuite-backend/internal/mail/imap/charset.go
R3D347HR4Y e6a04fdd31
Some checks failed
CI / Go tests (push) Has been cancelled
CI / Integration tests (push) Has been cancelled
CI / DB migrations (push) Has been cancelled
feat(mail): implement UTF-8 mojibake repair functionality
- Added repairUTF8Mojibake function to fix UTF-8 text misread as Latin-1, addressing common encoding issues in email bodies.
- Enhanced RepairStoredBodies and RepairSnippetWithBodies functions to utilize the new mojibake repair logic.
- Introduced unit tests for mojibake repair functionality to ensure accurate text restoration.
- Updated charset handling in repairLegacyCharsetString to incorporate mojibake repair, improving overall text processing reliability.
2026-06-18 11:11:36 +02:00

160 lines
3.7 KiB
Go

package imap
import (
"mime"
"regexp"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/htmlindex"
"golang.org/x/text/transform"
)
// utf8MojibakeRE matches the signature of UTF-8 text that was misread as
// Latin-1: the character U+00C2 ('Â') or U+00C3 ('Ã') immediately followed by
// a character in the range U+0080–U+00BF. Example: "réactivité" shows up as
// "rÃ©activitÃ©" (U+00C3 U+00A9 …).
var utf8MojibakeRE = regexp.MustCompile(`[\xC2\xC3][\x80-\xBF]`)
// charsetFromContentType extracts the charset parameter from a Content-Type
// header value. It returns "" when the value is empty, cannot be parsed by
// mime.ParseMediaType, or carries no charset parameter. Surrounding whitespace
// and any leftover double quotes are stripped from the result.
func charsetFromContentType(contentType string) string {
	if contentType != "" {
		if _, params, err := mime.ParseMediaType(contentType); err == nil {
			raw := strings.TrimSpace(params["charset"])
			return strings.Trim(raw, `"`)
		}
	}
	return ""
}
// isUTF8Charset reports whether charset names UTF-8. The comparison ignores
// case and surrounding whitespace; an empty (or all-whitespace) name is
// treated as UTF-8.
func isUTF8Charset(charset string) bool {
	name := strings.ToLower(strings.TrimSpace(charset))
	if name == "" {
		return true
	}
	return name == "utf-8" || name == "utf8" || name == "unicode-1-1-utf-8"
}
// decodeBodyBytesToUTF8 turns a MIME part payload into a UTF-8 string.
//
// When contentType declares a non-UTF-8 charset, the payload is decoded with
// that charset first. If no such charset is declared, or that decode yields
// nothing, the payload is returned unchanged when it is already valid UTF-8
// and otherwise handed to repairRawBytesToUTF8. Empty input yields "".
func decodeBodyBytesToUTF8(data []byte, contentType string) string {
	if len(data) == 0 {
		return ""
	}
	if cs := charsetFromContentType(contentType); cs != "" && !isUTF8Charset(cs) {
		if text := decodeBytesWithCharset(data, cs); text != "" {
			return text
		}
	}
	if !utf8.Valid(data) {
		return repairRawBytesToUTF8(data)
	}
	return string(data)
}
// decodeBytesWithCharset decodes data from the named charset into UTF-8.
// The charset is resolved through htmlindex.Get. It returns "" when the
// charset is unknown, when decoding reports an error, or when the decoded
// bytes are not valid UTF-8.
func decodeBytesWithCharset(data []byte, charset string) string {
	enc, lookupErr := htmlindex.Get(charset)
	if lookupErr != nil || enc == nil {
		return ""
	}
	out, decodeErr := enc.NewDecoder().Bytes(data)
	if decodeErr != nil {
		return ""
	}
	if !utf8.Valid(out) {
		return ""
	}
	return string(out)
}
// repairRawBytesToUTF8 fixes bodies that were stored without charset
// conversion (Latin-1 / Windows-1252).
//
// Valid UTF-8 input is returned as-is. Otherwise the bytes are decoded as
// Windows-1252 and then as ISO-8859-1, and the first result that is valid
// UTF-8 and passes isMostlyReadableText wins. If neither qualifies, the
// invalid byte sequences are simply dropped from the input.
func repairRawBytesToUTF8(data []byte) string {
	switch {
	case len(data) == 0:
		return ""
	case utf8.Valid(data):
		return string(data)
	}
	fallbacks := [...]encoding.Encoding{charmap.Windows1252, charmap.ISO8859_1}
	for i := range fallbacks {
		text, _, err := transform.Bytes(fallbacks[i].NewDecoder(), data)
		if err != nil || !utf8.Valid(text) || !isMostlyReadableText(text) {
			continue
		}
		return string(text)
	}
	return strings.ToValidUTF8(string(data), "")
}
// looksLikeUTF8Mojibake reports whether s contains at least one match of
// utf8MojibakeRE, i.e. a 'Â'/'Ã' character directly followed by a character
// in U+0080–U+00BF.
func looksLikeUTF8Mojibake(s string) bool {
	matched := utf8MojibakeRE.MatchString(s)
	return matched
}
// repairUTF8Mojibake fixes UTF-8 text that was misread as Latin-1
// (e.g. "rÃ©activitÃ©" → "réactivité").
//
// The repair works pair by pair: every 'Â'/'Ã' (U+00C2/U+00C3) followed by a
// character in U+0080–U+00BF is treated as the two bytes of one UTF-8
// sequence and replaced by the character they encode. Because each pair is
// handled independently, text with mixed or partly corrupted sequences is
// still partially repaired. When at least one pair was replaced, the result
// is additionally passed through repairLoneMojibakeLeaders.
//
// Strings that are empty or show no mojibake signature are returned unchanged.
func repairUTF8Mojibake(s string) string {
	if len(s) == 0 || !looksLikeUTF8Mojibake(s) {
		return s
	}

	chars := []rune(s)

	// decodePair reports the character encoded by chars[pos] and chars[pos+1]
	// when those two form a misread two-byte UTF-8 sequence.
	decodePair := func(pos int) (rune, bool) {
		if pos+1 >= len(chars) {
			return 0, false
		}
		lead, trail := chars[pos], chars[pos+1]
		if lead != 0xC2 && lead != 0xC3 {
			return 0, false
		}
		if trail < 0x80 || trail > 0xBF {
			return 0, false
		}
		pair := []byte{byte(lead), byte(trail)}
		if !utf8.Valid(pair) {
			return 0, false
		}
		merged, _ := utf8.DecodeRune(pair)
		return merged, true
	}

	var fixed strings.Builder
	fixed.Grow(len(s))
	for pos := 0; pos < len(chars); pos++ {
		if merged, ok := decodePair(pos); ok {
			fixed.WriteRune(merged)
			pos++ // the continuation character has been consumed as well
			continue
		}
		fixed.WriteRune(chars[pos])
	}

	repaired := fixed.String()
	if repaired == s {
		return s
	}
	return repairLoneMojibakeLeaders(repaired)
}
// repairLoneMojibakeLeaders cleans up mojibake leader characters ('Ã' U+00C3
// and 'Â' U+00C2) whose continuation character was lost, which happens when
// the continuation was a no-break space (byte 0xA0) that got normalised to a
// plain space or stripped.
//
// A leader counts as "lone" when the character right after it is a boundary
// (see isLoneMojibakeLeaderBoundary). The boundary character itself is always
// kept.
//
//   - Lone 'Ã' becomes 'à' (bytes C3 A0), e.g. "DéjÃ vu" → "Déjà vu".
//   - Lone 'Â' is dropped: it was the first half of a no-break space
//     (bytes C2 A0), so the boundary that follows already stands in for the
//     whitespace, e.g. "BonjourÂ !" → "Bonjour !".
//
// Leaders followed by any other character, or sitting at the very end of the
// string, are left untouched.
func repairLoneMojibakeLeaders(s string) string {
	runes := []rune(s)
	var b strings.Builder
	b.Grow(len(s))
	for i, r := range runes {
		lone := (r == 0xC3 || r == 0xC2) &&
			i+1 < len(runes) &&
			isLoneMojibakeLeaderBoundary(runes[i+1])
		switch {
		case lone && r == 0xC3:
			b.WriteRune('\u00E0') // 'à'
		case lone:
			// Stray 'Â' left over from a broken no-break space: emit nothing.
			// Re-emitting 'Â' here would leave the artifact in the text.
		default:
			b.WriteRune(r)
		}
	}
	return b.String()
}

// isLoneMojibakeLeaderBoundary reports whether r is one of the characters
// (space, tab, or common punctuation) that may directly follow a mojibake
// leader whose continuation character went missing.
func isLoneMojibakeLeaderBoundary(r rune) bool {
	switch r {
	case ' ', '\t', ',', '.', ';', ':', '!', '?':
		return true
	default:
		return false
	}
}
// repairLegacyCharsetString fixes text that was already loaded as a Go string
// but may hold invalid UTF-8 bytes. Invalid input is first converted with
// repairRawBytesToUTF8; the result (or the original string when it was
// already valid) is then passed through repairUTF8Mojibake. The empty string
// is returned unchanged.
func repairLegacyCharsetString(s string) string {
	if len(s) == 0 {
		return s
	}
	text := s
	if !utf8.ValidString(text) {
		text = repairRawBytesToUTF8([]byte(text))
	}
	return repairUTF8Mojibake(text)
}