package imap

import (
	"mime"
	"regexp"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/htmlindex"
	"golang.org/x/text/transform"
)

// utf8MojibakeRE matches the rune U+00C2 or U+00C3 immediately followed by a
// rune in U+0080–U+00BF. That is what a two-byte UTF-8 sequence looks like once
// it has been misread as Latin-1: "é" (bytes C3 A9) shows up as "Ã©"
// (U+00C3 U+00A9), so "réactivité" reads "rÃ©activitÃ©".
var utf8MojibakeRE = regexp.MustCompile(`[\xC2\xC3][\x80-\xBF]`)

// charsetFromContentType extracts the charset parameter from a Content-Type
// header value, stripped of surrounding whitespace and double quotes.
//
// It returns "" when contentType is empty, when mime.ParseMediaType rejects it,
// or when no charset parameter is present.
func charsetFromContentType(contentType string) string {
	if len(contentType) == 0 {
		return ""
	}
	_, params, parseErr := mime.ParseMediaType(contentType)
	if parseErr != nil {
		return ""
	}
	// Indexing a missing key yields "", which passes through both trims.
	value := strings.TrimSpace(params["charset"])
	return strings.Trim(value, `"`)
}

// isUTF8Charset reports whether charset names UTF-8. The comparison ignores
// case and surrounding whitespace, and an empty name counts as UTF-8.
func isUTF8Charset(charset string) bool {
	name := strings.ToLower(strings.TrimSpace(charset))
	return name == "" || name == "utf-8" || name == "utf8" || name == "unicode-1-1-utf-8"
}

// decodeBodyBytesToUTF8 converts a MIME part payload to a UTF-8 string, guided
// by the charset declared in contentType.
//
// Steps, in order:
//  1. Empty data yields "".
//  2. If a non-UTF-8 charset is declared and decoding with it produces a
//     non-empty result, that result is returned.
//  3. Data that is already valid UTF-8 is returned unchanged.
//  4. Anything else is handed to repairRawBytesToUTF8.
func decodeBodyBytesToUTF8(data []byte, contentType string) string {
	if len(data) == 0 {
		return ""
	}

	// isUTF8Charset treats "" as UTF-8, so a missing charset skips this branch too.
	if declared := charsetFromContentType(contentType); !isUTF8Charset(declared) {
		if text := decodeBytesWithCharset(data, declared); text != "" {
			return text
		}
	}

	if !utf8.Valid(data) {
		return repairRawBytesToUTF8(data)
	}
	return string(data)
}

// decodeBytesWithCharset decodes data using the encoding that htmlindex.Get
// returns for charset.
//
// It returns "" when the charset is unknown, when decoding reports an error, or
// when the decoded bytes are not valid UTF-8; callers cannot tell those cases
// apart from a genuinely empty result.
func decodeBytesWithCharset(data []byte, charset string) string {
	enc, lookupErr := htmlindex.Get(charset)
	if lookupErr != nil || enc == nil {
		return ""
	}
	out, decodeErr := enc.NewDecoder().Bytes(data)
	if decodeErr == nil && utf8.Valid(out) {
		return string(out)
	}
	return ""
}

// repairRawBytesToUTF8 fixes bodies stored without charset conversion (Latin-1 / Windows-1252).
func repairRawBytesToUTF8(data []byte) string { if len(data) == 0 { return "" } if utf8.Valid(data) { return string(data) } for _, enc := range []encoding.Encoding{charmap.Windows1252, charmap.ISO8859_1} { decoded, _, err := transform.Bytes(enc.NewDecoder(), data) if err == nil && utf8.Valid(decoded) && isMostlyReadableText(decoded) { return string(decoded) } } return strings.ToValidUTF8(string(data), "") } func looksLikeUTF8Mojibake(s string) bool { return utf8MojibakeRE.MatchString(s) } // repairUTF8Mojibake fixes UTF-8 text misread as Latin-1 (e.g. "réactivité" → "réactivité"). // Repairs pair-by-pair so mixed/corrupted sequences (e.g. NBSP → space in "Déjà") still partially fix. func repairUTF8Mojibake(s string) string { if s == "" || !looksLikeUTF8Mojibake(s) { return s } runes := []rune(s) var b strings.Builder b.Grow(len(s)) for i := 0; i < len(runes); i++ { r := runes[i] if (r == 0xC2 || r == 0xC3) && i+1 < len(runes) { next := runes[i+1] if next >= 0x80 && next <= 0xBF { seq := []byte{byte(r), byte(next)} if utf8.Valid(seq) { decoded, _ := utf8.DecodeRune(seq) b.WriteRune(decoded) i++ continue } } } b.WriteRune(r) } out := b.String() if out == s { return s } return repairLoneMojibakeLeaders(out) } func repairLoneMojibakeLeaders(s string) string { runes := []rune(s) var b strings.Builder b.Grow(len(s)) for i := 0; i < len(runes); i++ { r := runes[i] if r == 0xC3 || r == 0xC2 { if i+1 < len(runes) && isLoneMojibakeLeaderBoundary(runes[i+1]) { if r == 0xC3 { b.WriteRune('à') } else { b.WriteRune('Â') } continue } } b.WriteRune(r) } return b.String() } func isLoneMojibakeLeaderBoundary(r rune) bool { switch r { case ' ', '\t', ',', '.', ';', ':', '!', '?': return true default: return false } } // repairLegacyCharsetString fixes text already loaded as a Go string with invalid UTF-8 bytes. func repairLegacyCharsetString(s string) string { if s == "" { return s } if !utf8.ValidString(s) { s = repairRawBytesToUTF8([]byte(s)) } return repairUTF8Mojibake(s) }