package imap

import (
	"mime"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/htmlindex"
	"golang.org/x/text/transform"
)

// charsetFromContentType pulls the charset parameter out of a Content-Type
// header value. It yields "" when the header is empty, when
// mime.ParseMediaType rejects it, or when no charset parameter is present.
// Surrounding whitespace and double quotes are stripped from the value.
func charsetFromContentType(contentType string) string {
	if contentType == "" {
		return ""
	}
	_, params, err := mime.ParseMediaType(contentType)
	if err != nil {
		return ""
	}
	name := strings.TrimSpace(params["charset"])
	return strings.Trim(name, `"`)
}

// isUTF8Charset reports whether charset names UTF-8. The comparison ignores
// case and surrounding whitespace; an empty name is also treated as UTF-8.
func isUTF8Charset(charset string) bool {
	normalized := strings.ToLower(strings.TrimSpace(charset))
	if normalized == "" {
		return true
	}
	return normalized == "utf-8" || normalized == "utf8" || normalized == "unicode-1-1-utf-8"
}

// decodeBodyBytesToUTF8 turns a MIME part payload into a UTF-8 string, guided
// by the charset declared in contentType.
//
// Order of attempts:
//  1. A declared non-UTF-8 charset is decoded with decodeBytesWithCharset; a
//     non-empty result is returned as is.
//  2. Otherwise, bytes that are already valid UTF-8 are returned unchanged.
//  3. Anything else goes through repairRawBytesToUTF8.
//
// Empty input yields "".
func decodeBodyBytesToUTF8(data []byte, contentType string) string {
	if len(data) == 0 {
		return ""
	}
	if declared := charsetFromContentType(contentType); declared != "" && !isUTF8Charset(declared) {
		if text := decodeBytesWithCharset(data, declared); text != "" {
			return text
		}
	}
	if !utf8.Valid(data) {
		return repairRawBytesToUTF8(data)
	}
	return string(data)
}

// decodeBytesWithCharset decodes data using the encoding that htmlindex
// associates with charset. It returns "" when the charset is not known to
// htmlindex, when decoding reports an error, or when the decoded bytes are not
// valid UTF-8; callers treat "" as "could not decode".
func decodeBytesWithCharset(data []byte, charset string) string {
	enc, err := htmlindex.Get(charset)
	if err != nil || enc == nil {
		return ""
	}
	out, err := enc.NewDecoder().Bytes(data)
	switch {
	case err != nil:
		return ""
	case !utf8.Valid(out):
		return ""
	}
	return string(out)
}

// repairRawBytesToUTF8 fixes bodies stored without charset conversion
// (Latin-1 / Windows-1252).
//
// Empty input yields "" and valid UTF-8 is returned untouched. Otherwise the
// bytes are decoded as Windows-1252 and then as ISO-8859-1, in that order; the
// first result that decodes without error, is valid UTF-8 and is accepted by
// isMostlyReadableText wins. If neither candidate qualifies, the invalid byte
// sequences are removed from the input.
func repairRawBytesToUTF8(data []byte) string {
	switch {
	case len(data) == 0:
		return ""
	case utf8.Valid(data):
		return string(data)
	}

	candidates := []encoding.Encoding{charmap.Windows1252, charmap.ISO8859_1}
	for i := range candidates {
		converted, _, err := transform.Bytes(candidates[i].NewDecoder(), data)
		if err != nil {
			continue
		}
		// isMostlyReadableText is declared outside this section; it acts as
		// the plausibility gate for a candidate decoding.
		if !utf8.Valid(converted) || !isMostlyReadableText(converted) {
			continue
		}
		return string(converted)
	}

	// Last resort: an empty replacement makes ToValidUTF8 drop invalid runs.
	return strings.ToValidUTF8(string(data), "")
}

// repairLegacyCharsetString fixes text already loaded as a Go string with invalid UTF-8 bytes.
func repairLegacyCharsetString(s string) string { if s == "" || utf8.ValidString(s) { return s } return repairRawBytesToUTF8([]byte(s)) }