- Added repairUTF8Mojibake function to fix UTF-8 text misread as Latin-1, addressing common encoding issues in email bodies. - Enhanced RepairStoredBodies and RepairSnippetWithBodies functions to utilize the new mojibake repair logic. - Introduced unit tests for mojibake repair functionality to ensure accurate text restoration. - Updated charset handling in repairLegacyCharsetString to incorporate mojibake repair, improving overall text processing reliability.
160 lines
3.7 KiB
Go
160 lines
3.7 KiB
Go
package imap
|
|
|
|
import (
|
|
"mime"
|
|
"regexp"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/encoding"
|
|
"golang.org/x/text/encoding/charmap"
|
|
"golang.org/x/text/encoding/htmlindex"
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// utf8MojibakeRE matches the signature left behind when UTF-8 text is decoded
// as Latin-1: a lead character U+00C2 (Â) or U+00C3 (Ã) immediately followed by
// a character in U+0080–U+00BF, e.g. "Ã©" (U+00C3 U+00A9) in "rÃ©activitÃ©".
// In Go regular expressions the \xNN escapes denote code points, not raw bytes.
var utf8MojibakeRE = regexp.MustCompile(`[\xC2\xC3][\x80-\xBF]`)
|
|
|
|
// charsetFromContentType extracts the charset parameter from a Content-Type
// header value. It returns "" when the value is empty, cannot be parsed as a
// media type, or carries no charset parameter. Surrounding whitespace and
// double quotes are stripped from the result.
func charsetFromContentType(contentType string) string {
	if contentType != "" {
		if _, mediaParams, parseErr := mime.ParseMediaType(contentType); parseErr == nil {
			value := strings.TrimSpace(mediaParams["charset"])
			return strings.Trim(value, `"`)
		}
	}
	return ""
}
|
|
|
|
// isUTF8Charset reports whether charset names UTF-8. The comparison ignores
// case and surrounding whitespace. An empty charset is treated as UTF-8.
func isUTF8Charset(charset string) bool {
	name := strings.ToLower(strings.TrimSpace(charset))
	for _, alias := range [...]string{"", "utf-8", "utf8", "unicode-1-1-utf-8"} {
		if name == alias {
			return true
		}
	}
	return false
}
|
|
|
|
// decodeBodyBytesToUTF8 converts a MIME part payload to UTF-8 using Content-Type charset.
|
|
func decodeBodyBytesToUTF8(data []byte, contentType string) string {
|
|
if len(data) == 0 {
|
|
return ""
|
|
}
|
|
charset := charsetFromContentType(contentType)
|
|
if charset != "" && !isUTF8Charset(charset) {
|
|
if decoded := decodeBytesWithCharset(data, charset); decoded != "" {
|
|
return decoded
|
|
}
|
|
}
|
|
if utf8.Valid(data) {
|
|
return string(data)
|
|
}
|
|
return repairRawBytesToUTF8(data)
|
|
}
|
|
|
|
func decodeBytesWithCharset(data []byte, charset string) string {
|
|
enc, err := htmlindex.Get(charset)
|
|
if err != nil || enc == nil {
|
|
return ""
|
|
}
|
|
decoded, err := enc.NewDecoder().Bytes(data)
|
|
if err != nil || !utf8.Valid(decoded) {
|
|
return ""
|
|
}
|
|
return string(decoded)
|
|
}
|
|
|
|
// repairRawBytesToUTF8 fixes bodies stored without charset conversion (Latin-1 / Windows-1252).
|
|
func repairRawBytesToUTF8(data []byte) string {
|
|
if len(data) == 0 {
|
|
return ""
|
|
}
|
|
if utf8.Valid(data) {
|
|
return string(data)
|
|
}
|
|
for _, enc := range []encoding.Encoding{charmap.Windows1252, charmap.ISO8859_1} {
|
|
decoded, _, err := transform.Bytes(enc.NewDecoder(), data)
|
|
if err == nil && utf8.Valid(decoded) && isMostlyReadableText(decoded) {
|
|
return string(decoded)
|
|
}
|
|
}
|
|
return strings.ToValidUTF8(string(data), "")
|
|
}
|
|
|
|
func looksLikeUTF8Mojibake(s string) bool {
|
|
return utf8MojibakeRE.MatchString(s)
|
|
}
|
|
|
|
// repairUTF8Mojibake fixes UTF-8 text misread as Latin-1 (e.g. "réactivité" → "réactivité").
|
|
// Repairs pair-by-pair so mixed/corrupted sequences (e.g. NBSP → space in "Déjà") still partially fix.
|
|
func repairUTF8Mojibake(s string) string {
|
|
if s == "" || !looksLikeUTF8Mojibake(s) {
|
|
return s
|
|
}
|
|
runes := []rune(s)
|
|
var b strings.Builder
|
|
b.Grow(len(s))
|
|
for i := 0; i < len(runes); i++ {
|
|
r := runes[i]
|
|
if (r == 0xC2 || r == 0xC3) && i+1 < len(runes) {
|
|
next := runes[i+1]
|
|
if next >= 0x80 && next <= 0xBF {
|
|
seq := []byte{byte(r), byte(next)}
|
|
if utf8.Valid(seq) {
|
|
decoded, _ := utf8.DecodeRune(seq)
|
|
b.WriteRune(decoded)
|
|
i++
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
b.WriteRune(r)
|
|
}
|
|
out := b.String()
|
|
if out == s {
|
|
return s
|
|
}
|
|
return repairLoneMojibakeLeaders(out)
|
|
}
|
|
|
|
func repairLoneMojibakeLeaders(s string) string {
|
|
runes := []rune(s)
|
|
var b strings.Builder
|
|
b.Grow(len(s))
|
|
for i := 0; i < len(runes); i++ {
|
|
r := runes[i]
|
|
if r == 0xC3 || r == 0xC2 {
|
|
if i+1 < len(runes) && isLoneMojibakeLeaderBoundary(runes[i+1]) {
|
|
if r == 0xC3 {
|
|
b.WriteRune('à')
|
|
} else {
|
|
b.WriteRune('Â')
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
b.WriteRune(r)
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
// isLoneMojibakeLeaderBoundary reports whether r is a space, a tab, or one of
// the punctuation marks , . ; : ! ?
func isLoneMojibakeLeaderBoundary(r rune) bool {
	const boundaries = " \t,.;:!?"
	return strings.ContainsRune(boundaries, r)
}
|
|
|
|
// repairLegacyCharsetString fixes text already loaded as a Go string with invalid UTF-8 bytes.
|
|
func repairLegacyCharsetString(s string) string {
|
|
if s == "" {
|
|
return s
|
|
}
|
|
if !utf8.ValidString(s) {
|
|
s = repairRawBytesToUTF8([]byte(s))
|
|
}
|
|
return repairUTF8Mojibake(s)
|
|
}
|