ultisuite-backend/internal/mail/imap/charset.go

package imap

import (
	"mime"
	"regexp"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/htmlindex"
	"golang.org/x/text/transform"
)

// utf8MojibakeRE matches the two-rune pattern left behind when a two-byte
// UTF-8 sequence is misread as Latin-1: "rÃ©activitÃ©" (U+00C3 U+00A9 …).
// Go's regexp engine matches code points, so \xC2 and \xC3 stand for the runes
// U+00C2 (Â) and U+00C3 (Ã), and \x80-\xBF for the rune range U+0080..U+00BF,
// not for raw bytes.
var utf8MojibakeRE = regexp.MustCompile(`[\xC2\xC3][\x80-\xBF]`)

// charsetFromContentType extracts the charset parameter from a Content-Type
// header value. It returns "" when the header is empty or carries no charset.
//
// The header is parsed strictly with mime.ParseMediaType first. That parser
// rejects the whole value when any parameter is malformed (for example a
// parameter without "=value", or conflicting duplicates), which would discard
// an otherwise readable charset. In that case a lenient scan of the
// ";"-separated parameters is used so the declared charset is still honoured.
func charsetFromContentType(contentType string) string {
	if contentType == "" {
		return ""
	}
	if _, params, err := mime.ParseMediaType(contentType); err == nil {
		return strings.Trim(strings.TrimSpace(params["charset"]), `"`)
	}
	// Best-effort fallback: skip the media type itself (first segment) and
	// return the first non-empty charset=... parameter.
	for _, param := range strings.Split(contentType, ";")[1:] {
		eq := strings.IndexByte(param, '=')
		if eq < 0 || !strings.EqualFold(strings.TrimSpace(param[:eq]), "charset") {
			continue
		}
		if cs := strings.Trim(strings.TrimSpace(param[eq+1:]), `"`); cs != "" {
			return cs
		}
	}
	return ""
}

// isUTF8Charset reports whether charset names UTF-8. The comparison ignores
// case and surrounding whitespace; an empty (missing) charset also counts as
// UTF-8.
func isUTF8Charset(charset string) bool {
	label := strings.ToLower(strings.TrimSpace(charset))
	if label == "" {
		return true
	}
	return label == "utf-8" || label == "utf8" || label == "unicode-1-1-utf-8"
}

// decodeBodyBytesToUTF8 turns a MIME part payload into a UTF-8 string.
//
// When contentType declares a non-UTF-8 charset, the payload is decoded with
// that charset and a non-empty result is returned. Otherwise (no charset, a
// UTF-8 charset, or a failed decode) the bytes are returned as-is if they are
// already valid UTF-8, and passed to repairRawBytesToUTF8 if they are not.
// Empty input yields "".
func decodeBodyBytesToUTF8(data []byte, contentType string) string {
	if len(data) == 0 {
		return ""
	}
	if label := charsetFromContentType(contentType); label != "" && !isUTF8Charset(label) {
		if text := decodeBytesWithCharset(data, label); text != "" {
			return text
		}
	}
	if !utf8.Valid(data) {
		return repairRawBytesToUTF8(data)
	}
	return string(data)
}

// decodeBytesWithCharset decodes data from the named charset into UTF-8.
// The charset label is resolved through htmlindex.Get. It returns "" when the
// label is unknown, when decoding reports an error, or when the decoded bytes
// are not valid UTF-8.
func decodeBytesWithCharset(data []byte, charset string) string {
	codec, lookupErr := htmlindex.Get(charset)
	if lookupErr != nil || codec == nil {
		return ""
	}
	out, decodeErr := codec.NewDecoder().Bytes(data)
	if decodeErr == nil && utf8.Valid(out) {
		return string(out)
	}
	return ""
}

// repairRawBytesToUTF8 recovers text from bodies that were stored without
// charset conversion (Latin-1 / Windows-1252).
//
// Valid UTF-8 input is returned unchanged and empty input yields "". Otherwise
// the bytes are decoded as Windows-1252 and then as ISO-8859-1; the first
// result that decodes without error, is valid UTF-8 and passes
// isMostlyReadableText wins. If neither candidate is accepted, the invalid
// byte sequences are simply dropped from the input.
func repairRawBytesToUTF8(data []byte) string {
	switch {
	case len(data) == 0:
		return ""
	case utf8.Valid(data):
		return string(data)
	}
	candidates := [...]encoding.Encoding{charmap.Windows1252, charmap.ISO8859_1}
	for i := range candidates {
		converted, _, err := transform.Bytes(candidates[i].NewDecoder(), data)
		if err != nil || !utf8.Valid(converted) || !isMostlyReadableText(converted) {
			continue
		}
		return string(converted)
	}
	return strings.ToValidUTF8(string(data), "")
}

// looksLikeUTF8Mojibake reports whether s contains at least one occurrence of
// the pattern described by utf8MojibakeRE.
func looksLikeUTF8Mojibake(s string) bool {
	return utf8MojibakeRE.FindStringIndex(s) != nil
}

// repairUTF8Mojibake undoes UTF-8 text that was misread as Latin-1
// (e.g. "rÃ©activitÃ©" → "réactivité").
//
// The string is walked rune by rune. Each leader rune (U+00C2 or U+00C3)
// directly followed by a rune in U+0080..U+00BF is treated as the two bytes of
// one UTF-8 sequence and replaced by the rune they encode. Working pair by
// pair means mixed or partly corrupted input (e.g. NBSP → space in "Déjà") is
// still partially repaired. If any pair was repaired, leftover lone leaders
// are then handled by repairLoneMojibakeLeaders. Input without the mojibake
// pattern is returned unchanged.
func repairUTF8Mojibake(s string) string {
	if s == "" || !looksLikeUTF8Mojibake(s) {
		return s
	}
	chars := []rune(s)
	var out strings.Builder
	out.Grow(len(s))
	for pos := 0; pos < len(chars); pos++ {
		lead := chars[pos]
		if pos+1 < len(chars) && (lead == 0xC2 || lead == 0xC3) {
			if trail := chars[pos+1]; trail >= 0x80 && trail <= 0xBF {
				// Reinterpret the two Latin-1 code points as raw UTF-8 bytes.
				pair := [2]byte{byte(lead), byte(trail)}
				if fixed, width := utf8.DecodeRune(pair[:]); width == len(pair) {
					out.WriteRune(fixed)
					pos++ // the trailing rune has been consumed as well
					continue
				}
			}
		}
		out.WriteRune(lead)
	}
	repaired := out.String()
	if repaired != s {
		return repairLoneMojibakeLeaders(repaired)
	}
	return s
}

// repairLoneMojibakeLeaders rewrites leader runes (U+00C2, U+00C3) that are
// immediately followed by a boundary character, as decided by
// isLoneMojibakeLeaderBoundary. A lone U+00C3 becomes 'à'; a lone U+00C2 is
// written back as 'Â'. The boundary character itself is kept, and a leader at
// the very end of the string is left untouched.
func repairLoneMojibakeLeaders(s string) string {
	chars := []rune(s)
	last := len(chars) - 1
	var out strings.Builder
	out.Grow(len(s))
	for idx, ch := range chars {
		emit := ch
		isLeader := ch == 0xC2 || ch == 0xC3
		if isLeader && idx < last && isLoneMojibakeLeaderBoundary(chars[idx+1]) {
			emit = '\u00C2' // Â: same code point as the 0xC2 leader, so it is kept as-is
			if ch == 0xC3 {
				emit = '\u00E0' // à
			}
		}
		out.WriteRune(emit)
	}
	return out.String()
}

// isLoneMojibakeLeaderBoundary reports whether r is one of the characters
// treated as a boundary after a lone leader: space, tab, or one of the
// punctuation marks , . ; : ! ?
func isLoneMojibakeLeaderBoundary(r rune) bool {
	const boundaries = " \t,.;:!?"
	return strings.ContainsRune(boundaries, r)
}

// repairLegacyCharsetString fixes text already loaded as a Go string with invalid UTF-8 bytes.
func repairLegacyCharsetString(s string) string {
	if s == "" {
		return s
	}
	if !utf8.ValidString(s) {
		s = repairRawBytesToUTF8([]byte(s))
	}
	return repairUTF8Mojibake(s)
}