- Added repairUTF8Mojibake function to fix UTF-8 text misread as Latin-1, addressing common encoding issues in email bodies.
- Enhanced RepairStoredBodies and RepairSnippetWithBodies functions to utilize the new mojibake repair logic.
- Introduced unit tests for mojibake repair functionality to ensure accurate text restoration.
- Updated charset handling in repairLegacyCharsetString to incorporate mojibake repair, improving overall text processing reliability.
85 lines
3.0 KiB
Go
85 lines
3.0 KiB
Go
package imap
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestParseBody_iso88591Charset(t *testing.T) {
|
|
body := []byte("Vous avez un rendez-vous programm\xe9.\r\nLien de la r\xe9union.")
|
|
var b strings.Builder
|
|
b.WriteString("From: calendar@google.com\r\n")
|
|
b.WriteString("To: user@example.com\r\n")
|
|
b.WriteString("Subject: Invitation\r\n")
|
|
b.WriteString("Content-Type: text/plain; charset=iso-8859-1\r\n")
|
|
b.WriteString("Content-Transfer-Encoding: 8bit\r\n")
|
|
b.WriteString("\r\n")
|
|
b.Write(body)
|
|
|
|
text, html := parseBody([]byte(b.String()))
|
|
if html != "" {
|
|
t.Fatalf("html = %q, want empty", html)
|
|
}
|
|
if !strings.Contains(text, "programmé") {
|
|
t.Fatalf("text = %q, want iso-8859-1 accents", text)
|
|
}
|
|
if !strings.Contains(text, "réunion") {
|
|
t.Fatalf("text = %q, want réunion", text)
|
|
}
|
|
}
|
|
|
|
func TestRepairUTF8Mojibake_doubleEncodedFrench(t *testing.T) {
|
|
raw := "Si elle bouge, tu perds en réactivité. La NEXOR a été pensée pour ça."
|
|
repaired := repairUTF8Mojibake(raw)
|
|
want := "Si elle bouge, tu perds en réactivité. La NEXOR a été pensée pour ça."
|
|
if repaired != want {
|
|
t.Fatalf("repaired = %q, want %q", repaired, want)
|
|
}
|
|
}
|
|
|
|
func TestRepairUTF8Mojibake_longMarketingBody(t *testing.T) {
|
|
raw := "Si elle bouge, tu perds en réactivité. La NEXOR a été pensée pour ça. Nylon ultra résistant, boucle rapide, compatible 100% F1 et TSI. Sobre, solide, zéro fioritures. Détache la sangle, passe-la dans les passants, c'est prêt. Déjà adoptée par +8 000 pompiers en Europe. Elle était en rupture. Elle est de retour."
|
|
repaired := repairUTF8Mojibake(raw)
|
|
if strings.Contains(repaired, "Ã") {
|
|
t.Fatalf("repaired still has mojibake: %q", repaired)
|
|
}
|
|
if !strings.Contains(repaired, "réactivité") || !strings.Contains(repaired, "Déjà") {
|
|
t.Fatalf("repaired = %q", repaired)
|
|
}
|
|
}
|
|
|
|
func TestRepairUTF8Mojibake_leavesValidUTF8Untouched(t *testing.T) {
|
|
raw := "Si elle bouge, tu perds en réactivité."
|
|
if got := repairUTF8Mojibake(raw); got != raw {
|
|
t.Fatalf("repaired = %q, want unchanged", got)
|
|
}
|
|
}
|
|
|
|
func TestRepairSnippetWithBodies_mojibakePreview(t *testing.T) {
|
|
stored := "Nylon ultra résistant, boucle rapide, zéro fioritures."
|
|
got := RepairSnippetWithBodies(stored, "", "")
|
|
if strings.Contains(got, "Ã") {
|
|
t.Fatalf("snippet = %q, want accents repaired", got)
|
|
}
|
|
if !strings.Contains(got, "résistant") || !strings.Contains(got, "zéro") {
|
|
t.Fatalf("snippet = %q", got)
|
|
}
|
|
}
|
|
|
|
func TestRepairLegacyCharsetString_latin1BytesInString(t *testing.T) {
|
|
// Simulates DB row stored before charset decode (raw Latin-1 bytes in text column).
|
|
raw := string([]byte{0x56, 0x6f, 0x75, 0x73, 0x20, 0x72, 0xe9, 0x75, 0x6e, 0x69, 0x6f, 0x6e})
|
|
repaired := repairLegacyCharsetString(raw)
|
|
if repaired != "Vous réunion" {
|
|
t.Fatalf("repaired = %q", repaired)
|
|
}
|
|
}
|
|
|
|
func TestRepairStoredBodies_legacyLatin1(t *testing.T) {
|
|
raw := string([]byte("programm\xe9"))
|
|
text, _ := RepairStoredBodies(raw, "")
|
|
if text != "programmé" {
|
|
t.Fatalf("text = %q", text)
|
|
}
|
|
}
|