package discovery

import (
	"html"
	"regexp"
	"strings"
	"unicode/utf8"
)

const (
	// maxBodyTailBytes bounds how much of a message body is inspected; only
	// the trailing bytes are kept because the signature sits at the end.
	maxBodyTailBytes = 8000

	// minSignatureLen and maxSignatureLen bound the byte length of an
	// acceptable signature candidate.
	minSignatureLen = 10
	maxSignatureLen = 2000

	// tailHeuristicLines is how many trailing lines are examined when the
	// body has no explicit signature delimiter.
	tailHeuristicLines = 8

	// minConfidence is the score below which a candidate is discarded.
	minConfidence = 0.35
)

var (
	// sigDelimiterRe matches a line consisting of "--" and optional
	// trailing whitespace.
	sigDelimiterRe = regexp.MustCompile(`(?m)^--\s*$`)

	// forwardedHeaderRe matches the banner that introduces a forwarded or
	// original message (English and French wordings).
	forwardedHeaderRe = regexp.MustCompile(`(?i)(?:^|\n)(?:-{5,}\s*)?(?:forwarded message|message transféré|message transmis|-----Original Message-----)`)

	// quotedReplyStartRe matches the attribution line that precedes quoted
	// reply content ("On … wrote:" and its French, German, Spanish and
	// Italian equivalents), an "Original Message" marker, or a rule made of
	// five or more underscores.
	quotedReplyStartRe = regexp.MustCompile(`(?is)(?:^|\n)\s*(?:` +
		`Le\s+.{4,200}?\s+a\s+écrit\s*:` +
		`|On\s+.{4,200}?\s+wrote\s*:` +
		`|Am\s+.{4,200}?\s+schrieb\s*:` +
		`|El\s+.{4,200}?\s+escribió\s*:` +
		`|Il\s+.{4,200}?\s+ha\s+scritto\s*:` +
		`|-----Original Message-----` +
		`|\n_{5,}` +
		`)`)

	// replyHeaderBlockRe matches a "From:/De:" line followed later by a
	// "Sent:/Envoyé:/Date:" line.
	replyHeaderBlockRe = regexp.MustCompile(`(?is)\n\s*(?:De|From)\s*:\s*.+\n\s*(?:Envoyé|Sent|Date)\s*:`)

	// replyHeaderInSigRe detects reply-header fragments inside a signature
	// candidate.
	replyHeaderInSigRe = regexp.MustCompile(`(?i)(?:\bwrote\s*:|\ba\s+écrit\s*:|-----Original|Envoyé\s*:|Sent\s*:|schrieb\s*:|escribió\s*:|ha\s+scritto\s*:)`)

	// phoneInSigRe matches a phone-number-like run of digits and separators.
	phoneInSigRe = regexp.MustCompile(`(?:\+?\d[\d\s().-]{7,}\d)`)

	// emailInSigRe matches an e-mail address.
	emailInSigRe = regexp.MustCompile(`(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`)

	// titleKeywordsRe matches job-title keywords.
	titleKeywordsRe = regexp.MustCompile(`(?i)(?:directeur|director|manager|ceo|cto|engineer|consultant|developer|president|founder|co-founder|chef|responsable)`)

	// styleBlockRe matches a <style> element together with its content.
	styleBlockRe = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)

	// scriptBlockRe matches a <script> element together with its content.
	scriptBlockRe = regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)

	// lineBreakTagRe matches <br>, <br/> and <br />.
	lineBreakTagRe = regexp.MustCompile(`(?i)<br\s*/?>`)

	// paragraphEndRe matches a closing </p> tag.
	paragraphEndRe = regexp.MustCompile(`(?i)</p>`)

	// anyTagRe matches any remaining tag.
	anyTagRe = regexp.MustCompile(`<[^>]+>`)

	// forwardedFromLineRe captures the address on a "From:", "De:" or
	// "Expéditeur:" line, with or without a display name and angle brackets.
	forwardedFromLineRe = regexp.MustCompile(`(?im)^(?:from|de|expéditeur)\s*:\s*(?:[^<\n]*<)?([a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,})>?`)
)

// tailBytes returns at most the last limit bytes of s. When the cut would
// land inside a multi-byte UTF-8 sequence, the start is moved forward to the
// next rune boundary so the result does not begin with a partial rune.
func tailBytes(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	start := len(s) - limit
	// A rune has at most UTFMax-1 continuation bytes; bounding the skip keeps
	// invalid input from being consumed wholesale.
	for i := 0; i < utf8.UTFMax-1 && start < len(s) && !utf8.RuneStart(s[start]); i++ {
		start++
	}
	return s[start:]
}

// stripHTMLTags converts the tail of an HTML fragment to plain text.
//
// Only the last maxBodyTailBytes bytes of s are considered. <style> and
// <script> elements are removed with their content, <br> and </p> become
// newlines, every other tag is dropped, and HTML entities are unescaped.
func stripHTMLTags(s string) string {
	s = tailBytes(s, maxBodyTailBytes)
	s = styleBlockRe.ReplaceAllString(s, "")
	s = scriptBlockRe.ReplaceAllString(s, "")
	s = lineBreakTagRe.ReplaceAllString(s, "\n")
	s = paragraphEndRe.ReplaceAllString(s, "\n")
	s = anyTagRe.ReplaceAllString(s, "")
	return html.UnescapeString(s)
}

// stripQuotedPrefixLines drops everything from the first ">"-prefixed line
// onwards and trims the remainder. The first two lines are never treated as
// the start of a quote. When no quoted line is found, raw is returned
// unchanged (not trimmed).
func stripQuotedPrefixLines(raw string) string {
	lines := strings.Split(raw, "\n")
	for i := 2; i < len(lines); i++ {
		if strings.HasPrefix(strings.TrimSpace(lines[i]), ">") {
			return strings.TrimSpace(strings.Join(lines[:i], "\n"))
		}
	}
	return raw
}

// stripQuotedReplyContent removes forwarded and quoted material so that only
// the sender's own text remains.
//
// A forwarded-message banner truncates the text wherever it appears. A
// reply attribution line or a reply header block truncates only when it
// starts at byte offset 15 or later, so a message that opens with such a
// line is not emptied. ">"-quoted lines are removed last.
func stripQuotedReplyContent(raw string) string {
	const minOwnTextBytes = 15

	if loc := forwardedHeaderRe.FindStringIndex(raw); loc != nil {
		raw = raw[:loc[0]]
	}
	for _, re := range []*regexp.Regexp{quotedReplyStartRe, replyHeaderBlockRe} {
		if loc := re.FindStringIndex(raw); loc != nil && loc[0] >= minOwnTextBytes {
			raw = raw[:loc[0]]
		}
	}
	return stripQuotedPrefixLines(raw)
}

// emailsInText returns the distinct e-mail addresses found in s, lower-cased
// and in order of first appearance. It returns nil when there are none.
func emailsInText(s string) []string {
	found := emailInSigRe.FindAllString(strings.ToLower(s), -1)
	if len(found) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(found))
	out := make([]string, 0, len(found))
	for _, addr := range found {
		addr = strings.TrimSpace(addr)
		if addr == "" {
			continue
		}
		if _, dup := seen[addr]; dup {
			continue
		}
		seen[addr] = struct{}{}
		out = append(out, addr)
	}
	return out
}

// signatureMatchesSender reports whether candidate plausibly belongs to the
// sender identified by senderEmail and displayName.
//
// A candidate is rejected when it contains a reply-header fragment, or when
// it contains any e-mail address other than the sender's. When displayName
// has at least two words, the candidate must also contain one of those words
// (of two or more bytes), a phone number, or an e-mail address. An empty
// senderEmail accepts any candidate that has no reply-header fragment.
func signatureMatchesSender(candidate, senderEmail, displayName string) bool {
	if replyHeaderInSigRe.MatchString(candidate) {
		return false
	}
	sender := strings.ToLower(strings.TrimSpace(senderEmail))
	if sender == "" {
		return true
	}

	emails := emailsInText(candidate)
	for _, addr := range emails {
		if addr != sender {
			return false
		}
	}

	nameTokens := strings.Fields(strings.ToLower(displayName))
	if len(nameTokens) < 2 {
		return true
	}
	candidateLower := strings.ToLower(candidate)
	for _, tok := range nameTokens {
		if len(tok) >= 2 && strings.Contains(candidateLower, tok) {
			return true
		}
	}
	// No part of the name appears: accept only if contact details do.
	return phoneInSigRe.MatchString(candidate) || len(emails) > 0
}

// extractSignature extracts the sender's signature from a message body.
//
// bodyText is preferred; bodyHTML is converted to text only when bodyText is
// blank. Quoted and forwarded content is removed first. The text after the
// last "--" delimiter line is taken as the signature (confidence 0.7). If
// there is no delimiter and the body has more than four lines, the last
// tailHeuristicLines lines are used when they contain a phone number, an
// e-mail address or a job-title keyword (confidence 0.55). The confidence is
// multiplied by 0.6 when the candidate mentions neither the local part of
// senderEmail nor any e-mail address or phone number.
//
// It returns ("", 0) when no candidate is found, the candidate is shorter
// than minSignatureLen or longer than maxSignatureLen bytes, it does not
// match the sender, or the confidence falls below minConfidence.
func extractSignature(bodyText, bodyHTML, senderEmail, displayName string) (text string, confidence float64) {
	raw := tailBytes(strings.TrimSpace(bodyText), maxBodyTailBytes)
	if raw == "" && bodyHTML != "" {
		raw = stripHTMLTags(bodyHTML)
	}
	if raw == "" {
		return "", 0
	}

	raw = strings.TrimSpace(stripQuotedReplyContent(raw))
	if raw == "" {
		return "", 0
	}

	candidate := raw
	confidence = 0.3
	if parts := sigDelimiterRe.Split(raw, -1); len(parts) > 1 {
		candidate = parts[len(parts)-1]
		confidence = 0.7
	} else if lines := strings.Split(raw, "\n"); len(lines) > 4 {
		start := len(lines) - tailHeuristicLines
		if start < 0 {
			start = 0
		}
		tail := strings.Join(lines[start:], "\n")
		if phoneInSigRe.MatchString(tail) || emailInSigRe.MatchString(tail) || titleKeywordsRe.MatchString(tail) {
			candidate = tail
			confidence = 0.55
		}
	}

	candidate = strings.TrimSpace(candidate)
	if len(candidate) < minSignatureLen || len(candidate) > maxSignatureLen {
		return "", 0
	}
	if !signatureMatchesSender(candidate, senderEmail, displayName) {
		return "", 0
	}

	if senderLower := strings.ToLower(strings.TrimSpace(senderEmail)); senderLower != "" {
		local := senderLower
		if at := strings.IndexByte(senderLower, '@'); at >= 0 {
			local = senderLower[:at]
		}
		mentionsSender := strings.Contains(strings.ToLower(candidate), local)
		hasContact := emailInSigRe.MatchString(candidate) || phoneInSigRe.MatchString(candidate)
		if local != "" && !mentionsSender && !hasContact {
			confidence *= 0.6
		}
	}

	if confidence < minConfidence {
		return "", 0
	}
	return candidate, confidence
}

// detectForwardedAddresses returns the distinct, lower-cased e-mail
// addresses found on "From:", "De:" or "Expéditeur:" lines of the body, in
// order of first appearance. bodyHTML is converted to text and used only
// when bodyText is empty. It returns nil when nothing is found.
func detectForwardedAddresses(bodyText, bodyHTML string) []string {
	raw := bodyText
	if raw == "" && bodyHTML != "" {
		raw = stripHTMLTags(bodyHTML)
	}
	if raw == "" {
		return nil
	}

	var out []string
	seen := map[string]struct{}{}
	for _, m := range forwardedFromLineRe.FindAllStringSubmatch(raw, -1) {
		addr := strings.ToLower(strings.TrimSpace(m[1]))
		if addr == "" {
			continue
		}
		if _, dup := seen[addr]; dup {
			continue
		}
		seen[addr] = struct{}{}
		out = append(out, addr)
	}
	return out
}