C# 解析电子邮件地址字符串的最佳方法
因此,我正在处理一些电子邮件标题数据,对于收件人:、发件人:、抄送:、密件抄送:字段,电子邮件地址可以用多种不同的方式表示:
First Last <name@domain.com>
Last, First <name@domain.com>
name@domain.com
First-Last
最后,首先
name@domain.com
这些变体可以以任何顺序出现在同一条消息中,全部以逗号分隔的字符串显示:
First, Last <name@domain.com>, name@domain.com, First Last <name@domain.com>
第一个,最后一个,name@domain.com,第一个最后一个
我一直在试图找到一种方法,将这个字符串解析为每个人的单独的名字、姓氏和电子邮件(如果只提供了电子邮件地址,则省略名称)
有人能建议最好的方法吗
我试过按逗号拆分,除了第二个示例中姓氏在前的情况之外,这种做法都有效。我认为可以这样改进:拆分之后检查每个元素,看它是否包含“@”或“<”;如果不包含,就可以假设下一个元素是名字(first name)。这是一个好方法吗?我是否忽略了地址可能采用的其他格式?
更新:也许我应该澄清一点,基本上我所要做的就是将包含多个地址的字符串分解成单独的字符串,其中包含以任何格式发送的地址。我有自己的方法来验证和提取一个地址中的信息,但要找出分离每个地址的最佳方法对我来说很棘手
以下是我为实现这一目标而提出的解决方案:
// Splits a comma-separated address header into one string per address.
//
// A comma is treated as an address separator only when an '@' has been seen
// since the previous separator. This keeps the comma inside an unquoted
// "Last, First <name@domain.com>" display name attached to its address.
// Limitation: a quoted display name that itself contains '@' followed by a
// comma is still split by this heuristic.
String str = "Last, First <name@domain.com>, name@domain.com, First Last <name@domain.com>, \"First Last\" <name@domain.com>";
List<string> addresses = new List<string>();
int segmentStart = 0;   // index of the first character of the address being scanned
bool seenAt = false;    // true once the address being scanned has shown an '@'
for (int c = 0; c < str.Length; c++)
{
    if (str[c] == '@')
    {
        seenAt = true;
    }
    else if (str[c] == ',' && seenAt)
    {
        // The separator itself is excluded and surrounding blanks are trimmed.
        string temp = str.Substring(segmentStart, c - segmentStart).Trim();
        if (temp.Length > 0)
        {
            addresses.Add(temp);
        }
        segmentStart = c + 1;
        // Reset so that the comma of a following "Last, First" is not
        // mistaken for a separator.
        seenAt = false;
    }
}
// Whatever follows the last separator is the final address. This also covers
// input with no separator at all, so no extra "add the whole string" step is
// needed (that step used to add the input a second time).
string lastAddress = str.Substring(segmentStart).Trim();
if (lastAddress.Length > 0)
{
    addresses.Add(lastAddress);
}
// Splits a comma-separated address header into one string per address.
//
// A comma is treated as an address separator only when an '@' has been seen
// since the previous separator. This keeps the comma inside an unquoted
// "Last, First <name@domain.com>" display name attached to its address.
// Limitation: a quoted display name that itself contains '@' followed by a
// comma is still split by this heuristic.
String str = "Last, First <name@domain.com>, name@domain.com, First Last <name@domain.com>, \"First Last\" <name@domain.com>";
List<string> addresses = new List<string>();
int segmentStart = 0;   // index of the first character of the address being scanned
bool seenAt = false;    // true once the address being scanned has shown an '@'
for (int c = 0; c < str.Length; c++)
{
    if (str[c] == '@')
    {
        seenAt = true;
    }
    else if (str[c] == ',' && seenAt)
    {
        // The separator itself is excluded and surrounding blanks are trimmed.
        string temp = str.Substring(segmentStart, c - segmentStart).Trim();
        if (temp.Length > 0)
        {
            addresses.Add(temp);
        }
        segmentStart = c + 1;
        // Reset so that the comma of a following "Last, First" is not
        // mistaken for a separator.
        seenAt = false;
    }
}
// Whatever follows the last separator is the final address. This also covers
// input with no separator at all, so no extra "add the whole string" step is
// needed (that step used to add the input a second time).
string lastAddress = str.Substring(segmentStart).Trim();
if (lastAddress.Length > 0)
{
    addresses.Add(lastAddress);
}
String str=“Last,First,name@domain.com,First-Last,\“First-Last\”;
列表地址=新列表();
int-atIdx=0;
int commaIdx=0;
int lastcoma=0;
对于(int c=0;catIdx&&atIdx>0)
{
字符串temp=str.Substring(lastcoma,commaIdx-lastcoma);
地址。添加(临时);
lastcoma=commaIdx;
atIdx=commaIdx;
}
如果(c==str.Length-1)
{
字符串temp=str.Substring(lastcoma,str.Legth-lastcoma);
地址。添加(临时);
}
}
如果(commaIdx<2)
{
//如果我们到了这里,我们可以假设要么没有逗号,要么最后第一个组合中只有一个逗号
地址。添加(str);
}
上面的代码生成了我可以进一步处理的各个地址。这个问题并没有简单的解决方案。我建议写一个小型状态机,逐个字符地读取并据此进行解析。就像你说的,按逗号拆分并不总是有效。状态机可以让您覆盖所有可能的情况。我相信还有很多格式您还没有遇到过,例如:"First Last" <name@domain.com>
请查阅相关的 RFC,以了解所有可能的格式。抱歉,我不记得具体的 RFC 编号;可能不止一个,因为这类标准一直在演进。您可以尝试用正则表达式把各部分分离出来,试试下面这个:
^(?<name1>[a-zA-Z0-9]+?),? (?<name2>[a-zA-Z0-9]+?),? (?<address1>[a-zA-Z0-9.-_<>]+?)$
相关的正则表达式参考资料可以为您提供有关使用正则表达式匹配和验证电子邮件地址的更多信息。没有通用的简单解决方案。您需要的 RFC 是 RFC 2822,它描述了电子邮件地址的所有可能配置。最正确的做法是实现一个基于状态的标记器(tokenizer),并遵循 RFC 中指定的规则。以下是我的做法:
- 您可以尝试尽可能地标准化数据,即去掉 < 和 > 之类的符号,以及“.com”后面的所有逗号。用于分隔名字和姓氏的逗号则需要保留。
- 去掉多余的符号后,将每条分组好的电子邮件记录作为字符串放入列表中。如有需要,可以利用“.com”来确定在何处拆分字符串。
- 得到由字符串组成的电子邮件地址列表后,就可以仅用空白字符作为分隔符,进一步拆分每个电子邮件地址。
- 最后一步是确定哪个是名字、哪个是姓氏等。做法是检查这 3 个部分:带逗号的部分是姓氏;带“.”的部分是实际的邮件地址;剩下的就是名字。如果没有逗号,则名字在前,姓氏在后,依此类推。
我不知道这是否是最简洁的解决方案,但它可以工作,并且不需要任何高级编程技术
冒着产生两个问题的风险,您可以创建一个与任何电子邮件格式匹配的正则表达式。使用“|”分隔此正则表达式中的格式。然后您可以在输入字符串上运行该正则表达式并提取所有匹配项
/// <summary>
/// Read-only holder for the four pieces extracted from one e-mail address:
/// first name, last name, mailbox name and domain.
/// </summary>
public class Address
{
    /// <summary>Stores the four values exactly as given; no validation is performed.</summary>
    /// <param name="first">First name of the addressee.</param>
    /// <param name="last">Last name of the addressee.</param>
    /// <param name="name">Mailbox name (the part before the '@').</param>
    /// <param name="domain">Domain (the part after the '@').</param>
    public Address(string first, string last, string name, string domain)
    {
        First = first;
        Last = last;
        Name = name;
        Domain = domain;
    }

    /// <summary>First name as passed to the constructor.</summary>
    public string First { get; private set; }

    /// <summary>Last name as passed to the constructor.</summary>
    public string Last { get; private set; }

    /// <summary>Mailbox name as passed to the constructor.</summary>
    public string Name { get; private set; }

    /// <summary>Domain as passed to the constructor.</summary>
    public string Domain { get; private set; }
}
/// <summary>
/// Checks that one alternation regex can pull the "Last, First &lt;addr&gt;",
/// "First Last &lt;addr&gt;" and bare "addr" forms out of a single header string.
/// </summary>
[TestFixture]
public class RegexEmailTest
{
    [Test]
    public void TestThreeEmailAddresses()
    {
        // One alternative per supported format; .NET allows the same group
        // name to appear in several alternatives.
        const string lastFirstForm = @"((?<last>\w*), (?<first>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)";
        const string firstLastForm = @"((?<first>\w*) (?<last>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)";
        const string bareForm = @"((?<name>\w*)@(?<domain>\w*\.\w*))";
        Regex emailAddress = new Regex(lastFirstForm + "|" + firstLastForm + "|" + bareForm);

        string input = "First, Last <name@domain.com>, name@domain.com, First Last <name@domain.com>";

        List<Address> addresses = emailAddress.Matches(input)
            .Cast<Match>()
            .Select(m => new Address(
                m.Groups["first"].Value,
                m.Groups["last"].Value,
                m.Groups["name"].Value,
                m.Groups["domain"].Value))
            .ToList();

        Assert.AreEqual(3, addresses.Count);
        // "First, Last <...>" is matched by the "last, first" alternative, so
        // the word before the comma ends up in Last and the one after in First.
        AssertAddress(addresses[0], "Last", "First", "name", "domain.com");
        // A bare address leaves both name groups empty.
        AssertAddress(addresses[1], "", "", "name", "domain.com");
        AssertAddress(addresses[2], "First", "Last", "name", "domain.com");
    }

    // Compares every property of one parsed address with its expected value.
    private static void AssertAddress(Address actual, string first, string last, string name, string domain)
    {
        Assert.AreEqual(first, actual.First);
        Assert.AreEqual(last, actual.Last);
        Assert.AreEqual(name, actual.Name);
        Assert.AreEqual(domain, actual.Domain);
    }
}
公共类地址
{
私有字符串_优先;
私有字符串_last;
私有字符串\u名称;
私有字符串_域;
公共地址(字符串第一、字符串最后、字符串名称、字符串域)
{
_第一=第一;
_最后一个=最后一个;
_名称=名称;
_域=域;
}
公共字符串优先
{
获取{return\u first;}
}
最后一个公共字符串
{
获取{return\u last;}
}
公共字符串名
{
获取{return\u name;}
}
公共字符串域
{
获取{返回_域;}
}
}
[测试夹具]
公共类RegexEmailTest
{
[测试]
public void TestThreeEmailAddresses()
{
Regex emailAddress=新的Regex(
@((?\w*),(?\w*)|+
@“((?\w*)(?\w*)|”+
@“((?\w*)@(?\w*\。\w*)”;
string input=“First,Last,name@domain.com“第一,最后”;
MatchCollection matches=emailAddress.matches(输入);
列出地址=
(从比赛中的比赛)
选择新地址(
匹配.Groups[“first”].值,
[A-Za-z0-9]+[A-Za-z0-9._-]+@[A-Za-z0-9]+[A-Za-z0-9._-]+[.][A-Za-z0-9]{2,3}
/// <summary>
/// One e-mail address broken into first name, last name, mailbox name and domain.
/// Use <see cref="SplitEmailList"/> to parse a delimited list of addresses.
/// </summary>
public class ParsedEmail
{
    // Characters treated as a first/last separator inside the mailbox name
    // (first.last@domain.com, first_last@domain.com, ...).
    private static readonly char[] NameSeparators = { '.', '_', '+', '-' };

    // Built once instead of on every SplitEmailList call. Alternatives, in order:
    // "Last, First <name@domain>", "First Last <name@domain>", bare "name@domain".
    private static readonly Regex AddressPattern = new Regex(
        @"((?<last>\w*), (?<first>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
        @"((?<first>\w*) (?<last>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
        @"((?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*))");

    private readonly string _first;
    private readonly string _last;
    private readonly string _name;
    private readonly string _domain;

    /// <summary>
    /// Creates a parsed address. When both <paramref name="first"/> and
    /// <paramref name="last"/> are blank and the mailbox name contains one of
    /// '.', '_', '+', '-', the two names are derived by splitting the mailbox
    /// name at the first such character.
    /// </summary>
    /// <param name="first">First name taken from the display name; may be null or empty.</param>
    /// <param name="last">Last name taken from the display name; may be null or empty.</param>
    /// <param name="name">Mailbox name (the part before the '@').</param>
    /// <param name="domain">Domain (the part after the '@').</param>
    /// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
    public ParsedEmail(string first, string last, string name, string domain)
    {
        if (name == null)
        {
            throw new ArgumentNullException("name");
        }

        // These two assignments were missing, so names parsed from the
        // display name were silently discarded.
        _first = first;
        _last = last;
        _name = name;
        _domain = domain;

        // first.last@domain.com, first_last@domain.com etc. syntax
        var pos = _name.IndexOfAny(NameSeparators);
        if (string.IsNullOrWhiteSpace(_first) && string.IsNullOrWhiteSpace(_last) && pos > -1)
        {
            _first = _name.Substring(0, pos);
            _last = _name.Substring(pos + 1);
        }
    }

    /// <summary>First name, given explicitly or derived from the mailbox name.</summary>
    public string First
    {
        get { return _first; }
    }

    /// <summary>Last name, given explicitly or derived from the mailbox name.</summary>
    public string Last
    {
        get { return _last; }
    }

    /// <summary>Mailbox name (the part before the '@').</summary>
    public string Name
    {
        get { return _name; }
    }

    /// <summary>Domain (the part after the '@').</summary>
    public string Domain
    {
        get { return _domain; }
    }

    /// <summary>The address in "name@domain" form.</summary>
    public string Email
    {
        get { return Name + "@" + Domain; }
    }

    /// <summary>Returns <see cref="Email"/>.</summary>
    public override string ToString()
    {
        return Email;
    }

    /// <summary>
    /// Finds every address in <paramref name="delimList"/> that matches one of
    /// the three supported formats and returns them in order of appearance.
    /// Double quotes are stripped before matching; text that matches no format
    /// is skipped.
    /// </summary>
    /// <param name="delimList">Delimited list of addresses, e.g. a To: header value.</param>
    /// <returns>The parsed addresses; empty when nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="delimList"/> is null.</exception>
    public static IEnumerable<ParsedEmail> SplitEmailList(string delimList)
    {
        if (delimList == null)
        {
            throw new ArgumentNullException("delimList");
        }

        delimList = delimList.Replace("\"", string.Empty);
        MatchCollection matches = AddressPattern.Matches(delimList);
        var parsedEmails =
            (from Match match in matches
             select new ParsedEmail(
                 match.Groups["first"].Value,
                 match.Groups["last"].Value,
                 match.Groups["name"].Value,
                 match.Groups["domain"].Value)).ToList();
        return parsedEmails;
    }
}
/// <summary>
/// Splits <paramref name="emailString"/> into fields with
/// CSVProcessor.GetFieldsFromString, parses each field as a MailAddress and
/// returns the bare addresses joined by commas.
/// </summary>
/// <param name="emailString">Comma-separated list of e-mail addresses.</param>
/// <returns>The addresses without display names, comma-separated, with no trailing comma.</returns>
/// <remarks>
/// Any exception raised while constructing a MailAddress (i.e. a field that is
/// not an e-mail address) propagates to the caller unchanged.
/// </remarks>
private string GetProperlyFormattedEmailString(string emailString)
{
    string joined = "";
    foreach (var field in CSVProcessor.GetFieldsFromString(emailString))
    {
        // Only the address part is kept; the display name is dropped.
        joined = joined + new MailAddress(field).Address + ",";
    }

    // Remove the separator appended after the last address.
    return joined.TrimEnd(',');
}
/// <summary>
/// Reads the first record of <paramref name="csvString"/> as comma-delimited
/// fields using TextFieldParser.
/// </summary>
/// <param name="csvString">The text to split.</param>
/// <returns>
/// The fields of the first record. If the text is malformed, every double
/// quote is removed and parsing is retried, with each resulting field trimmed.
/// If that also fails, a one-element array holding the original text.
/// </returns>
public static string[] GetFieldsFromString(string csvString)
{
    using (var reader = new StringReader(csvString))
    using (var parser = new TextFieldParser(reader))
    {
        SetUpTextFieldParser(parser, FieldType.Delimited, new[] {","}, false, true);
        try
        {
            return parser.ReadFields();
        }
        catch (MalformedLineException)
        {
            // Assume the failure is caused by double quotes: strip them all
            // and parse whatever is left.
            var withoutQuotes = csvString.Replace("\"", "");
            using (var retryReader = new StringReader(withoutQuotes))
            using (var retryParser = new TextFieldParser(retryReader))
            {
                SetUpTextFieldParser(retryParser, FieldType.Delimited, new[] {","}, false, true);
                try
                {
                    var fields = retryParser.ReadFields();
                    return fields.Select(field => field.Trim()).ToArray();
                }
                catch (MalformedLineException)
                {
                    // Still unparseable: hand the input back as a single field.
                    return new string[] {csvString};
                }
            }
        }
    }
}
// Specification for CSVProcessor.GetFieldsFromString applied to a recipient
// header that mixes quoted display names, bracketed addresses and bare
// addresses. The expected values show that quotes are dropped and the text
// is split at every comma, including commas inside a quoted display name.
[Subject(typeof(CSVProcessor))]
public class when_processing_an_email_recipient_header
{
    // The five header fragments are joined with "," into one header value.
    static string recipientHeaderToParse1 = string.Join(",", new[]
    {
        @"""Lastname, Firstname"" <firstname_lastname@domain.com>",
        @"<testto@domain.com>, testto1@domain.com, testto2@domain.com",
        @"<testcc@domain.com>, test3@domain.com",
        @"""""Yes, this is valid""""@[emails are hard to parse!]",
        @"First, Last <name@domain.com>, name@domain.com, First Last <name@domain.com>"
    });

    static string[] results1;
    static string[] expectedResults1;

    Establish context = () => expectedResults1 = new[]
    {
        @"Lastname",
        @"Firstname <firstname_lastname@domain.com>",
        @"<testto@domain.com>",
        @"testto1@domain.com",
        @"testto2@domain.com",
        @"<testcc@domain.com>",
        @"test3@domain.com",
        @"Yes",
        @"this is valid@[emails are hard to parse!]",
        @"First",
        @"Last <name@domain.com>",
        @"name@domain.com",
        @"First Last <name@domain.com>"
    };

    Because of = () => results1 = CSVProcessor.GetFieldsFromString(recipientHeaderToParse1);

    It should_parse_the_email_parts_properly = () => results1.ShouldBeLike(expectedResults1);
}
/// <summary>
/// Splits <paramref name="addresses"/> at every COMMA that is not inside a
/// QUOTE-delimited section and returns the trimmed, non-empty pieces.
/// </summary>
/// <param name="addresses">The raw list of addresses.</param>
/// <returns>The individual address strings, in input order.</returns>
/// <exception cref="FormatException">A QUOTE was opened but never closed.</exception>
/// <remarks>
/// QUOTE and COMMA are constants declared outside this method (presumably
/// '"' and ',' - confirm against their definitions).
/// </remarks>
public List<string> SplitAddresses(string addresses)
{
    var pieces = new List<string>();
    var segmentStart = 0;
    var insideQuotes = false;

    for (var position = 0; position < addresses.Length; position++)
    {
        var current = addresses[position];

        if (current == QUOTE)
        {
            insideQuotes = !insideQuotes;
            continue;
        }

        // Only a comma outside a quoted section ends the current address.
        if (current != COMMA || insideQuotes)
        {
            continue;
        }

        var piece = GetAndCleanSubstring(addresses, segmentStart, position);
        if (piece.Length > 0)
        {
            pieces.Add(piece);
        }
        segmentStart = position + 1;
    }

    // Text after the last separator forms the final address.
    if (addresses.Length > segmentStart)
    {
        var tail = GetAndCleanSubstring(addresses, segmentStart, addresses.Length);
        if (tail.Length > 0)
        {
            pieces.Add(tail);
        }
    }

    if (insideQuotes)
    {
        throw new FormatException("Unclosed quote in email addresses");
    }

    return pieces;
}
/// <summary>
/// Returns the characters of <paramref name="addresses"/> from
/// <paramref name="startIndex"/> (inclusive) to <paramref name="currentIndex"/>
/// (exclusive), with leading and trailing white space removed.
/// </summary>
private string GetAndCleanSubstring(string addresses, int startIndex, int currentIndex)
{
    var length = currentIndex - startIndex;
    return addresses.Substring(startIndex, length).Trim();
}
/// <summary>
/// Parses a comma-separated list of e-mail addresses by calling the
/// non-public System.Net.Mail.MailAddressParser.ParseMultipleAddresses
/// through reflection.
/// </summary>
/// <param name="addresses">The address list to parse.</param>
/// <returns>The parsed addresses.</returns>
/// <exception cref="NotSupportedException">
/// The running framework does not contain the expected internal type or method.
/// </exception>
/// <remarks>
/// This relies on an implementation detail of the framework that may change
/// between versions. Errors raised by the parser itself surface wrapped in a
/// System.Reflection.TargetInvocationException.
/// </remarks>
private static IEnumerable<MailAddress> ParseAddress(string addresses)
{
    // Type.GetType with a bare type name only searches the calling assembly
    // and the core library, so it returns null for this type. Resolve it
    // through the assembly that defines MailAddress instead.
    var mailAddressParserClass = typeof(MailAddress).Assembly.GetType("System.Net.Mail.MailAddressParser");
    if (mailAddressParserClass == null)
    {
        throw new NotSupportedException("System.Net.Mail.MailAddressParser is not available on this framework.");
    }

    var parseMultipleAddressesMethod = mailAddressParserClass.GetMethod("ParseMultipleAddresses", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static);
    if (parseMultipleAddressesMethod == null)
    {
        throw new NotSupportedException("MailAddressParser.ParseMultipleAddresses is not available on this framework.");
    }

    // The input must be passed as the method's single argument; an empty
    // argument array makes Invoke fail with a parameter-count mismatch.
    return (IEnumerable<MailAddress>)parseMultipleAddressesMethod.Invoke(null, new object[] { addresses });
}
/// <summary>
/// Parses a comma-separated list of e-mail addresses with the framework's own
/// address-list parser.
/// </summary>
/// <param name="addresses">The address list to parse.</param>
/// <returns>A new list holding the parsed addresses.</returns>
/// <remarks>
/// MailMessage.To is a MailAddressCollection, so the collection is used
/// directly. This performs the same parsing without creating a MailMessage,
/// which is IDisposable and was previously never disposed.
/// </remarks>
private static IEnumerable<MailAddress> ParseAddress(string addresses)
{
    var collection = new MailAddressCollection();
    collection.Add(addresses);
    // Copy into a plain list so callers do not depend on the collection type.
    return new List<MailAddress>(collection);
}
/// <summary>
/// Splits <paramref name="field"/> at every comma and re-joins consecutive
/// pieces until one of them contains an '@'; each joined group is then parsed
/// as a single MailAddress. This keeps "Last, First &lt;addr&gt;" together.
/// </summary>
/// <param name="field">Raw value of an address header field.</param>
/// <returns>One MailAddress per group that contains an '@'.</returns>
/// <remarks>Trailing pieces that never reach an '@' are not returned.</remarks>
public List<MailAddress> ParseAddresses(string field)
{
    var parsed = new List<MailAddress>();
    var pending = new List<string>();

    foreach (var piece in field.Split(','))
    {
        pending.Add(piece);

        // Keep collecting pieces until the one holding the address shows up.
        if (piece.IndexOf("@", StringComparison.Ordinal) < 0)
        {
            continue;
        }

        parsed.Add(new MailAddress(string.Join(",", pending)));
        pending.Clear();
    }

    return parsed;
}