C# 处理任意分隔符/转义字符的最佳算法是什么?
我有点惊讶,网上没有这方面的信息,我一直发现问题比我想象的更棘手 规则如下:C# 处理任意分隔符/转义字符的最佳算法是什么?,c#,regex,algorithm,C#,Regex,Algorithm,我有点惊讶,网上没有这方面的信息,我一直发现问题比我想象的更棘手 规则如下: 您从分隔/转义数据开始,将其拆分为一个数组 分隔符是一个任意字符 转义字符是一个任意字符 分隔符和转义字符都可能出现在数据中 Regex很好,但是一个好的性能解决方案是最好的 编辑:可以忽略空元素(包括前导或结尾分隔符) 代码签名(基本上用C#表示) 问题最棘手的部分当然是转义的连续转义字符大小写,因为(调用/转义字符和分隔符):////,=// 我是否遗漏了在网络上或其他SO问题中处理的内容?如果没有,那就用你的大脑
我是否遗漏了在网络上或其他SO问题中处理的内容?如果没有,那就用你的大脑袋工作吧。。。我认为,为了公众利益,这个问题是一件好事。我自己也在研究,但还没有一个好的解决方案。您正在寻找类似“字符串标记器”的东西。我很快发现了一个类似的例子。或者看看。这种标记器在FSM中的实现相当简单 您确实需要做一些决定(例如,我如何处理前导分隔符?去除或发出空标记)
以下是一个抽象版本,它忽略前导和多个分隔符,并且不允许转义换行符:
state(input) action
========================
BEGIN(*): token.clear(); state=START;
END(*): return;
*(\n\0): token.emit(); state=END;
START(DELIMITER): ; // NB: the input is *not* added to the token!
START(ESCAPE): state=ESC; // NB: the input is *not* added to the token!
START(*): token.append(input); state=NORM;
NORM(DELIMITER): token.emit(); token.clear(); state=START;
NORM(ESCAPE): state=ESC; // NB: the input is *not* added to the token!
NORM(*): token.append(input);
ESC(*): token.append(input); state=NORM;
这种实现的优点是可以自然地处理连续的转义字符,并且可以很容易地进行扩展,以赋予更多转义序列特殊的含义(即添加一个规则,如
ESC(t): token.append(TAB);
)。简单的状态机通常是最简单、最快速的方法。Python中的示例:
def extract(input, delim, escape):
    """Split ``input`` on ``delim``, treating ``escape`` as an escape character.

    The character that follows ``escape`` is copied into the current token
    verbatim (so an escaped delimiter or an escaped escape character becomes
    ordinary data); the escape character itself is never copied.

    Empty tokens produced by consecutive or leading delimiters are kept, but
    an empty token at the very end of the input is dropped. An escape
    character that is the last character of the input is discarded.

    Returns the list of tokens in order of appearance.
    """
    found = []
    # Characters of the token being built; joined once per token so that
    # building a long token does not repeatedly copy a growing string.
    parsed = []
    escaped = False  # True when the previous character was an unescaped escape
    for c in input:
        if escaped:
            # Whatever follows the escape character is literal data.
            parsed.append(c)
            escaped = False
        elif c == delim:
            found.append("".join(parsed))
            parsed = []
        elif c == escape:
            escaped = True
        else:
            parsed.append(c)
    # Only a non-empty trailing token is emitted.
    last = "".join(parsed)
    if last:
        found.append(last)
    return found
void smartSplit(字符串常量和文本、字符delim、字符esc、向量和标记)
{
枚举状态{NORMAL,IN_ESC};
状态=正常;
弦框;
对于(size_t i=0;i……(原文代码在此处被截断)
这是我移植到 C# 的函数:
publicstaticvoidsmartsplit(字符串文本、字符delim、字符esc、参考列表listToBuild)
{
bool currentlyseived=false;
StringBuilder片段=新的StringBuilder();
for(int i=0;i0)
{
添加(fragment.ToString());
fragment.Remove(0,fragment.Length);
}
}
else如果(c==esc)
CurrentlyEscape=true;
其他的
片段。附加(c);
}
}
如果(fragment.Length>0)
{
添加(fragment.ToString());
}
}
希望这对将来的人有所帮助。感谢KenE为我指明了正确的方向。私有静态字符串[]拆分(字符串输入、字符分隔符、字符转义、bool removempty)
/// <summary>
/// Splits <paramref name="input"/> into tokens separated by <paramref name="delimiter"/>,
/// where <paramref name="escapeChar"/> may precede a delimiter or another escape character
/// to include that character literally in a token.
/// </summary>
/// <param name="input">The text to split. <c>null</c> yields an empty array.</param>
/// <param name="delimiter">The character that separates tokens.</param>
/// <param name="escapeChar">The character that escapes the delimiter or itself.</param>
/// <param name="removeEmpty">When <c>true</c>, empty tokens are left out of the result.</param>
/// <returns>The unescaped tokens, in order of appearance.</returns>
/// <exception cref="ArgumentException">
/// The input ends with an unpaired escape character, or an escape character is followed
/// by something other than the delimiter or the escape character.
/// </exception>
private static string[] Split(string input, char delimiter, char escapeChar, bool removeEmpty)
{
    if (input == null)
    {
        return new string[0];
    }

    char[] specialChars = new char[] { delimiter, escapeChar };
    var tokens = new List<string>();
    var token = new StringBuilder();

    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        if (c == escapeChar)
        {
            if (i >= input.Length - 1)
            {
                throw new ArgumentException("Uncompleted escape sequence has been encountered at the end of the input");
            }

            char nextChar = input[i + 1];
            if (nextChar != escapeChar && nextChar != delimiter)
            {
                throw new ArgumentException("Unknown escape sequence has been encountered: " + c + nextChar);
            }

            token.Append(nextChar);
            i++; // skip the escaped character; the loop increment skips the escape itself
        }
        else if (c == delimiter)
        {
            if (!removeEmpty || token.Length > 0)
            {
                tokens.Add(token.ToString());
                token.Length = 0;
            }
        }
        else
        {
            // Copy the whole run of ordinary characters in one call.
            int index = input.IndexOfAny(specialChars, i);
            if (index < 0)
            {
                // No delimiter or escape character remains: the rest of the input is
                // plain data, so append it at once and stop instead of rescanning the
                // tail for every remaining character.
                token.Append(input, i, input.Length - i);
                break;
            }

            token.Append(input, i, index - i);
            i = index - 1; // the loop increment lands on the special character
        }
    }

    if (!removeEmpty || token.Length > 0)
    {
        tokens.Add(token.ToString());
    }

    return tokens.ToArray();
}
{
如果(输入==null)
{
返回新字符串[0];
}
char[]specialChars=new char[]{分隔符,escapeChar};
var tokens=新列表();
var token=新的StringBuilder();
for(int i=0;i=input.Length-1)
{
抛出新ArgumentException(“在输入端遇到未完成的转义序列”);
}
var nextChar=输入[i+1];
if(nextChar!=escapeChar&&nextChar!=分隔符)
{
抛出新ArgumentException(“遇到未知转义序列:“+c+nextChar”);
}
token.Append(nextChar);
i++;
}
else if(c.Equals(分隔符))
{
如果(!removeMpty | | token.Length>0)
{
添加(token.ToString());
token.Length=0;
}
}
其他的
{
var指数=输入指数(特别指数,i);
如果(指数<0)
{
附加标记(c);
}
其他的
{
Append(input.Substring(i,index-i));
i=指数-1;
}
}
}
如果(!removeMpty | | token.Length>0)
{
添加(token.ToString());
}
返回令牌。ToArray();
}
以下是一种更为惯用且可读的方法:
/// <summary>
/// Lazily splits <paramref name="encodedString"/> on <paramref name="separator"/> and
/// removes escaping: the character that follows <paramref name="escape"/> is copied into
/// the current token as-is, and the escape character itself is not copied.
/// </summary>
/// <param name="encodedString">The escaped, separator-delimited text to enumerate.</param>
/// <param name="separator">The character that ends a token.</param>
/// <param name="escape">The character that makes the next character literal.</param>
/// <returns>
/// One string per token, including empty ones; the final token is always yielded, so an
/// empty input produces a single empty string. An escape character that is the last
/// character of the input is dropped.
/// </returns>
public IEnumerable<string> SplitAndUnescape(
    string encodedString,
    char separator,
    char escape)
{
    var buffer = new StringBuilder();
    var escapePending = false;

    foreach (var ch in encodedString)
    {
        if (escapePending)
        {
            // The previous character was the escape: take this one literally.
            buffer.Append(ch);
            escapePending = false;
            continue;
        }

        // The escape test comes before the separator test.
        if (ch == escape)
        {
            escapePending = true;
            continue;
        }

        if (ch != separator)
        {
            buffer.Append(ch);
            continue;
        }

        yield return buffer.ToString();
        buffer.Clear();
    }

    yield return buffer.ToString();
}
public IEnumerable SplitAndUnescape(
字符串编码字符串,
煤焦分离器,
字符逃逸)
{
var inEscapeSequence=false;
var currentToken=新的StringBuilder();
foreach(编码器字符串中的var currentCharacter)
if(inEscapeSequence)
{
currentToken.Append(currentCharacter);
inEscapeSequence=false;
}
其他的
if(currentCharacter==转义)
inEscapeSequence=true;
其他的
if(currentCharacter==分隔符)
{
产生返回currentToken.ToString();
currentToken.Clear();
}
其他的
currentToken.Append(currentCharacter);
产生返回currentToken.ToString();
}
请注意,这并不重要
/// <summary>
/// Splits <paramref name="text"/> on <paramref name="delim"/> and appends every non-empty
/// piece to <paramref name="listToBuild"/>. The character that follows
/// <paramref name="esc"/> is copied into the current piece literally; the escape
/// character itself is not copied.
/// </summary>
/// <param name="text">The escaped, delimited text to split.</param>
/// <param name="delim">The character that separates pieces.</param>
/// <param name="esc">The character that makes the next character literal.</param>
/// <param name="listToBuild">The list that receives the pieces; existing items are kept.</param>
/// <remarks>
/// Empty pieces (leading, trailing or consecutive delimiters) are skipped, and an escape
/// character that is the last character of the text is dropped.
/// </remarks>
public static void smartSplit(string text, char delim, char esc, ref List<string> listToBuild)
{
    var piece = new StringBuilder();
    var escapeActive = false;

    foreach (char ch in text)
    {
        if (escapeActive)
        {
            // The previous character was the escape: take this one literally.
            piece.Append(ch);
            escapeActive = false;
            continue;
        }

        // The delimiter test comes before the escape test.
        if (ch == delim)
        {
            if (piece.Length != 0)
            {
                listToBuild.Add(piece.ToString());
                piece.Length = 0;
            }

            continue;
        }

        if (ch == esc)
        {
            escapeActive = true;
            continue;
        }

        piece.Append(ch);
    }

    if (piece.Length != 0)
    {
        listToBuild.Add(piece.ToString());
    }
}
/// <summary>
/// Splits <paramref name="input"/> into tokens separated by <paramref name="delimiter"/>,
/// where <paramref name="escapeChar"/> may precede a delimiter or another escape character
/// to include that character literally in a token.
/// </summary>
/// <param name="input">The text to split. <c>null</c> yields an empty array.</param>
/// <param name="delimiter">The character that separates tokens.</param>
/// <param name="escapeChar">The character that escapes the delimiter or itself.</param>
/// <param name="removeEmpty">When <c>true</c>, empty tokens are left out of the result.</param>
/// <returns>The unescaped tokens, in order of appearance.</returns>
/// <exception cref="ArgumentException">
/// The input ends with an unpaired escape character, or an escape character is followed
/// by something other than the delimiter or the escape character.
/// </exception>
private static string[] Split(string input, char delimiter, char escapeChar, bool removeEmpty)
{
    if (input == null)
    {
        return new string[0];
    }

    char[] specialChars = new char[] { delimiter, escapeChar };
    var tokens = new List<string>();
    var token = new StringBuilder();

    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        if (c == escapeChar)
        {
            if (i >= input.Length - 1)
            {
                throw new ArgumentException("Uncompleted escape sequence has been encountered at the end of the input");
            }

            char nextChar = input[i + 1];
            if (nextChar != escapeChar && nextChar != delimiter)
            {
                throw new ArgumentException("Unknown escape sequence has been encountered: " + c + nextChar);
            }

            token.Append(nextChar);
            i++; // skip the escaped character; the loop increment skips the escape itself
        }
        else if (c == delimiter)
        {
            if (!removeEmpty || token.Length > 0)
            {
                tokens.Add(token.ToString());
                token.Length = 0;
            }
        }
        else
        {
            // Copy the whole run of ordinary characters in one call.
            int index = input.IndexOfAny(specialChars, i);
            if (index < 0)
            {
                // No delimiter or escape character remains: the rest of the input is
                // plain data, so append it at once and stop instead of rescanning the
                // tail for every remaining character.
                token.Append(input, i, input.Length - i);
                break;
            }

            token.Append(input, i, index - i);
            i = index - 1; // the loop increment lands on the special character
        }
    }

    if (!removeEmpty || token.Length > 0)
    {
        tokens.Add(token.ToString());
    }

    return tokens.ToArray();
}
/// <summary>
/// Lazily splits <paramref name="encodedString"/> on <paramref name="separator"/> and
/// removes escaping: the character that follows <paramref name="escape"/> is copied into
/// the current token as-is, and the escape character itself is not copied.
/// </summary>
/// <param name="encodedString">The escaped, separator-delimited text to enumerate.</param>
/// <param name="separator">The character that ends a token.</param>
/// <param name="escape">The character that makes the next character literal.</param>
/// <returns>
/// One string per token, including empty ones; the final token is always yielded, so an
/// empty input produces a single empty string. An escape character that is the last
/// character of the input is dropped.
/// </returns>
public IEnumerable<string> SplitAndUnescape(
    string encodedString,
    char separator,
    char escape)
{
    var buffer = new StringBuilder();
    var escapePending = false;

    foreach (var ch in encodedString)
    {
        if (escapePending)
        {
            // The previous character was the escape: take this one literally.
            buffer.Append(ch);
            escapePending = false;
            continue;
        }

        // The escape test comes before the separator test.
        if (ch == escape)
        {
            escapePending = true;
            continue;
        }

        if (ch != separator)
        {
            buffer.Append(ch);
            continue;
        }

        yield return buffer.ToString();
        buffer.Clear();
    }

    yield return buffer.ToString();
}