C# 如何将文本拆分为单词?
如何将文本拆分为单词?示例文本:'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.' 该行中的单词是:Oh、you、can't、help、that、said、the、Cat、we're、all、mad、here、I'm、mad、You're、mad。(标签:c#、.net)
首先,删除所有特殊字符:
// Strip every character that is not an ASCII letter, a digit, '%', a space, '.' or '_'.
// Apostrophes are not in the allowed set, so "can't" becomes "cant"; the extension
// method described below is the better choice when contractions must survive.
var fixedInput = new Regex("[^a-zA-Z0-9% ._]").Replace(input, string.Empty);
然后将其拆分:
// Split on spaces and drop the empty entries that consecutive spaces would otherwise
// produce (e.g. where a removed punctuation mark was surrounded by spaces).
var split = fixedInput.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
要获得一个更简单的C#解决方案来删除特殊字符(您可以轻松更改),请添加此扩展方法(我添加了对撇号的支持):
你会惊讶地发现这个扩展方法非常有效(肯定比正则表达式更有效),所以我建议你使用它;)
更新
我同意这是一种只使用英语的方法,但要使其与Unicode兼容,您需要做的就是替换:
(c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
替换为 char.IsLetter(c),它支持Unicode;.Net还为各种情况提供了
char.IsSymbol
和 char.IsLetterOrDigit
为了在@Adam Fridental的答案(该答案非常好)上添加一个变体,您可以尝试以下正则表达式:
var text = "'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.'";
// A match is either a single word character, or a run without whitespace that both
// starts and ends with a word character (so "can't" is captured as one match).
var matches = Regex.Matches(text, @"\w+[^\s]*\w+|\w");
for (int index = 0; index < matches.Count; index++) {
    // Each match value is one word of the text.
    string word = matches[index].Value;
}
我相信这是最短的正则表达式,将得到所有的话
\w+[^\s]*\w+|\w
您可以尝试使用正则表达式删除不被字母包围的撇号(即单引号),然后使用
Char
静态方法去除所有其他字符。通过首先调用正则表达式,可以保留缩略词中的撇号(例如 can't),但删除像 'Oh 中那样的单引号。
string myText = "'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.'";
// Remove quote characters that are NOT enclosed by word characters on both sides:
// quoting marks such as the ones in 'Oh and that,' are dropped, while the apostrophe
// inside a contraction such as can't is kept.
// The pattern must be a verbatim string: in a regular string literal "\b" is the
// backspace character rather than a regex construct, so such a pattern never matches.
Regex reg = new Regex(@"(?<!\w)[""']|[""'](?!\w)");
myText = reg.Replace(myText, "");
string[] listOfWords = RemoveCharacters(myText);
/// <summary>
/// Splits <paramref name="input"/> into words, keeping only letters and apostrophes.
/// </summary>
/// <remarks>
/// Every whitespace character (space, tab, line break, ...) acts as a word separator and
/// runs of separators never produce empty entries. All other characters are discarded.
/// </remarks>
/// <param name="input">Text to split; may be null or empty.</param>
/// <returns>The words in order of appearance; an empty array when there are none.</returns>
public string[] RemoveCharacters(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return new string[0];
    }

    StringBuilder sb = new StringBuilder(input.Length);
    foreach (char c in input)
    {
        if (Char.IsLetter(c) || c == '\'')
        {
            sb.Append(c);
        }
        else if (Char.IsWhiteSpace(c))
        {
            // Normalize every kind of whitespace to a plain space so that the
            // single-separator split below also breaks on tabs and line breaks.
            sb.Append(' ');
        }
    }
    return sb.ToString().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
}
如果不想使用正则表达式对象,可以执行以下操作
string mystring = "Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.";
// Drop the sentence punctuation, split on spaces (ignoring empty entries), then trim the
// quoting apostrophes left at word edges ("that'" and "'we're"). Apostrophes inside a
// word, as in "can't", are not touched by Trim.
// The char[] overload of Split is used because it exists on every .NET version.
List<string> words = mystring
    .Replace(",", "").Replace(":", "").Replace(".", "")
    .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
    .Select(w => w.Trim('\''))
    .ToList();
string mystring=“哦,你没办法,”猫说,“我们都疯了。我疯了。你疯了。”;
List words=mystring.Replace(“,”,”).Replace(“:”,”).Replace(“.”,”).Split(“”).ToList();
您仍然需要处理“that'”结尾的尾随撇号。
在空白处拆分文本,然后修剪标点符号:
var text = "'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.'";
// Every distinct punctuation character that occurs in the text; used as the trim set.
char[] punctuation = text.Where(c => Char.IsPunctuation(c)).Distinct().ToArray();
// Split() without arguments splits on whitespace; punctuation is then trimmed from both
// ends of each token, so apostrophes inside a word ("can't") are left alone.
var words = from token in text.Split()
            select token.Trim(punctuation);
与示例完全一致。这是解决方案之一,我不使用任何帮助器类或方法
/// <summary>
/// Extracts the words contained in <paramref name="inputString"/>.
/// </summary>
/// <remarks>
/// A word is a maximal run of letters (any Unicode letter, per <see cref="char.IsLetter(char)"/>).
/// An apostrophe counts as part of a word only when it sits directly between two letters,
/// so "can't" is one word while quoting apostrophes (as in 'Oh or that,') are dropped.
/// </remarks>
/// <param name="inputString">Text to scan. Must not be null.</param>
/// <returns>The words in order of appearance; an empty list when there are none.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputString"/> is null.</exception>
public static List<string> ExtractChars(string inputString) {
    if (inputString == null) {
        throw new ArgumentNullException("inputString");
    }

    var result = new List<string>();
    int wordStart = -1;

    // Run one position past the end so that the end of the text closes a pending word
    // exactly like any other non-word character does.
    for (int i = 0; i <= inputString.Length; i++) {
        bool partOfWord = false;
        if (i < inputString.Length) {
            char current = inputString[i];
            partOfWord = char.IsLetter(current)
                || (current == '\''
                    && i > 0
                    && i + 1 < inputString.Length
                    && char.IsLetter(inputString[i - 1])
                    && char.IsLetter(inputString[i + 1]));
        }

        if (partOfWord) {
            if (wordStart == -1) {
                wordStart = i;
            }
        } else if (wordStart != -1) {
            result.Add(inputString.Substring(wordStart, i - wordStart));
            wordStart = -1;
        }
    }
    return result;
}
/// <summary>
/// Returns the characters of <paramref name="inputString"/> from
/// <paramref name="startIndex"/> through <paramref name="endIndex"/>, both inclusive.
/// </summary>
/// <param name="inputString">Source text.</param>
/// <param name="startIndex">Zero-based index of the first character to copy.</param>
/// <param name="endIndex">Zero-based index of the last character to copy.</param>
/// <returns>
/// The selected text, or an empty string when <paramref name="endIndex"/> is smaller than
/// <paramref name="startIndex"/>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The range is not empty and does not lie inside <paramref name="inputString"/>.
/// </exception>
public static string GetString(string inputString, int startIndex, int endIndex) {
    // An inverted range selects nothing.
    if (endIndex < startIndex) {
        return "";
    }

    // Substring copies the range once; appending character by character to a string
    // allocates a new string on every iteration.
    return inputString.Substring(startIndex, endIndex - startIndex + 1);
}
公共静态列表提取字符(字符串输入字符串){
var result=新列表();
int startIndex=-1;
for(int i=0;i='a'&&character='a'&&character
如果您想用 for 循环检查每个字符,并保留输入字符串中的所有标点符号,可以使用我创建的这个类。方法 GetSplitSentence() 返回 SentenceSplitResult 列表。该列表中保存了所有单词以及所有标点符号和数字;保存的每个标点符号或数字都是列表中的一项。SentenceSplitResult.isAWord 用于判断该项是否为单词。[抱歉,我的英语不太好]
/// <summary>
/// One item of a split sentence: a piece of text plus a flag telling whether it is a word.
/// </summary>
/// <remarks>
/// The members are auto-properties instead of public fields; their names are unchanged,
/// so existing assignments and object initializers keep compiling.
/// </remarks>
public class SentenceSplitResult
{
    /// <summary>The text of this item.</summary>
    public string word { get; set; }

    /// <summary>True when <see cref="word"/> holds a word; false otherwise.</summary>
    public bool isAWord { get; set; }
}
/// <summary>
/// Splits a text into words and the individual non-word characters found between them.
/// </summary>
public class StringsHelper
{
    // Result list; the same instance is returned by every GetSplitSentence call.
    private readonly List<SentenceSplitResult> outputList = new List<SentenceSplitResult>();
    private readonly string input;

    /// <summary>
    /// Creates a helper for the given text.
    /// </summary>
    /// <param name="input">Text to split; null or empty text produces an empty result.</param>
    public StringsHelper(string input)
    {
        this.input = input;
    }

    /// <summary>
    /// Splits the text passed to the constructor.
    /// </summary>
    /// <remarks>
    /// Each run of characters accepted by <see cref="IsAValidLetter"/> becomes one word item;
    /// every other character becomes its own non-word item. Exceptions raised while
    /// splitting are logged and the items collected so far are returned.
    /// </remarks>
    /// <returns>
    /// The items in order of appearance. The list is owned by this instance and is rebuilt
    /// on every call.
    /// </returns>
    public List<SentenceSplitResult> GetSplitSentence()
    {
        // The list is a field; reset it so that calling this method again does not
        // append the same items a second time.
        outputList.Clear();
        StringBuilder sb = new StringBuilder();
        try
        {
            if (String.IsNullOrEmpty(input))
            {
                Logger.Log(new ArgumentNullException(), "GetSplitSentence - input is null or empy");
                return outputList;
            }

            foreach (var _char in input)
            {
                if (IsAValidLetter(_char))
                {
                    // Part of a word: keep collecting.
                    sb.Append(_char);
                }
                else
                {
                    // A separator ends the pending word (if any) and is stored as its own item.
                    SaveWord(sb.ToString());
                    sb.Clear();
                    SaveANotWord(_char);
                }
            }

            // Flush the word that runs up to the end of the text.
            SaveWord(sb.ToString());
        }
        catch (Exception ex)
        {
            Logger.Log(ex);
        }
        return outputList;
    }

    /// <summary>
    /// Tells whether a character belongs to a word.
    /// </summary>
    /// <param name="_char">Character to classify.</param>
    /// <returns>
    /// False for punctuation, any whitespace (space, tab, line break, ...) and numbers;
    /// true for everything else.
    /// </returns>
    private static bool IsAValidLetter(char _char)
    {
        return !(Char.IsPunctuation(_char) || Char.IsWhiteSpace(_char) || Char.IsNumber(_char));
    }

    /// <summary>
    /// Adds <paramref name="word"/> to the result as a word item; null or empty text is ignored.
    /// </summary>
    /// <param name="word">Collected word text.</param>
    private void SaveWord(string word)
    {
        if (String.IsNullOrEmpty(word) == false)
        {
            outputList.Add(new SentenceSplitResult()
            {
                isAWord = true,
                word = word
            });
        }
    }

    /// <summary>
    /// Adds a single character to the result as a non-word item.
    /// </summary>
    /// <param name="_char">The non-word character.</param>
    private void SaveANotWord(char _char)
    {
        outputList.Add(new SentenceSplitResult()
        {
            isAWord = false,
            word = _char.ToString()
        });
    }
}
公共类语句拆分结果
{
公共字符串;
公共事业;
}
公营架线机
{
私有只读列表outputList=新列表();
私有只读字符串输入;
公共StringsHelper(字符串输入)
{
这个输入=输入;
}
公共列表getSplitSequence()
{
StringBuilder sb=新的StringBuilder();
尝试
{
if(String.IsNullOrEmpty(输入)){
Log(新的ArgumentNullException(),“GetSplitSession-输入为null或empy”);
返回输出列表;
}
bool-isAletter=IsAValidLetter(输入[0]);
//我检查的每个字符是否是单词的一部分。
//如果是“是”,我可以存储字符以备以后使用
//如果是否>我保存单词(如果存在),然后保存标点符号
foreach(输入中的var\u char)
{
isAletter=IsAValidLetter(_char);
如果(isAletter==true)
{
某人附加(_char);
}
其他的
{
SaveWord(sb.ToString());
(某人清楚地);
SaveANotWord(_char);
}
}
SaveWord(sb.ToString());
}
捕获(例外情况除外)
{
Logger.Log(ex);
}
返回输出列表;
}
私有静态bool IsAValidLetter(char\u char)
{
if((Char.ispunchuation(_Char)=true)| |(_Char=='')| |(Char.IsNumber(_Char)==true))
{
返回false;
}
返回true;
}
专用void保存字(字符串字)
{
if(String.IsNullOrEmpty(word)==false)
{
Add(新语句splitresult())
{
是的,
单词
});
}
}
私有void SaveANotWord(char\u char)
{
Add(新语句splitresult())
{
isAWord=false,
word=\u char.ToString()
});
}
我不认为数字是单词的一部分,但我想这取决于OP。我猜这取决于他,他可以随心所欲地更改正则表达式。我看到的唯一问题是,你的解决方案会删除缩略词中的撇号,例如把“isn't”变成“isnt”。是的,我也看到了,在你写评论的时候,我已经改进了我的解决方案。看起来他只是想快速统计单词数:s.Split(' ').Length。很好。但正如我在回答中所说,用正则表达式解决这个问题有一点不足——我测量过所需的时间,我在回答中编写的扩展方法比正则表达式解析快约7倍。感谢你对它们进行了性能分析,我今天学到了一些新东西:)我给你投了赞成票。我还是要争论一下(这是我的天性):Regex可以降低代码复杂度,不过你的方法也很短,而且大多数人并不像我这样觉得Regex好用。哦,好吧,我同意Regex很棒。
var text = "'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.'";
// Every distinct punctuation character that occurs in the text; used as the trim set.
char[] punctuation = text.Where(c => Char.IsPunctuation(c)).Distinct().ToArray();
// Split() without arguments splits on whitespace; punctuation is then trimmed from both
// ends of each token, so apostrophes inside a word ("can't") are left alone.
var words = from token in text.Split()
            select token.Trim(punctuation);
/// <summary>
/// Extracts the words contained in <paramref name="inputString"/>.
/// </summary>
/// <remarks>
/// A word is a maximal run of letters (any Unicode letter, per <see cref="char.IsLetter(char)"/>).
/// An apostrophe counts as part of a word only when it sits directly between two letters,
/// so "can't" is one word while quoting apostrophes (as in 'Oh or that,') are dropped.
/// </remarks>
/// <param name="inputString">Text to scan. Must not be null.</param>
/// <returns>The words in order of appearance; an empty list when there are none.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputString"/> is null.</exception>
public static List<string> ExtractChars(string inputString) {
    if (inputString == null) {
        throw new ArgumentNullException("inputString");
    }

    var result = new List<string>();
    int wordStart = -1;

    // Run one position past the end so that the end of the text closes a pending word
    // exactly like any other non-word character does.
    for (int i = 0; i <= inputString.Length; i++) {
        bool partOfWord = false;
        if (i < inputString.Length) {
            char current = inputString[i];
            partOfWord = char.IsLetter(current)
                || (current == '\''
                    && i > 0
                    && i + 1 < inputString.Length
                    && char.IsLetter(inputString[i - 1])
                    && char.IsLetter(inputString[i + 1]));
        }

        if (partOfWord) {
            if (wordStart == -1) {
                wordStart = i;
            }
        } else if (wordStart != -1) {
            result.Add(inputString.Substring(wordStart, i - wordStart));
            wordStart = -1;
        }
    }
    return result;
}
/// <summary>
/// Returns the characters of <paramref name="inputString"/> from
/// <paramref name="startIndex"/> through <paramref name="endIndex"/>, both inclusive.
/// </summary>
/// <param name="inputString">Source text.</param>
/// <param name="startIndex">Zero-based index of the first character to copy.</param>
/// <param name="endIndex">Zero-based index of the last character to copy.</param>
/// <returns>
/// The selected text, or an empty string when <paramref name="endIndex"/> is smaller than
/// <paramref name="startIndex"/>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The range is not empty and does not lie inside <paramref name="inputString"/>.
/// </exception>
public static string GetString(string inputString, int startIndex, int endIndex) {
    // An inverted range selects nothing.
    if (endIndex < startIndex) {
        return "";
    }

    // Substring copies the range once; appending character by character to a string
    // allocates a new string on every iteration.
    return inputString.Substring(startIndex, endIndex - startIndex + 1);
}
/// <summary>
/// One item of a split sentence: a piece of text plus a flag telling whether it is a word.
/// </summary>
/// <remarks>
/// The members are auto-properties instead of public fields; their names are unchanged,
/// so existing assignments and object initializers keep compiling.
/// </remarks>
public class SentenceSplitResult
{
    /// <summary>The text of this item.</summary>
    public string word { get; set; }

    /// <summary>True when <see cref="word"/> holds a word; false otherwise.</summary>
    public bool isAWord { get; set; }
}
public class StringsHelper
{
private readonly List<SentenceSplitResult> outputList = new List<SentenceSplitResult>();
private readonly string input;
/// <summary>
/// Creates a helper and stores the text that will be split later.
/// </summary>
/// <param name="input">Text to split; it is stored as given, without validation.</param>
public StringsHelper(string input)
{
this.input = input;
}
/// <summary>
/// Splits the stored input text into word items and single-character non-word items.
/// </summary>
/// <remarks>
/// Each run of characters accepted by IsAValidLetter becomes one word item; every other
/// character becomes its own non-word item. Null or empty input is logged and yields an
/// empty list. Exceptions raised while splitting are logged and the items collected so
/// far are returned.
/// </remarks>
/// <returns>
/// The items in order of appearance. The list is owned by this instance and is rebuilt
/// on every call.
/// </returns>
public List<SentenceSplitResult> GetSplitSentence()
{
    // The list is a field; reset it so that calling this method again does not
    // append the same items a second time.
    outputList.Clear();
    StringBuilder sb = new StringBuilder();
    try
    {
        if (String.IsNullOrEmpty(input))
        {
            Logger.Log(new ArgumentNullException(), "GetSplitSentence - input is null or empy");
            return outputList;
        }

        foreach (var _char in input)
        {
            if (IsAValidLetter(_char))
            {
                // Part of a word: keep collecting.
                sb.Append(_char);
            }
            else
            {
                // A separator ends the pending word (if any) and is stored as its own item.
                SaveWord(sb.ToString());
                sb.Clear();
                SaveANotWord(_char);
            }
        }

        // Flush the word that runs up to the end of the text.
        SaveWord(sb.ToString());
    }
    catch (Exception ex)
    {
        Logger.Log(ex);
    }
    return outputList;
}
/// <summary>
/// Tells whether a character belongs to a word.
/// </summary>
/// <param name="_char">Character to classify.</param>
/// <returns>
/// False for punctuation, any whitespace (space, tab, line break, ...) and numbers;
/// true for everything else.
/// </returns>
private static bool IsAValidLetter(char _char)
{
    // Char.IsWhiteSpace rather than a comparison with ' ' so that tabs and line breaks
    // separate words as well.
    return !(Char.IsPunctuation(_char) || Char.IsWhiteSpace(_char) || Char.IsNumber(_char));
}
/// <summary>
/// Appends <paramref name="word"/> to the output list as a word item.
/// </summary>
/// <param name="word">Collected word text; null or empty text is ignored.</param>
private void SaveWord(string word)
{
    // Nothing was collected between two separators: there is no word to store.
    if (String.IsNullOrEmpty(word))
    {
        return;
    }

    var item = new SentenceSplitResult();
    item.isAWord = true;
    item.word = word;
    outputList.Add(item);
}
/// <summary>
/// Appends a single character to the output list as a non-word item.
/// </summary>
/// <param name="_char">The character to store.</param>
private void SaveANotWord(char _char)
{
    var item = new SentenceSplitResult();
    item.word = _char.ToString();
    item.isAWord = false;
    outputList.Add(item);
}