C# 将波特词干提取算法的输出保存到文本文件
我有c#中的波特算法代码,有人能告诉我如何将该代码的输出保存到txt文件中吗?我还需要输入文件名或其内容吗 使用制度; 使用System.IO 名称空间搬运工{ /* CSharp中的Porter词干分析器,基于Java端口。原始论文在 波特,1980,后缀剥离算法,程序,第14卷, 第3号,第130-137页, 另见http://www.tartarus.org/~martin/PorterStemmer 历史: 第1版 Bug 1(Gonzalo Parra于1999年10月16日报告)修复如下标记。 单词‘aed’、‘eed’、‘oed’将第3步的k保留在‘a’处,而b[k-1] 然后在b的边界之外。 第2版 同样地, Bug 2(Steve Dyrdahl 22/2/00报告)修复如下标记。 “ion”本身会在步骤5中的“ion”测试中留下j=-1,并且 然后b[j]在b的边界之外。 第3版 根据许多有益的建议,对4/9/00进行了大幅修订 来自Quiotix公司的Brian Goetz(brian@quiotix.com). 发布4 */ /** *词干分析器,实现波特词干算法 * *词干分析器类将单词转换为词根形式 *word可以一次提供一个字符(通过调用add()),也可以一次提供一个字符 *通过调用各种stem(something)方法之一。 */ 类词干分析器{ 私有字符[]b; 私有整数i,/*偏移到b*/ i_end,/*偏移到带词干单词的末尾*/ j、 k; 私有静态int INC=50; /*b增大的尺寸单位*/ 公共词干分析器(){ b=新字符[INC]; i=0; i_end=0; } /** *在词干中添加字符。完成后 *添加字符后,可以调用stem(void)来对单词进行词干处理。 */ 公共无效添加(字符ch){ 如果(i==b.长度){ char[]new_b=新字符[i+INC]; 对于(int c=0;c=b.Length){ char[]new_b=新字符[i+wLen+INC]; 对于(int c=0;cj)返回n; 如果(!cons(i))中断;i++; } i++; while(true){ while(true){ 如果(i>j)返回n; 如果(cons(i))中断; i++; } i++; n++; while(true){ 如果(i>j)返回n; 如果(!cons(i))中断; i++; } i++; } } /*元音项()为真0,…j包含一个元音*/ 私有布尔元音项(){ int i; 对于(i=0;i=0) setto(s); } /*step1()去掉复数和-ed或-ing。 爱抚->爱抚 小马->小马 领带->钛 爱抚->爱抚 猫->猫 饲料->饲料 同意->同意 禁用->禁用 垫子->垫子 交配->交配 会议->会议 铣削->铣削 混乱->混乱 会议->会面 */ 私有无效步骤1(){ 如果(b[k]=“s”){ 如果(结束(“sses”)) k-=2; 否则,如果(结束) setto(“i”); else如果(b[k-1]!='s') k--; } 如果(结束(“eed”)){ 如果(m()>0) k--; }else if((结束(“ed”)| |结束(“ing”)&&vowerinstem()){ k=j; 如果(结束时) setto(“ate”); 否则,如果(结束(“bl”)) 设置为(“可编程”); 否则,如果(结束(“iz”)) 设置为(“ize”); else if(c(k)){ k--; int ch=b[k]; 如果(ch='l'| | ch='s'| | ch='z') k++; } 如果(m()==1&&cvc(k))设置为(“e”); } } /*当词干中有另一个元音时,step2()将结束语y变为i*/ 普里瓦C# 将波特词干提取算法的输出保存到文本文件,c#,algorithm,stemming,porter-stemmer,C#,Algorithm,Stemming,Porter Stemmer,我有c#中的波特算法代码,有人能告诉我如何将该代码的输出保存到txt文件中吗?我还需要输入文件名或其内容吗 使用制度; 使用System.IO 名称空间搬运工{ /* CSharp中的Porter词干分析器,基于Java端口。原始论文在 波特,1980,后缀剥离算法,程序,第14卷, 第3号,第130-137页, 另见http://www.tartarus.org/~martin/PorterStemmer 历史: 第1版 Bug 1(Gonzalo Parra于1999年10月16日报告)修复
Porter stemmer in CSharp, based on the Java port. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
See also http://www.tartarus.org/~martin/PorterStemmer
History:
Release 1
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then outside the bounds of b.
Release 2
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.
Release 3
Considerably revised 4/9/00 in the light of many helpful suggestions
from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
Release 4
*/
/**
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
* word can be provided a character at a time (by calling add(char)), or all
* at once (by calling add(char[], int)); stem() then performs the stemming.
*/
class Stemmer {
/* Buffer holding the characters of the word being stemmed. */
private char[] b;
private int i, /* offset into b: the next free slot, i.e. the number of characters added so far */
i_end, /* offset to end of stemmed word; stem() sets it to k+1 */
j, k; /* k: index of the last character of the current word (stem() starts it at i-1);
         j: general-purpose offset; ends() sets it to the index just before the matched suffix */
private static int INC = 50;
/* unit of size whereby b is increased */
/**
 * Creates an empty stemmer whose buffer has room for INC characters.
 */
public Stemmer() {
    /* Nothing has been added or stemmed yet. */
    i_end = 0;
    i = 0;
    /* Start with one growth unit of capacity. */
    b = new char[INC];
}
/**
 * Appends one character to the word being stemmed. When the buffer is
 * full it is replaced by a copy that is INC slots larger. Once every
 * character has been added, call stem() to stem the word.
 */
public void add(char ch) {
    if (i == b.Length) {
        /* b is completely full here, so copying all of b copies exactly i characters. */
        char[] grown = new char[i + INC];
        b.CopyTo(grown, 0);
        b = grown;
    }
    b[i] = ch;
    i++;
}
/** Adds wLen characters to the word being stemmed contained in a portion
 * of a char[] array. This is like repeated calls of add(char ch), but
 * faster.
 *
 * The arguments are checked before the buffer is touched, so a bad call
 * can no longer leave a partially appended word behind:
 *   - wLen <= 0 appends nothing (w is not inspected in that case);
 *   - w == null throws ArgumentNullException;
 *   - wLen > w.Length throws ArgumentOutOfRangeException.
 */
public void add(char[] w, int wLen) {
    if (wLen <= 0)
        return;
    if (w == null)
        throw new ArgumentNullException("w");
    if (wLen > w.Length)
        throw new ArgumentOutOfRangeException("wLen", "wLen must not exceed the length of w.");
    if (i + wLen >= b.Length) {
        /* Grow far enough for the new characters plus one spare growth unit. */
        char[] grown = new char[i + wLen + INC];
        Array.Copy(b, grown, i);
        b = grown;
    }
    Array.Copy(w, 0, b, i, wLen);
    i += wLen;
}
/**
 * Returns the stemmed word as a string: the first i_end characters of the
 * internal buffer. The same data is available without allocating a string
 * through getResultBuffer() and getResultLength().
 */
public override string ToString() {
    string stemmed = new string(b, 0, i_end);
    return stemmed;
}
/**
 * Returns the number of characters in the stemmed word, i.e. the value
 * stem() stored in i_end.
 */
public int getResultLength() {
    int length = this.i_end;
    return length;
}
/**
 * Returns the internal character buffer itself (not a copy). Only the
 * first getResultLength() characters belong to the stemmed word.
 */
public char[] getResultBuffer() {
    char[] buffer = this.b;
    return buffer;
}
/* cons(i) is true <=> b[i] is a consonant. The letters a, e, i, o, u are
   never consonants; y is a consonant at position 0 or when the character
   before it is not a consonant; every other character is a consonant. */
private bool cons(int i) {
    char ch = b[i];
    if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u')
        return false;
    if (ch != 'y')
        return true;
    if (i == 0)
        return true;
    return !cons(i - 1);
}
/* m() counts the vowel-consonant sequences in b[0..j]. With c standing for
   a consonant sequence, v for a vowel sequence, and <..> for an optional
   part:
      <c><v>          gives 0
      <c>vc<v>        gives 1
      <c>vcvc<v>      gives 2
      <c>vcvcvc<v>    gives 3
      ....
*/
private int m() {
    int count = 0;
    int pos = 0;
    /* Skip the optional leading consonant sequence. */
    while (pos <= j && cons(pos))
        pos++;
    if (pos > j)
        return count;
    pos++;
    for (;;) {
        /* Run through the rest of the vowel sequence. */
        while (pos <= j && !cons(pos))
            pos++;
        if (pos > j)
            return count;
        pos++;
        /* A consonant followed the vowels: one more "vc" pair. */
        count++;
        /* Run through the rest of the consonant sequence. */
        while (pos <= j && cons(pos))
            pos++;
        if (pos > j)
            return count;
        pos++;
    }
}
/* vowelinstem() is true <=> at least one of b[0..j] is not a consonant. */
private bool vowelinstem() {
    int pos = 0;
    while (pos <= j) {
        if (!cons(pos))
            return true;
        pos++;
    }
    return false;
}
/* doublec(j) is true <=> b[j-1] and b[j] are the same character and that
   character is a consonant. Always false for j < 1. */
private bool doublec(int j) {
    return j >= 1 && b[j] == b[j - 1] && cons(j);
}
/* cvc(i) is true <=> b[i-2], b[i-1], b[i] have the form
   consonant - vowel - consonant and the final consonant is not w, x or y.
   It is used when deciding whether to restore an e at the end of a short
   word, e.g.
      cav(e), lov(e), hop(e), crim(e), but
      snow, box, tray.
*/
private bool cvc(int i) {
    bool shaped = i >= 2 && cons(i) && !cons(i - 1) && cons(i - 2);
    if (!shaped)
        return false;
    switch (b[i]) {
        case 'w':
        case 'x':
        case 'y':
            return false;
        default:
            return true;
    }
}
/* ends(s) is true <=> b[0..k] ends with the string s. On a match j is set
   to the index of the last character before the suffix (k - s.Length);
   when there is no match j is left unchanged. */
private bool ends(String s) {
    int len = s.Length;
    int start = k - len + 1;
    /* The suffix is longer than the word. */
    if (start < 0)
        return false;
    for (int n = 0; n < len; n++) {
        if (b[start + n] != s[n])
            return false;
    }
    j = k - len;
    return true;
}
/* setto(s) overwrites the buffer from position j+1 onwards with the
   characters of s and moves k to the last character written (j + s.Length). */
private void setto(String s) {
    int dest = j + 1;
    foreach (char ch in s) {
        b[dest] = ch;
        dest++;
    }
    /* dest is one past the last character written. */
    k = dest - 1;
}
/* r(s) replaces the current suffix with s via setto(s), but only when the
   stem b[0..j] has m() > 0. Used by step3() and step4(). */
private void r(String s) {
    if (m() <= 0)
        return;
    setto(s);
}
/* step1() gets rid of plurals and -ed or -ing. e.g.
      caresses  ->  caress
      ponies    ->  poni
      ties      ->  ti
      caress    ->  caress
      cats      ->  cat
      feed      ->  feed
      agreed    ->  agree
      disabled  ->  disable
      matting   ->  mat
      mating    ->  mate
      meeting   ->  meet
      milling   ->  mill
      messing   ->  mess
      meetings  ->  meet
*/
private void step1() {
    /* Plural endings. */
    if (b[k] == 's') {
        if (ends("sses")) {
            k -= 2;
        } else if (ends("ies")) {
            setto("i");
        } else if (b[k - 1] != 's') {
            k--;
        }
    }

    /* -eed loses its final d only when the stem before it has m() > 0. */
    if (ends("eed")) {
        if (m() > 0)
            k--;
        return;
    }

    /* -ed / -ing are removed only when the remaining stem has a vowel. */
    if (!(ends("ed") || ends("ing")))
        return;
    if (!vowelinstem())
        return;
    k = j;

    /* Tidy up the stem that is left. */
    if (ends("at")) {
        setto("ate");
        return;
    }
    if (ends("bl")) {
        setto("ble");
        return;
    }
    if (ends("iz")) {
        setto("ize");
        return;
    }
    if (doublec(k)) {
        /* Drop one letter of the double consonant unless it is l, s or z. */
        char doubled = b[k - 1];
        if (doubled != 'l' && doubled != 's' && doubled != 'z')
            k--;
        return;
    }
    if (m() == 1 && cvc(k))
        setto("e");
}
/* step2() replaces a final y with i when the part of the word before the
   y contains a vowel. */
private void step2() {
    if (!ends("y"))
        return;
    if (vowelinstem())
        b[k] = 'i';
}
/* step3() maps double suffixes to single ones, so -ization ( = -ize plus
   -ation) maps to -ize etc. The candidate rules are selected by the
   penultimate character b[k-1]; the first suffix that matches is handed to
   r(), which only rewrites it when the stem before the suffix has m() > 0. */
private void step3() {
    if (k == 0)
        return;
    /* For Bug 1 */

    /* Flat rule list: suffix, replacement, suffix, replacement, ... */
    string[] rules;
    switch (b[k - 1]) {
        case 'a':
            rules = new string[] { "ational", "ate", "tional", "tion" };
            break;
        case 'c':
            rules = new string[] { "enci", "ence", "anci", "ance" };
            break;
        case 'e':
            rules = new string[] { "izer", "ize" };
            break;
        case 'l':
            rules = new string[] { "bli", "ble", "alli", "al", "entli", "ent", "eli", "e", "ousli", "ous" };
            break;
        case 'o':
            rules = new string[] { "ization", "ize", "ation", "ate", "ator", "ate" };
            break;
        case 's':
            rules = new string[] { "alism", "al", "iveness", "ive", "fulness", "ful", "ousness", "ous" };
            break;
        case 't':
            rules = new string[] { "aliti", "al", "iviti", "ive", "biliti", "ble" };
            break;
        case 'g':
            rules = new string[] { "logi", "log" };
            break;
        default:
            return;
    }

    /* Rules are tried in order; only the first matching suffix is processed. */
    for (int n = 0; n < rules.Length; n += 2) {
        if (ends(rules[n])) {
            r(rules[n + 1]);
            return;
        }
    }
}
/* step4() deals with -ic-, -full, -ness etc. The candidate suffixes are
   selected by the last character b[k]; as in step3(), the first matching
   suffix is passed to r(), which applies the change only when m() > 0. */
private void step4() {
    char last = b[k];
    if (last == 'e') {
        if (ends("icate"))
            r("ic");
        else if (ends("ative"))
            r("");
        else if (ends("alize"))
            r("al");
    } else if (last == 'i') {
        if (ends("iciti"))
            r("ic");
    } else if (last == 'l') {
        if (ends("ical"))
            r("ic");
        else if (ends("ful"))
            r("");
    } else if (last == 's') {
        if (ends("ness"))
            r("");
    }
}
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>: a suffix
   selected by the penultimate character b[k-1] is removed only when the
   stem before it has m() > 1. */
private void step5() {
    if (k == 0)
        return;
    /* for Bug 1 */

    bool matched;
    switch (b[k - 1]) {
        case 'a':
            matched = ends("al");
            break;
        case 'c':
            matched = ends("ance") || ends("ence");
            break;
        case 'e':
            matched = ends("er");
            break;
        case 'i':
            matched = ends("ic");
            break;
        case 'l':
            matched = ends("able") || ends("ible");
            break;
        case 'n':
            /* "ement" is tried before "ment" and "ent": element etc. are
               not stripped before the m */
            matched = ends("ant") || ends("ement") || ends("ment") || ends("ent");
            break;
        case 'o':
            /* -ion only counts after s or t; j >= 0 fixes Bug 2.
               -ou takes care of -ous */
            matched = (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
                      || ends("ou");
            break;
        case 's':
            matched = ends("ism");
            break;
        case 't':
            matched = ends("ate") || ends("iti");
            break;
        case 'u':
            matched = ends("ous");
            break;
        case 'v':
            matched = ends("ive");
            break;
        case 'z':
            matched = ends("ize");
            break;
        default:
            matched = false;
            break;
    }

    if (matched && m() > 1)
        k = j;
}
/* step6() removes a final -e when m() > 1, or when m() == 1 and the
   letters before the e do not form a cvc() pattern. Afterwards a final
   double l is reduced to a single l when m() > 1. */
private void step6() {
    j = k;
    if (b[k] == 'e') {
        int measure = m();
        bool dropE = measure > 1 || (measure == 1 && !cvc(k - 1));
        if (dropE)
            k--;
    }
    if (b[k] == 'l' && doublec(k) && m() > 1)
        k--;
}
/** Stems the word placed into the Stemmer buffer through calls to add().
 * Words of fewer than three characters are left unchanged. The method
 * returns nothing; retrieve the result with ToString() or with
 * getResultLength()/getResultBuffer(). The input offset is reset to 0
 * afterwards, so the next add() starts a new word.
 */
public void stem() {
    k = i - 1;
    bool longEnough = k > 1;
    if (longEnough) {
        step1();
        step2();
        step3();
        step4();
        step5();
        step6();
    }
    /* Publish the result length and get ready for the next word. */
    i_end = k + 1;
    i = 0;
}
/** Test program for demonstrating the Stemmer. It reads text from a
 * list of files, stems each word, and writes the result to standard
 * output. Every maximal run of letters is lower-cased and stemmed; all
 * other bytes are echoed unchanged. Bytes are treated as characters
 * one-to-one, as before.
 * Usage: Stemmer file-name file-name ...
 *
 * Processing stops at the first file that cannot be found or read, after
 * printing a message for it.
 */
public static void Main( String[] args ) {
    if ( args.Length == 0 ) {
        Console.WriteLine( "Usage: Stemmer <input file>" );
        return;
    }
    Stemmer s = new Stemmer();
    for (int i = 0; i < args.Length; i++) {
        try {
            /* using guarantees the file handle is released, also on the
               error paths that break out of the loop. */
            using (FileStream _in = new FileStream( args[i], FileMode.Open, FileAccess.Read )) {
                try {
                    int ch = _in.ReadByte();
                    /* ReadByte() returns -1 at end of file; test for that
                       before the value is ever cast to char. */
                    while (ch >= 0) {
                        if (Char.IsLetter((char) ch)) {
                            /* Feed the whole word straight into the stemmer,
                               whose buffer grows as needed, so long words are
                               no longer cut off at 500 letters. Invariant
                               lower-casing keeps the result independent of
                               the current culture. */
                            do {
                                s.add(Char.ToLowerInvariant((char) ch));
                                ch = _in.ReadByte();
                            } while (ch >= 0 && Char.IsLetter((char) ch));
                            s.stem();
                            Console.Write(s.ToString());
                        } else {
                            Console.Write((char) ch);
                            ch = _in.ReadByte();
                        }
                    }
                } catch (IOException ) {
                    Console.WriteLine("error reading " + args[i]);
                    break;
                }
            }
        } catch (FileNotFoundException ) {
            Console.WriteLine("file " + args[i] + " not found");
            break;
        }
    }
}
}
/**
 * Stems every entry of args and writes the results, one per line and in
 * input order, to D:\patttt.txt.
 *
 * Within each entry every maximal run of letters is lower-cased and
 * stemmed; all other characters are copied through unchanged. A null
 * entry produces an empty line. Nothing is written when args is empty.
 *
 * Throws ArgumentNullException when args itself is null, and IOException
 * when the output file already exists (FileMode.CreateNew never
 * overwrites) or cannot be written.
 */
public static void DoStemmer(String[] args)
{
    if (args == null)
        throw new ArgumentNullException("args");
    if (args.Length == 0)
    {
        return;
    }

    String[] Final = new string[args.Length];
    Stemmer s = new Stemmer();
    for (int i = 0; i < args.Length; i++)
    {
        Final[i] = StemText(s, args[i]);
    }

    /* Both streams are disposed when the block ends. Disposing the writer
       flushes its buffer; without that the file is left empty or cut short. */
    using (FileStream sw = new FileStream(@"D:\patttt.txt", FileMode.CreateNew, FileAccess.Write, FileShare.Read))
    using (StreamWriter fs = new StreamWriter(sw))
    {
        for (int jj = 0; jj < Final.Length; jj++)
            fs.WriteLine(Final[jj]);
    }
}

/* Returns text with every run of letters replaced by its lower-cased stem
   and all other characters kept as they are. s is reused between calls;
   stem() resets it after each word. */
private static String StemText(Stemmer s, String text)
{
    if (String.IsNullOrEmpty(text))
        return String.Empty;

    System.Text.StringBuilder result = new System.Text.StringBuilder(text.Length);
    bool inWord = false;
    foreach (char raw in text)
    {
        if (Char.IsLetter(raw))
        {
            /* Invariant lower-casing: the stemmer expects plain lower-case
               letters regardless of the current culture. */
            s.add(Char.ToLowerInvariant(raw));
            inWord = true;
            continue;
        }
        if (inWord)
        {
            /* A non-letter ends the current word: emit its stem first. */
            s.stem();
            result.Append(s.ToString());
            inWord = false;
        }
        result.Append(raw);
    }
    if (inWord)
    {
        /* The text ended in the middle of a word. */
        s.stem();
        result.Append(s.ToString());
    }
    return result.ToString();
}