Java 在WEKA中运行StringToWordVector过滤器_Java_Filter_Machine Learning_Weka_Arff

Java 在WEKA中运行StringToWordVector过滤器

java filter machine-learning

Java 在WEKA中运行StringToWordVector过滤器,java,filter,machine-learning,weka,arff,Java,Filter,Machine Learning,Weka,Arff,我正在用Java通过WEKA的API开发一个应用程序。对我的input.arff文件运行StringToWordVector过滤器（用于转换字符串类型的属性）时，输入文件如下所示： @relation Instantzien_Bektorea @attribute 5_Ainf_Lema string @attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}

我正在用 Java 通过 WEKA 的 API 开发一个应用程序。对我的 input.arff 文件运行 StringToWordVector 过滤器（用于转换字符串类型的属性）时，输入文件如下所示：

    @relation Instantzien_Bektorea

    @attribute 5_Ainf_Lema string
    @attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
    @attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
    @attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
    @attribute 9_Argumentuaren_FSint {-,subj,obj}
    @attribute 10_Arg_Posizioa {Aurretik,Atzetik}
    @attribute 11_Dist_HKop numeric
    @attribute 12_Dist_ArgKop numeric
    @attribute 13_Framea string
    @attribute 15_Frame_Unekoa string
    @attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}

    @data
    eta_gero,LOT,ARR,denb,-,Aurretik,999,1,argM_PRED_arg1,ARGM_PRED_arg1,argM*TMP
    Ainf_Lema,ADI,SIN,mod,-,Aurretik,1,1,argM_arg0_arg1_PRED,argM_arg0_ARG1_PRED,arg1
    Ainf_Lema,IZE,ARR,abs,subj,Aurretik,999,2,arg0_argM_arg1_PRED,ARG0_argM_arg1_PRED,arg0
...

@relation 'Train_Instantzien_Bektorea-weka.filters.unsupervised.attribute.StringToWordVector-R1,9,10-W1000-prune-rate-1.0-N0-stemmerweka.core.stemmers.NullStemmer-M1-tokenizerweka.core.tokenizers.WordTokenizer -delimiters \" \\r\\n\\t.,;:\\\'\\\"()?!\"'

@attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
@attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
@attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
@attribute 9_Argumentuaren_FSint {-,subj,obj}
@attribute 10_Arg_Posizioa {Aurretik,Atzetik}
@attribute 11_Dist_HKop numeric
@attribute 12_Dist_ArgKop numeric
@attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}
@attribute ARG0_PRED_arg1 numeric
@attribute ARG0_arg1_PRED numeric
@attribute ARG0_arg1_PRED_arg1_argM numeric
@attribute ARG0_arg1_PRED_argM numeric
@attribute ARG0_arg1_PRED_argM_argM numeric
@attribute ARG0_argM_PRED numeric

...

@attribute argM_PRED_ARG1_argM_argM numeric
@attribute argM_PRED numeric
@attribute argM_PRED_arg1_ARGM numeric

...

@attribute ARGM_argM_PRED_arg1_argM numeric
@attribute arg0_ARGM_arg1_PRED numeric
@attribute arg0_ARGM_arg1_PRED_argM numeric
@attribute arg0_arg1_PRED_ARGM_argM numeric
@attribute eta_gero numeric
@attribute gaur numeric


@data
{0 LOT,2 denb,5 999,6 1,7 argM*TMP,90 1,162 1,197 1}
{0 ADI,1 SIN,2 mod,5 1,6 1,7 arg1,19 1,42 1,93 1}
{2 abs,3 subj,5 999,6 2,16 1,19 1,29 1}

      // StringToWordVector filter: converts the string attributes at
      // (1-based) positions 1, 9 and 10 into word-occurrence attributes.
      // Each command-line token must be its own array element; the angle
      // brackets in Weka's usage text are placeholders, not literal syntax.
      String[] options = new String[] { "-R", "1,9,10" };
      StringToWordVector filter = new StringToWordVector();
      filter.setOptions(options);
      // setInputFormat expects the Instances object, not a file name.
      filter.setInputFormat(input_inst);
      Instances output_inst = Filter.useFilter(input_inst, filter);

我得到了另一组实例，并将其写入 output.arff，如下所示：

    @relation Instantzien_Bektorea

    @attribute 5_Ainf_Lema string
    @attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
    @attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
    @attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
    @attribute 9_Argumentuaren_FSint {-,subj,obj}
    @attribute 10_Arg_Posizioa {Aurretik,Atzetik}
    @attribute 11_Dist_HKop numeric
    @attribute 12_Dist_ArgKop numeric
    @attribute 13_Framea string
    @attribute 15_Frame_Unekoa string
    @attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}

    @data
    eta_gero,LOT,ARR,denb,-,Aurretik,999,1,argM_PRED_arg1,ARGM_PRED_arg1,argM*TMP
    Ainf_Lema,ADI,SIN,mod,-,Aurretik,1,1,argM_arg0_arg1_PRED,argM_arg0_ARG1_PRED,arg1
    Ainf_Lema,IZE,ARR,abs,subj,Aurretik,999,2,arg0_argM_arg1_PRED,ARG0_argM_arg1_PRED,arg0
...

@relation 'Train_Instantzien_Bektorea-weka.filters.unsupervised.attribute.StringToWordVector-R1,9,10-W1000-prune-rate-1.0-N0-stemmerweka.core.stemmers.NullStemmer-M1-tokenizerweka.core.tokenizers.WordTokenizer -delimiters \" \\r\\n\\t.,;:\\\'\\\"()?!\"'

@attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
@attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
@attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
@attribute 9_Argumentuaren_FSint {-,subj,obj}
@attribute 10_Arg_Posizioa {Aurretik,Atzetik}
@attribute 11_Dist_HKop numeric
@attribute 12_Dist_ArgKop numeric
@attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}
@attribute ARG0_PRED_arg1 numeric
@attribute ARG0_arg1_PRED numeric
@attribute ARG0_arg1_PRED_arg1_argM numeric
@attribute ARG0_arg1_PRED_argM numeric
@attribute ARG0_arg1_PRED_argM_argM numeric
@attribute ARG0_argM_PRED numeric

...

@attribute argM_PRED_ARG1_argM_argM numeric
@attribute argM_PRED numeric
@attribute argM_PRED_arg1_ARGM numeric

...

@attribute ARGM_argM_PRED_arg1_argM numeric
@attribute arg0_ARGM_arg1_PRED numeric
@attribute arg0_ARGM_arg1_PRED_argM numeric
@attribute arg0_arg1_PRED_ARGM_argM numeric
@attribute eta_gero numeric
@attribute gaur numeric


@data
{0 LOT,2 denb,5 999,6 1,7 argM*TMP,90 1,162 1,197 1}
{0 ADI,1 SIN,2 mod,5 1,6 1,7 arg1,19 1,42 1,93 1}
{2 abs,3 subj,5 999,6 2,16 1,19 1,29 1}

      // StringToWordVector filter: converts the string attributes at
      // (1-based) positions 1, 9 and 10 into word-occurrence attributes.
      // Each command-line token must be its own array element; the angle
      // brackets in Weka's usage text are placeholders, not literal syntax.
      String[] options = new String[] { "-R", "1,9,10" };
      StringToWordVector filter = new StringToWordVector();
      filter.setOptions(options);
      // setInputFormat expects the Instances object, not a file name.
      filter.setInputFormat(input_inst);
      Instances output_inst = Filter.useFilter(input_inst, filter);

正如您在 output.arff 文件中看到的，一些属性从实例中消失了（第一个实例 -> 没有第一个属性，没有第三个属性等），这是为什么？

运行过滤器的 Java 代码如下所示：

    @relation Instantzien_Bektorea

    @attribute 5_Ainf_Lema string
    @attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
    @attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
    @attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
    @attribute 9_Argumentuaren_FSint {-,subj,obj}
    @attribute 10_Arg_Posizioa {Aurretik,Atzetik}
    @attribute 11_Dist_HKop numeric
    @attribute 12_Dist_ArgKop numeric
    @attribute 13_Framea string
    @attribute 15_Frame_Unekoa string
    @attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}

    @data
    eta_gero,LOT,ARR,denb,-,Aurretik,999,1,argM_PRED_arg1,ARGM_PRED_arg1,argM*TMP
    Ainf_Lema,ADI,SIN,mod,-,Aurretik,1,1,argM_arg0_arg1_PRED,argM_arg0_ARG1_PRED,arg1
    Ainf_Lema,IZE,ARR,abs,subj,Aurretik,999,2,arg0_argM_arg1_PRED,ARG0_argM_arg1_PRED,arg0
...

@relation 'Train_Instantzien_Bektorea-weka.filters.unsupervised.attribute.StringToWordVector-R1,9,10-W1000-prune-rate-1.0-N0-stemmerweka.core.stemmers.NullStemmer-M1-tokenizerweka.core.tokenizers.WordTokenizer -delimiters \" \\r\\n\\t.,;:\\\'\\\"()?!\"'

@attribute 6_Arg_PosKat {IZE,ADJ,ADI,ADB,DET,IOR,LOT,PRT,ITJ,BST,ADL,ADT,SIG,SNB,LAB,POST}
@attribute 7_Arg_Pos_AzpiKat {ARR,IZB,LIB,ZKI,GAL,SIN,ADK,ADP,FAK,ERKARR,ERKIND,NOLARR,NOLGAL,DZH,BAN,ORD,DZG,ORO,PERARR,PERIND,IZGMGB,IZGGAL,BIH,ELK,JOK,JNT,HUTSA}
@attribute 8_Arg_Kasua {abl,abu,abz,ala,soz,dat,des,erg,gel,gen,ine,ins,mot,abs,par,pro,bnk,desk,aurk,bald,emen,erlt,espl,haut,helb,kaus,konpl,kont,denb,mod,mos,ondo,zhg,neg,gen_post_ine,gen_post,gen_post_abs,ala_des,soz_post_ala,zero_post_abl,-}
@attribute 9_Argumentuaren_FSint {-,subj,obj}
@attribute 10_Arg_Posizioa {Aurretik,Atzetik}
@attribute 11_Dist_HKop numeric
@attribute 12_Dist_ArgKop numeric
@attribute Klasea {arg0,arg1,arg2,argM*LOC,argM*TMP,argM*MNR,argM*Cause,argM*ADV,argM*PRP,argM*-,argM*NEG,argM*DIS}
@attribute ARG0_PRED_arg1 numeric
@attribute ARG0_arg1_PRED numeric
@attribute ARG0_arg1_PRED_arg1_argM numeric
@attribute ARG0_arg1_PRED_argM numeric
@attribute ARG0_arg1_PRED_argM_argM numeric
@attribute ARG0_argM_PRED numeric

...

@attribute argM_PRED_ARG1_argM_argM numeric
@attribute argM_PRED numeric
@attribute argM_PRED_arg1_ARGM numeric

...

@attribute ARGM_argM_PRED_arg1_argM numeric
@attribute arg0_ARGM_arg1_PRED numeric
@attribute arg0_ARGM_arg1_PRED_argM numeric
@attribute arg0_arg1_PRED_ARGM_argM numeric
@attribute eta_gero numeric
@attribute gaur numeric


@data
{0 LOT,2 denb,5 999,6 1,7 argM*TMP,90 1,162 1,197 1}
{0 ADI,1 SIN,2 mod,5 1,6 1,7 arg1,19 1,42 1,93 1}
{2 abs,3 subj,5 999,6 2,16 1,19 1,29 1}

      // StringToWordVector filter: converts the string attributes at
      // (1-based) positions 1, 9 and 10 into word-occurrence attributes.
      // Each command-line token must be its own array element; the angle
      // brackets in Weka's usage text are placeholders, not literal syntax.
      String[] options = new String[] { "-R", "1,9,10" };
      StringToWordVector filter = new StringToWordVector();
      filter.setOptions(options);
      // setInputFormat expects the Instances object, not a file name.
      filter.setInputFormat(input_inst);
      Instances output_inst = Filter.useFilter(input_inst, filter);

//StringToWordVector过滤器
字符串[]选项=新字符串[1]；
选项[0]=“-R”；
StringToOrdVector过滤器=新的StringToOrdVector（）；
filter.setOptions（选项）；
filter.setInputFormat（input.arff）；
实例output\u inst=Filter.useFilter（input\u inst，Filter）；

你知道问题出在哪里吗？非常感谢。

首先，您的输入文件是普通的 ARFF 格式，而输出文件是稀疏（sparse）ARFF 格式，因为其中每个实例都以 `{` 开头，以 `}` 结尾。（请参阅 WEKA 文档中有关稀疏 ARFF 文件的说明）

在这种稀疏格式中，值为 0 的属性会被省略；所有出现的属性都必须以“索引 值”的形式给出。以上面的示例（您的第一个实例）为例：

属性 0 => LOT
属性 1 被省略 => 值为 0
属性 2 => denb
属性 3 被省略 => 值为 0

如果查看属性 1 的定义，您会发现它不是数值型而是标称型（nominal），因此 0 是其取值列表中的索引，在本例中对应的值是：

ARR

因此，并没有属性丢失；它们只是因为输出采用稀疏格式而被省略了。

如果您想知道为什么会出现不同的属性：这是 StringToWordVector 过滤器的处理结果。