使用 RTextTools 库进行机器学习

使用 RTextTools 库进行机器学习,r,svn,machine-learning,R,Svn,Machine Learning,我有以下训练集: Text,y MRR 93345,1 MRR 93434,1 MRR 93554,1 MRR 938900,1 MRR 93970,1 MRR 937899,1 MRR 93868,1 MRR 938769,1 MRR 93930,1 MRR 92325,1 MRR 931932,1 MRR 933922,1 MRR 934390,1 MRR 93204,1

我有以下训练集:

    Text,y
    MRR 93345,1
    MRR 93434,1
    MRR 93554,1
    MRR 938900,1
    MRR 93970,1
    MRR 937899,1
    MRR 93868,1
    MRR 938769,1
    MRR 93930,1
    MRR 92325,1
    MRR 931932,1
    MRR 933922,1
    MRR 934390,1
    MRR 93204,1
    MRR 93023,1
    MRR 930982,1
    MRR 87678,-1
    MRR 87956,-1
    MRR 87890,-1
    MRR 878770,-1
    MRR 877886,-1
    MRR 87678367,-1
    MRR 8790,-1
    MRR 87345,-1
    MRR 87149,-1
    MRR 873790,-1
    MRR 873493,-1
    MRR 874303,-1
    MRR 874343,-1
    MRR 874304,-1
    MRR 879034,-1
    MRR 879430,-1
    MRR 87943,-1
    MRR 879434,-1
    MRR 871984,-1
    MRR 873949,-1
我的代码如下:

# Build the document-term matrix from the training texts.
dtMatrix <- create_matrix(
  data["Text"],
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Wrap the matrix and its labels in a container; every row is used for
# training. seq_len() is used instead of 1:n so that an empty matrix yields
# an empty index rather than c(1, 0).
container <- create_container(
  dtMatrix,
  data$y,
  trainSize = seq_len(nrow(dtMatrix)),
  virgin = FALSE
)

# Train an SVM model with a linear kernel.
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# New data to classify.
predictionData <- list("MRR 93111")

# Build the prediction matrix with the same preprocessing options, passing
# the training matrix as originalMatrix (presumably so that both matrices
# share the same term columns -- see the RTextTools documentation).
predMatrix <- create_matrix(
  predictionData,
  originalMatrix = dtMatrix,
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Create the corresponding container; the labels are placeholders (all 0).
predSize <- length(predictionData)
predictionContainer <- create_container(
  predMatrix,
  labels = rep(0, predSize),
  testSize = seq_len(predSize),
  virgin = FALSE
)

# Predict.
results <- classify_model(predictionContainer, model)

您的问题在于:代码是在单词(词项)级别上处理训练数据并进行分类的,因此每个完整的数字串都被当作一个独立的词项:

> dtMatrix$dimnames$Terms
 [1] "87149"    "871984"   "87345"    "873493"   "873790"   "873949"   "874303"   "874304"   "874343"   "87678"    "87678367"
[12] "877886"   "878770"   "87890"    "8790"     "879034"   "87943"    "879430"   "879434"   "87956"    "92325"    "93023"   
[23] "930982"   "93111"    "931932"   "93204"    "93345"    "933922"   "93434"    "934390"   "93554"    "937899"   "93868"   
[34] "938769"   "938900"   "93930"    "93970"    "mrr"
我不完全确定 SVM 是如何处理这些数字字符串的,但它似乎不太关心字符串中的 “93” 部分。将字符串拆分为单个字符(character)会使每个数字具有更大的权重:

# Split every string into its individual characters and re-join them with
# single spaces, so that each character becomes its own whitespace-delimited
# token. strsplit() is already vectorised over df$Text, and vapply() pins the
# result to one character string per element (also for zero-length input,
# where 1:length(x) would iterate over c(1, 0)).
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " ",
  USE.NAMES = FALSE
)
[1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "m" "r"

更重要的是:

> results 
  SVM_LABEL  SVM_PROB
1         1 0.9144185
我最近参加了一个关于 RTextTools 和 SVM 的研讨会,他们说,使用 SVM 时,每次训练模型得到的结果都会略有不同。我不完全清楚原因,所以不打算尝试解释;不过他们向我们推荐了一本免费的书《统计学习导论:基于 R 的应用》(An Introduction to Statistical Learning with Applications in R),可以从中了解支持向量机。

以下是完整的代码:

# Training data as produced by dput(): 36 labelled strings. Text is stored as
# a factor (integer codes into .Label); Y holds the class labels 1 / -1.
df <- structure(
  list(
    Text = structure(
      c(
        26L, 28L, 30L, 34L, 36L, 31L, 32L, 33L, 35L, 21L, 24L, 27L,
        29L, 25L, 22L, 23L, 10L, 20L, 14L, 13L, 12L, 11L, 15L, 3L,
        1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 19L, 2L, 6L
      ),
      .Label = c(
        "MRR   87149", "MRR 871984", "MRR 87345", "MRR 873493",
        "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304",
        "MRR 874343", "MRR 87678", "MRR 87678367", "MRR 877886",
        "MRR 878770", "MRR 87890", "MRR 8790", "MRR 879034",
        "MRR 87943", "MRR 879430", "MRR 879434", "MRR 87956",
        "MRR 92325", "MRR 93023", "MRR 930982", "MRR 931932",
        "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434",
        "MRR 934390", "MRR 93554", "MRR 937899", "MRR 93868",
        "MRR 938769", "MRR 938900", "MRR 93930", "MRR 93970"
      ),
      class = "factor"
    ),
    Y = c(
      1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
      1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
      -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L,
      -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L
    )
  ),
  .Names = c("Text", "Y"),
  class = "data.frame",
  row.names = c(NA, -36L)
)

# Text is a factor in the dput() output; strsplit() below requires character.
df$Text <- as.character(df$Text)

# Number of labelled rows, recorded before the unlabelled document is added.
n_train <- nrow(df)

# New data: append the document to classify as the last row, with an empty
# label.
# NOTE(review): assigning a character vector to the row also coerces Y to
# character; the labels are still passed on as-is below. Confirm this is
# acceptable before relying on Y being numeric.
df[nrow(df) + 1, ] <- c("MRR    93111", "")
n_total <- nrow(df)

# Split every string into single characters joined by spaces, so each
# character becomes its own token. strsplit() is vectorised; vapply()
# guarantees one character string per row.
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " ",
  USE.NAMES = FALSE
)

# Create the document-term matrix; minWordLength = 1 is set because every
# token is now a single character.
dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1,
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Inspect the resulting terms and document ids.
dtMatrix$dimnames$Terms
dtMatrix$dimnames$Docs

# Configure the container: the labelled rows are the training set, the
# appended last row is the (unlabelled, hence virgin = TRUE) test set.
container <- create_container(
  dtMatrix,
  labels = df$Y,
  trainSize = seq_len(n_train),
  testSize = n_total,
  virgin = TRUE
)

# Train an SVM model with a linear kernel.
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# Classify the test row and show the predicted label and probability.
results <- classify_model(container, model)

results

你能给我们提供 dput 的输出,而不是手写你的训练集吗?更新1:你需要的是这个吗?非常感谢 JonGrub,你已经解决了这个问题。顺便说一下,我不明白为什么 “stripWhitespace” 标志实际上不起作用。根据我的理解,它应该删除空白,这样训练集中就不会出现空白,例如:“MRR 93345” 应该变成 “MRR93345”,并作为一个单独的词项处理。在这种情况下,我们会得到一个不那么稀疏的矩阵,并且对包含 “MRR93” 的文本做出匹配预测的可能性肯定更高。但也许我遗漏了什么…你对 stripWhitespace 的作用理解有误。来自 tm 的帮助文档:“文本文档中的多个连续空白字符会被折叠为一个空白”。这可能很有用,因为文本是按照从一个空格到下一个空格的方式拆分成单词的。但是,在 RTextTools 中,它似乎根本不起作用,因为 stripWhitespace=TRUE 或 FALSE 的结果相同。试着使用
dtMatrix 好吧,我原以为它类似于 trim() 函数,而实际上它只是把 N 个连续空格减少为 1 个。例如:[MRR\x20\x20\x2093111] -> [MRR 93111]。使用你的函数之后,我得到了这个错误:
strsplit(dt$Text[i], split = "") 中的错误:非字符参数(non-character argument)。
如果我将其改为 strsplit(as.character(dt$Text[i]), split = ""),
当生成矩阵时,我得到:
`[.simple_triplet_matrix`(matrix, , sort(colnames(matrix))) 中的错误:无效的下标类型:NULL(Invalid subscript type: NULL)。
在您提供的 dput 中,Text 列的类是因子(factor,可用
class(df$Text)
查看)。请尝试
df$Text <- as.character(df$Text)
> dtMatrix$dimnames$Terms
> results 
  SVM_LABEL  SVM_PROB
1         1 0.9144185
# Training data as produced by dput(): 36 labelled strings. Text is stored as
# a factor (integer codes into .Label); Y holds the class labels 1 / -1.
df <- structure(
  list(
    Text = structure(
      c(
        26L, 28L, 30L, 34L, 36L, 31L, 32L, 33L, 35L, 21L, 24L, 27L,
        29L, 25L, 22L, 23L, 10L, 20L, 14L, 13L, 12L, 11L, 15L, 3L,
        1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 19L, 2L, 6L
      ),
      .Label = c(
        "MRR   87149", "MRR 871984", "MRR 87345", "MRR 873493",
        "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304",
        "MRR 874343", "MRR 87678", "MRR 87678367", "MRR 877886",
        "MRR 878770", "MRR 87890", "MRR 8790", "MRR 879034",
        "MRR 87943", "MRR 879430", "MRR 879434", "MRR 87956",
        "MRR 92325", "MRR 93023", "MRR 930982", "MRR 931932",
        "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434",
        "MRR 934390", "MRR 93554", "MRR 937899", "MRR 93868",
        "MRR 938769", "MRR 938900", "MRR 93930", "MRR 93970"
      ),
      class = "factor"
    ),
    Y = c(
      1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
      1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
      -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L,
      -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L
    )
  ),
  .Names = c("Text", "Y"),
  class = "data.frame",
  row.names = c(NA, -36L)
)

# Text is a factor in the dput() output; strsplit() below requires character.
df$Text <- as.character(df$Text)

# Number of labelled rows, recorded before the unlabelled document is added.
n_train <- nrow(df)

# New data: append the document to classify as the last row, with an empty
# label.
# NOTE(review): assigning a character vector to the row also coerces Y to
# character; the labels are still passed on as-is below. Confirm this is
# acceptable before relying on Y being numeric.
df[nrow(df) + 1, ] <- c("MRR    93111", "")
n_total <- nrow(df)

# Split every string into single characters joined by spaces, so each
# character becomes its own token. strsplit() is vectorised; vapply()
# guarantees one character string per row.
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " ",
  USE.NAMES = FALSE
)

# Create the document-term matrix; minWordLength = 1 is set because every
# token is now a single character.
dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1,
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Inspect the resulting terms and document ids.
dtMatrix$dimnames$Terms
dtMatrix$dimnames$Docs

# Configure the container: the labelled rows are the training set, the
# appended last row is the (unlabelled, hence virgin = TRUE) test set.
container <- create_container(
  dtMatrix,
  labels = df$Y,
  trainSize = seq_len(n_train),
  testSize = n_total,
  virgin = TRUE
)

# Train an SVM model with a linear kernel.
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# Classify the test row and show the predicted label and probability.
results <- classify_model(container, model)

results