Warning: file_get_contents(/data/phpspider/zhask/data//catemap/1/ms-access/4.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R:LIME 报出特征数量不一致的错误,但实际情况并非如此_R_Text Classification_Quanteda_Lime - Fatal编程技术网

R:LIME 报出特征数量不一致的错误,但实际情况并非如此

R:LIME在';情况并非如此,r,text-classification,quanteda,lime,R,Text Classification,Quanteda,Lime,我正在构建克林顿和特朗普推特的文本分类器(数据可以在上找到) 我正在使用quanteda软件包进行EDA和建模: library(dplyr) library(stringr) library(quanteda) library(lime) #data prep tweet_csv <- read_csv("tweets.csv") tweet_data <- tweet_csv %>% select(author = handle, text, r

我正在构建克林顿和特朗普推特的文本分类器(数据可以在上找到)

我正在使用
quanteda
软件包进行EDA和建模:

library(dplyr)
library(readr)      # read_csv()
library(lubridate)  # as_date(), hour(), hms()
library(stringr)
library(quanteda)
library(lime)

# Data prep ----
# Keep the columns of interest, split the timestamp string into a date and an
# hour-of-day, and number the tweets in their original row order.
tweet_csv <- read_csv("tweets.csv")
tweet_data <- tweet_csv %>%
  select(
    author = handle,
    text,
    retweet_count,
    favorite_count,
    source_url,
    timestamp = time
  ) %>%
  mutate(
    date = as_date(str_sub(timestamp, 1, 10)),
    hour = hour(hms(str_sub(timestamp, 12, 19))),
    tweet_num = row_number()
  ) %>%
  select(-timestamp)

# Corpus and document-feature matrix ----
tweet_corpus <- corpus(tweet_data)

edited_dfm <- dfm(
  tweet_corpus,
  remove_url = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english")
)

# Train/test split ----
# 80% of the rows, sampled without replacement, go to the training set; the
# seed makes the split reproducible.
set.seed(32984)
trainIndex <- sample.int(
  n = nrow(tweet_csv),
  size = floor(.8 * nrow(tweet_csv)),
  replace = FALSE
)

train_dfm <- edited_dfm[trainIndex, ]
train_raw <- tweet_data[trainIndex, ]
train_label <- train_raw$author == "realDonaldTrump"

test_dfm <- edited_dfm[-trainIndex, ]
test_raw <- tweet_data[-trainIndex, ]
test_label <- test_raw$author == "realDonaldTrump"

# Make sure the train and test sets have the same features.
test_dfm <- dfm_select(test_dfm, train_dfm)

# Naive Bayes model ----
# Fit on the training labels. The original passed the undefined name
# `train_labels`; the variable created above is `train_label`.
nb_model <- quanteda::textmodel_nb(train_dfm, train_label)
nb_preds <- predict(nb_model, test_dfm)


# Inspect the class of the fitted model; the S3 methods defined for lime
# further down are named after it.
class(nb_model)

# S3 method of model_type() for objects of class "textmodel_nb_fitted"
# (presumably lime's model_type() generic -- confirm).
#
# x:   the fitted model; not inspected.
# ...: ignored.
# Returns the constant string "classification".
model_type.textmodel_nb_fitted <- function(x, ...) {
  "classification"
}

# a wrapper-up function for data preprocessing

# Preprocessing wrapper: converts a data frame into a document-feature matrix
# using the same dfm() options as `edited_dfm` (URLs, punctuation and English
# stopwords removed).
#
# df: object passed straight to corpus().
# Returns the dfm built from `df`. The original ended in an assignment and so
# returned this value invisibly; it is now returned visibly.
get_matrix <- function(df) {
  # Local name chosen so it does not shadow quanteda's corpus()/dfm().
  df_corpus <- corpus(df)
  dfm(
    df_corpus,
    remove_url = TRUE,
    remove_punct = TRUE,
    remove = stopwords("english")
  )
}
predict.textmodel_nb_fitted(x, newdata = newdata, type = type, ...) 中出错:新数据中的特征集与训练集中的特征集不同(feature set in newdata different from that in training set)


这与
quanteda
和dfms有关吗?我真的不明白为什么会发生这种情况。任何帮助都会很好,谢谢!

我们可以将错误追溯到
predict_model
,它调用
predict.textmodel_nb_fitted
(我只使用了
train_raw
的前10行来加速计算):

问题是
predict.textmodel_nb_fitted
需要的是dfm,而不是数据帧。例如,
predict(nb_model, test_raw[1:5])
会给您相同的“新数据中的特征集与训练集中的特征集不同”错误。但是,
explain
将数据帧作为其
x
参数。

解决方案是为
predict_model
编写一个自定义的
textmodel_nb_fitted
方法,在调用
predict.textmodel_nb_fitted
之前进行必要的对象转换。

# predict_model() method for objects of class "textmodel_nb_fitted".
# Converts `newdata` to a dfm before predicting, because the model's predict()
# method needs a dfm rather than a data frame.
#
# x:       fitted model; `x$data$x` is used as the feature pattern
#          (presumably the training dfm stored on the model -- confirm).
# newdata: object accepted by corpus().
# type:    "raw" for a one-column data frame `Response` of predicted classes,
#          "prob" for a data frame of posterior probabilities.
# ...:     forwarded to predict().
# Stops with an error for any other `type`; the original switch() had no
# default branch and silently returned NULL in that case.
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
  # Validate first so a bad `type` fails before the costly conversion.
  type_ok <- is.character(type) && length(type) == 1L &&
    type %in% c("raw", "prob")
  if (!type_ok) {
    stop("type must be either \"raw\" or \"prob\"", call. = FALSE)
  }

  # Build a dfm from the new data and align its features with x$data$x.
  newdata_dfm <- dfm_select(dfm(corpus(newdata)), x$data$x)
  res <- predict(x, newdata = newdata_dfm, ...)

  switch(
    type,
    raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
    prob = as.data.frame(res$posterior.prob, check.names = FALSE)
  )
}

predict_model.textmodel_nb_fitted:请参见以下答案:。您需要先对
newdata
参数使用
dfm_select()
,再将其传给
predict()
。你好@Ken,是的,我看到了那个问题,不过黄伟煌(Weihuang)提供的答案已经解决了这个问题。但是,正如你从下面的评论中看到的,我不得不改变一些分析步骤,这又导致了不同的问题。你可以看看我发的新问题,如有任何提示我将不胜感激!!谢谢你,@Weihuang,你的答案确实解决了这个问题,但我意识到LIME不会只考虑文本特征,而且绘制解释结果也很有挑战性。我重新表述了我的问题,并改变了几个步骤来解决它,但这次我遇到了不同的错误。我发布了一个新问题,所以我想知道你是否可以看一看?再次感谢你的帮助,这非常宝贵!
# Ask lime to explain predictions for train_raw. `explainer` is not defined in
# this excerpt and must already exist.
# NOTE(review): single-index `[1:5]` on a data frame selects columns 1-5 and
# keeps every row -- it does not take the first five rows. Confirm intent.
explanation <- lime::explain(train_raw[1:5], 
                              explainer, 
                              n_labels = 1,
                              n_features = 6,
                              cols = 2,
                              verbose = 0)
# Print the call stack of the most recent error. The recorded stack below
# comes from a call using train_raw[1:10, 1:5] and n_features = 5 (frames 1-2),
# not from the exact call above. Frame 4 shows the prediction being dispatched
# through predict_model.default.
traceback()
# 7: stop("feature set in newdata different from that in training set")
# 6: predict.textmodel_nb_fitted(x, newdata = newdata, type = type, 
#        ...)
# 5: predict(x, newdata = newdata, type = type, ...)
# 4: predict_model.default(explainer$model, case_perm, type = o_type)
# 3: predict_model(explainer$model, case_perm, type = o_type)
# 2: explain.data.frame(train_raw[1:10, 1:5], explainer, n_labels = 1, 
#        n_features = 5, cols = 2, verbose = 0)
# 1: lime::explain(train_raw[1:10, 1:5], explainer, n_labels = 1, 
#        n_features = 5, cols = 2, verbose = 0)
# predict_model() method for objects of class "textmodel_nb_fitted".
# Converts `newdata` to a dfm before predicting, because the model's predict()
# method needs a dfm rather than a data frame.
#
# x:       fitted model; `x$data$x` is used as the feature pattern
#          (presumably the training dfm stored on the model -- confirm).
# newdata: object accepted by corpus().
# type:    "raw" for a one-column data frame `Response` of predicted classes,
#          "prob" for a data frame of posterior probabilities.
# ...:     forwarded to predict().
# Stops with an error for any other `type`; the original switch() had no
# default branch and silently returned NULL in that case.
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
  # Validate first so a bad `type` fails before the costly conversion.
  type_ok <- is.character(type) && length(type) == 1L &&
    type %in% c("raw", "prob")
  if (!type_ok) {
    stop("type must be either \"raw\" or \"prob\"", call. = FALSE)
  }

  # Build a dfm from the new data and align its features with x$data$x.
  newdata_dfm <- dfm_select(dfm(corpus(newdata)), x$data$x)
  res <- predict(x, newdata = newdata_dfm, ...)

  switch(
    type,
    raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
    prob = as.data.frame(res$posterior.prob, check.names = FALSE)
  )
}
# Re-run the explanation on rows 1-10 and columns 1-5 of train_raw.
# `explainer` is not defined in this excerpt and must already exist.
# Presumably run after predict_model.textmodel_nb_fitted has been defined, so
# that the custom method is used instead of predict_model.default -- confirm.
explanation <- lime::explain(train_raw[1:10, 1:5], 
                              explainer,
                              n_labels = 1,
                              n_features = 5,
                              cols = 2,
                              verbose = 0)
# Show the first row and first five columns of the result; the lines below are
# the recorded console output.
explanation[1, 1:5]
#       model_type case label label_prob    model_r2
# 1 classification    1 FALSE  0.9999986 0.001693861