R 试图用数值替换数据帧中的字符值,错误“0”;无效的因子级别,NA生成“;

R 试图用数值替换数据帧中的字符值,错误“0”;无效的因子级别,NA生成“;,r,R,我正在尝试做一些预处理,希望将classe因子值{A,B,C,D,E}转换为{1,2,3,4,5} classe列的类型为factor,我已经提供了所有步骤,请参见以下内容: #get the data training <- read.table("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",header=TRUE, sep=",", na.strings="NA", dec="

我正在尝试做一些预处理,希望将
classe
因子值
{A,B,C,D,E}
转换为
{1,2,3,4,5}

classe
列的类型为
factor
,我已经提供了所有步骤,请参见以下内容:

    #get the data
    training <- read.table("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
    training_df <- data.frame(training,stringsAsFactors=FALSE)

    #split to training & test sets
    inTrain <- createDataPartition(y=training$classe, p=0.75, list=FALSE)
    training_data <- training[inTrain,]
    testing_data <- training[-inTrain,]

    #subset based on columns of interest, based on previous studies
    training_data_subset <- subset(training_data, select=c("avg_roll_belt","var_roll_belt","var_total_accel_belt","amplitude_roll_belt","max_roll_belt","var_roll_belt",
    "var_accel_arm","magnet_arm_x","magnet_arm_y","magnet_arm_z","accel_dumbbell_y","accel_dumbbell_z","magnet_dumbbell_x","gyros_dumbbell_x",
    "gyros_dumbbell_y","gyros_dumbbell_z","pitch_forearm","gyros_forearm_x","gyros_forearm_y","classe"))

    #see which columns are factors, the training_data_subset#classe feature is a factor
    sapply(training_data_subset, class)

#sapply output

 avg_roll_belt        var_roll_belt var_total_accel_belt  amplitude_roll_belt        max_roll_belt 
           "numeric"            "numeric"            "numeric"            "numeric"            "numeric" 
     var_roll_belt.1        var_accel_arm         magnet_arm_x         magnet_arm_y         magnet_arm_z 
           "numeric"            "numeric"            "integer"            "integer"            "integer" 
    accel_dumbbell_y     accel_dumbbell_z    magnet_dumbbell_x     gyros_dumbbell_x     gyros_dumbbell_y 
           "integer"            "integer"            "integer"            "numeric"            "numeric" 
    gyros_dumbbell_z        pitch_forearm      gyros_forearm_x      gyros_forearm_y               classe 
           "numeric"            "numeric"            "numeric"            "numeric"             "factor" 
factorsToNumeric <- function(data)
{
    data_numeric <- data
    data_numeric$classe <-as.numeric(factor(toupper(as.character(data_numeric$classe))))
    #loop through the data frame based on replace values
    for(i in 1:nrow(data_numeric)) 
    {

    if ((data_numeric[i,]$classe == "A") || (data_numeric[i,]$classe  == "a")) 
    {data_numeric[i,]$classe <- "1"}
    else if ((data_numeric[i,]$classe  == "B") || (data_numeric[i,]$classe  == "b"))
    {data_numeric[i,]$classe <- "2"}
    else if ((data_numeric[i,]$classe  == "C") || (data_numeric[i,]$classe  == "c"))
    {data_numeric[i,]$classe <- "3"}
    else if ((data_numeric[i,]$classe  == "D") || (data_numeric[i,]$classe  == "d"))
    {data_numeric[i,]$classe <- "4"}
    else if ((data_numeric[i,]$classe  == "E") || (data_numeric[i,]$classe  == "e"))
    {data_numeric[i,]$classe <- "5"}
    else 
    {
    #do nothing 
    }

    }

    return (data_numeric)
}

但是,head函数确认上述错误&所有值A、B、C、D、E都被错误地替换为
NA

如果要转换
训练数据子集的
classe
列,则无需定义自己的函数。您可以使用
字母
向量:

sapply(training_data_subset[,'classe'], function(x) which(LETTERS==x))

因子不是这样工作的。你不能用简单的
来更改值。你可以尝试以下方法
训练\u数据\u子集$classe@docendo discimus,我在factorsToNumeric()函数的开头添加了你建议的正确变体,它起了作用,请参见编辑的帖子。你误解了。该代码应替换该函数。
Warning messages:
1: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
2: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
3: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
4: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
5: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
6: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
7: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
8: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
9: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
  invalid factor level, NA generated
 sapply(training_data_subset_numeric, class)

 avg_roll_belt        var_roll_belt var_total_accel_belt  amplitude_roll_belt        max_roll_belt 
       "numeric"            "numeric"            "numeric"            "numeric"            "numeric" 
 var_roll_belt.1        var_accel_arm         magnet_arm_x         magnet_arm_y         magnet_arm_z 
       "numeric"            "numeric"            "integer"            "integer"            "integer" 
accel_dumbbell_y     accel_dumbbell_z    magnet_dumbbell_x     gyros_dumbbell_x     gyros_dumbbell_y 
       "integer"            "integer"            "integer"            "numeric"            "numeric" 
gyros_dumbbell_z        pitch_forearm      gyros_forearm_x      gyros_forearm_y               classe 
       "numeric"            "numeric"            "numeric"            "numeric"            "numeric"
sapply(training_data_subset[,'classe'], function(x) which(LETTERS==x))
set.seed(2)
x <- sample(training$classe, 20)
x
# [1] A D C A E E A E B C C A D A B E E A B A
# Levels: A B C D E
levels(x) <- 1:5
x
# [1] 1 4 3 1 5 5 1 5 2 3 3 1 4 1 2 5 5 1 2 1
# Levels: 1 2 3 4 5
levels(training$classe) <- 1:5
str(training$classe)
# Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...