Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/82.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
在data.frame中对多列中的行重新排序,然后删除仅使用NAs的行_R_Dataframe - Fatal编程技术网

在 data.frame 中对多列的值按组分别重新排序,然后删除所有值列全为 NA 的行

在data.frame中对多列中的行重新排序,然后删除仅使用NAs的行,r,dataframe,R,Dataframe,我有一个大data.frame,大约有100.000行和12列(3列包含变量,9个值/度量值),其中包含大量NA,类似于以下内容: ##Example data.frame Var1 <- c(rep("N01", 9), rep("N02",9)) Var2 <- c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c") Val1 <- c(NA,2,1,2,NA,1, NA,

我有一个大data.frame,大约有100.000行和12列(3列包含变量,9个值/度量值),其中包含大量NA,类似于以下内容:

## Example data.frame
# Two grouping variables (Var1 with 2 levels, Var2 with 3 levels, 3 rows per
# combination) and two value columns that contain many NAs.
Var1 <- rep(c("N01", "N02"), each = 9)
Var2 <- rep(rep(c("a", "b", "c"), each = 3), times = 2)
# Values are laid out three per Var1/Var2 combination
Val1 <- c(
  NA, 2, 1,   2, NA, 1,   NA, 2, NA,
  2, NA, NA,  NA, 2, NA,  1, NA, 2
)
Val2 <- c(
  2, NA, 1,   NA, 2, NA,  2, NA, 2,
  NA, 2, 2,   2, NA, 2,   NA, 2, NA
)
data <- data.frame(Var1 = Var1, Var2 = Var2, Val1 = Val1, Val2 = Val2)
data

Var1 Var2 Val1 Val2
N01    a   NA    2
N01    a    2   NA
N01    a    1    1
N01    b    2   NA
N01    b   NA    2
N01    b    1   NA
N01    c   NA    2
N01    c    2   NA
N01    c   NA    2
N02    a    2   NA
N02    a   NA    2
N02    a   NA    2
N02    b   NA    2
N02    b    2   NA
N02    b   NA    2
N02    c    1   NA
N02    c   NA    2
N02    c    2   NA
我编写了一个小循环,它按 Var1 和 Var2 的所有可能组合对 data.frame 取子集,
然后在每个子集内分别对 Val1 和 Val2 排序(NA 排在最后),并删除 Val1 和 Val2 全为 NA 的行。
这种方法可行,但看起来相当复杂,而且对于大型 data.frame 来说速度不快。有没有人有更好的办法,能从第一个 data.frame 得到最后的结果?
提前谢谢!

# Small loop
# For every Var1 x Var2 combination: sort Val1 and Val2 independently
# (NAs last), then drop the rows in which both value columns are NA.
# Relies on `data`, `Var1` and `Var2` created by the example data.frame code.
library(dplyr)
level.var1 <- unique(Var1)
level.var2 <- unique(Var2)
# Preallocate one slot per level instead of growing the lists inside the loops
Res.list1 <- vector("list", length(level.var2))
Res.list2 <- vector("list", length(level.var1))

# seq_along() gives an empty sequence when there are no levels, whereas
# 1:length(x) would iterate over c(1, 0) and fail on the `[[0]]` assignment
for (i in seq_along(level.var1)) {
  df.1 <- dplyr::filter(data, Var1 == level.var1[i])
  for (o in seq_along(level.var2)) {
    df.2 <- dplyr::filter(df.1, Var2 == level.var2[o])
    # na.last = TRUE keeps the NAs (at the end) so the sorted vectors have
    # the same length as the group; the default would remove them
    Val1.s <- sort(df.2$Val1, na.last = TRUE)
    Val2.s <- sort(df.2$Val2, na.last = TRUE)
    df.3 <- data.frame(df.2[, c(1:2)], Val1.s, Val2.s)
    # Vectorised "both value columns are NA" test; avoids apply() coercing
    # the data.frame to a matrix and calling a function per row
    row_to_del <- rowSums(is.na(df.3[, c(3, 4)])) == 2
    df.4 <- df.3[!row_to_del, ]
    Res.list1[[o]] <- df.4
  }
  # Stack the per-Var2 pieces for the current Var1 level
  df.5 <- do.call(rbind, Res.list1)
  Res.list2[[i]] <- df.5
}

# Stack the per-Var1 pieces into the final result
Res.final <- do.call(rbind, Res.list2)
#Small loop
library(dplyr)

level.var1 …

我发现 data.table 非常适合这种操作。正确指定 cols 变量后,以下解决方案将适用于任意数量的所需列:

library(data.table)

# Names of the value columns to reorder and to screen for all-NA rows
cols <- paste0("Val", seq_len(2L))

# Convert `data` to a data.table by reference, then, within each Var1/Var2
# group, sort every value column on its own with the NAs pushed to the end
setDT(data)
data[, (cols) := lapply(.SD, sort, na.last = TRUE),
     by = .(Var1, Var2),
     .SDcols = cols]

# TRUE for rows that still hold at least one non-NA value in `cols`
indx <- rowSums(!is.na(data[, cols, with = FALSE])) > 0

# Keep only those rows
data[indx]

#     Var1 Var2 Val1 Val2
#  1:  N01    a    1    1
#  2:  N01    a    2    2
#  3:  N01    b    1    2
#  4:  N01    b    2   NA
#  5:  N01    c    2    2
#  6:  N01    c   NA    2
#  7:  N02    a    2    2
#  8:  N02    a   NA    2
#  9:  N02    b    2    2
# 10:  N02    b   NA    2
# 11:  N02    c    1    2
# 12:  N02    c    2   NA
library(data.table)
## Define the columns you want to filter by

cols …

旁注:如果 data 是 tbl_df 格式(dplyr 包),使用 setDT 时会出现如下错误:
Error in `[.tbl_df`(setDT(data), , `:=`((cols), lapply(.SD, sort, na.last = TRUE)), : unused arguments (.SDcols = cols, by = .(Var1, Var2, Var3))
除此之外它可以正常工作。谢谢!

您使用的是哪个 data.table 版本?我想这个问题已经在最新版本中修复了。

您是对的。我使用的是 1.9.4 版;1.9.6 版(最新版本)可以正常工作。
# Small loop
# For every Var1 x Var2 combination: sort Val1 and Val2 independently
# (NAs last), then drop the rows in which both value columns are NA.
# Relies on `data`, `Var1` and `Var2` created by the example data.frame code.
library(dplyr)
level.var1 <- unique(Var1)
level.var2 <- unique(Var2)
# Preallocate one slot per level instead of growing the lists inside the loops
Res.list1 <- vector("list", length(level.var2))
Res.list2 <- vector("list", length(level.var1))

# seq_along() gives an empty sequence when there are no levels, whereas
# 1:length(x) would iterate over c(1, 0) and fail on the `[[0]]` assignment
for (i in seq_along(level.var1)) {
  df.1 <- dplyr::filter(data, Var1 == level.var1[i])
  for (o in seq_along(level.var2)) {
    df.2 <- dplyr::filter(df.1, Var2 == level.var2[o])
    # na.last = TRUE keeps the NAs (at the end) so the sorted vectors have
    # the same length as the group; the default would remove them
    Val1.s <- sort(df.2$Val1, na.last = TRUE)
    Val2.s <- sort(df.2$Val2, na.last = TRUE)
    df.3 <- data.frame(df.2[, c(1:2)], Val1.s, Val2.s)
    # Vectorised "both value columns are NA" test; avoids apply() coercing
    # the data.frame to a matrix and calling a function per row
    row_to_del <- rowSums(is.na(df.3[, c(3, 4)])) == 2
    df.4 <- df.3[!row_to_del, ]
    Res.list1[[o]] <- df.4
  }
  # Stack the per-Var2 pieces for the current Var1 level
  df.5 <- do.call(rbind, Res.list1)
  Res.list2[[i]] <- df.5
}

# Stack the per-Var1 pieces into the final result
Res.final <- do.call(rbind, Res.list2)
library(data.table)

# Names of the value columns to reorder and to screen for all-NA rows
cols <- paste0("Val", seq_len(2L))

# Convert `data` to a data.table by reference, then, within each Var1/Var2
# group, sort every value column on its own with the NAs pushed to the end
setDT(data)
data[, (cols) := lapply(.SD, sort, na.last = TRUE),
     by = .(Var1, Var2),
     .SDcols = cols]

# TRUE for rows that still hold at least one non-NA value in `cols`
indx <- rowSums(!is.na(data[, cols, with = FALSE])) > 0

# Keep only those rows
data[indx]

#     Var1 Var2 Val1 Val2
#  1:  N01    a    1    1
#  2:  N01    a    2    2
#  3:  N01    b    1    2
#  4:  N01    b    2   NA
#  5:  N01    c    2    2
#  6:  N01    c   NA    2
#  7:  N02    a    2    2
#  8:  N02    a   NA    2
#  9:  N02    b    2    2
# 10:  N02    b   NA    2
# 11:  N02    c    1    2
# 12:  N02    c    2   NA