R 将自定义函数应用于每行只使用参数的第一个值_R_Apply_Na_Missing Data

R 将自定义函数应用于每行只使用参数的第一个值

R 将自定义函数应用于每行只使用参数的第一个值,r,apply,na,missing-data,R,Apply,Na,Missing Data,我正在尝试使用以下数据集将NA值重新编码为0： set.seed(1) df <- data.frame( id = c(1:10), trials = sample(1:3, 10, replace = T), t1 = c(sample(c(1:9, NA), 10)), t2 = c(sample(c(1:7, rep(NA, 3)), 10)), t3 = c(sample(c(1:5, rep(NA, 5)), 10)) ) 这适用于单个向量。当我尝试使

我正在尝试使用以下数据集将

NA

值重新编码为 0：

# Reproducible example data: ten rows with an id, a trial count drawn from
# 1..3, and three score columns t1..t3 that contain 1, 3 and 5 missing values
# respectively (each column is a random permutation of its value pool).
set.seed(1)
df <- data.frame(
  id = 1:10,
  # TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
  trials = sample(1:3, 10, replace = TRUE),
  t1 = sample(c(1:9, NA), 10),
  t2 = sample(c(1:7, rep(NA, 3)), 10),
  t3 = sample(c(1:5, rep(NA, 5)), 10)
)

这适用于单个向量。当我尝试使用

apply（）

将同一函数应用于数据帧时，尽管：

# NOTE: `num.sun = df$trials` hands the complete trials vector to every
# per-row call of replace0, not just the value belonging to that row.
apply(df, 1, replace0, num.sun = df$trials)

我收到一条警告说：

In 1:(num.sun + 2) :
  numerical expression has 10 elements: only the first used

结果是，

apply（）

只对每一行使用

trials

列中的第一个值，而不是让

num.sun

的值根据

trials

中的值更改每一行。如何应用该函数，使

num.sun

参数根据

df$trials

的值变化？

谢谢

编辑：正如一些人评论的那样，原始示例数据有一些非NA分数，根据试验列，这些分数没有意义。以下是更正后的数据集：

# Corrected example data: in every row the non-missing scores all sit within
# the first `trials` score columns.
df <- data.frame(
  id = 1:5,
  trials = rep(c(1, 2, 3), times = c(2, 1, 2)),
  t1 = c(NA, 7, NA, 6, NA),
  t2 = c(NA, NA, 3, 7, 12),
  t3 = c(NA, NA, NA, 4, NA)
)

df

以下是一种方法：
# Logical matrix marking every missing cell of df.
x <- is.na(df)
# Running count of missing cells along each row, read left to right.
na_running <- t(apply(x, 1, cumsum))
# Zero a missing cell once its row's running NA count exceeds 3 - trials.
# The threshold vector is recycled down the columns, so each row is compared
# against its own trials value.
df[x & (na_running > 3 - df$trials)] <- 0

df
   id trials t1 t2 t3
1   1      1  3 NA  5
2   2      2  2  2 NA
3   3      2  6  6  4
4   4      3  0  1  2
5   5      1  5 NA NA
6   6      3  7  0  0
7   7      3  8  7  0
8   8      2  4  5  1
9   9      2  1  3 NA
10 10      1  9  4  3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
   id trials t1 t2 t3
1   1      1  3 NA  5
2   2      2  2  2 NA
3   3      2  6  6  4
4   4      3  0  1  2
5   5      1  5 NA NA
6   6      3  7  0  0
7   7      3  8  7  0
8   8      2  4  5  1
9   9      2  1  3 NA
10 10      1  9  4  3

注意：第1/3/10行是有问题的，因为非NA值比试验值多。
这里我只是使用双重子集 x[paste0('t', x['trials'])]
重写了您的函数，这克服了其他两个解决方案在第6行上的问题
# Zero-fill the score of a row's last trial when that score is missing.
#
# x: one row of the data as a named vector (as supplied by apply(df, 1, ...)).
#    It must carry a "trials" element; scores are looked up under the names
#    "t1", "t2", ... built from that value.
# Returns x, with the element named paste0("t", x["trials"]) set to 0 if it
# was NA. All other elements are left untouched.
replace0 <- function(x) {
  target <- paste0("t", x["trials"])
  # Only touch an element that actually exists: assigning to an unknown name
  # would append a new element and silently change the length of the row.
  if (target %in% names(x) && is.na(x[target])) {
    x[target] <- 0
  }
  x
}

# apply() over rows returns each processed row as a column of its result,
# so transpose to get back one row per record.
t(apply(df, 1, replace0))

     id trials t1 t2 t3
[1,]  1      1  3 NA  5
[2,]  2      2  2  2 NA
[3,]  3      2  6  6  4
[4,]  4      3 NA  1  2
[5,]  5      1  5 NA NA
[6,]  6      3  7 NA  0
[7,]  7      3  8  7  0
[8,]  8      2  4  5  1
[9,]  9      2  1  3 NA
[10,] 10      1  9  4  3

另一种方法：
# create an index of the NA values
w <- which(is.na(df), arr.ind = TRUE)

# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)

# subset 'w' such that only the NA's which fall in the scope of 'm' remain
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]

# use 'i' to replace the allowed NA's with a zero
df[i] <- 0

您可以轻松地将其包装到函数中：
# Replace missing values among the leading elements of one row with 0.
#
# x:       one row of the data as a vector; assumed to be laid out as
#          id, trials, t1, t2, ... (hence the offset of 2 below).
# num.sun: number of trials for this row, a single number. The first
#          num.sun + 2 elements of x are inspected.
# Returns x with every NA inside that leading range set to 0; elements
# beyond the range are left as they are and the length of x never changes.
replace0 <- function(x, num.sun) {
  # `1:` is kept so that a vector-valued num.sun still triggers R's usual
  # "only the first used" warning instead of being silently accepted.
  scope <- 1:(num.sun + 2)
  # Ignore positions past the end of x: reading them yields NA, and assigning
  # to them would extend x with spurious zeros.
  scope <- scope[scope <= length(x)]
  x[scope[is.na(x[scope])]] <- 0
  x
}

# Replace NA by 0 in the cells that fall within each row's trial range.
#
# df: a data frame with a numeric `trials` column; a missing cell of row r is
#     replaced when its column position is at most df$trials[r] + 2 (the
#     offset of 2 presumably skips the leading id and trials columns).
# Returns df with the qualifying NA cells set to 0; NA cells in columns
# beyond that position are kept.
replace.NA.with.0 <- function(df) {
  # (row, col) positions of every missing cell.
  w <- which(is.na(df), arr.ind = TRUE)
  # Last column position in which a row's NA may be replaced.
  max_col <- df$trials + 2
  in_scope <- w[, 2] <= max_col[w[, 1]]
  # drop = FALSE keeps a two-column index matrix even when exactly one NA
  # qualifies; a dropped vector would make `[<-` overwrite whole columns.
  df[w[in_scope, , drop = FALSE]] <- 0
  df
}

现在，使用replace.with.NA.or.0（df）
生成以下结果：
这里有一个 tidyverse 方法。请注意，它不会给出与其他解决方案相同的输出：
您的示例数据中包含“没有发生”的试验的结果，我假设您的真实数据中没有这种情况。
library(tidyverse)
# Per-row treatment of the score columns: zero-fill NAs, then keep only the
# first `trials` of them.
df %>%
  # Pack the columns whose names match "^t\\d" into a list-column; the next
  # step reads it under the name `data`.
  nest(matches("^t\\d")) %>%
  # For each row: replace NA by 0 in every nested column, then select the
  # first `.y` (= trials) columns.
  mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
  # Expand the list-column back into regular columns; judging by the printed
  # result, columns dropped for a row come back as NA.
  unnest

#    id trials t1 t2 t3
# 1   1      1  3 NA NA
# 2   2      2  2  2 NA
# 3   3      2  6  6 NA
# 4   4      3  0  1  2
# 5   5      1  5 NA NA
# 6   6      3  7  0  0
# 7   7      3  8  7  0
# 8   8      2  4  5 NA
# 9   9      2  1  3 NA
# 10 10      1  9 NA NA

使用更常用的 gather
策略，写法如下：
# Long-format variant: keep the first `trials` scores per id, zero-fill the
# NAs among them, then reshape back to wide.
df %>%
  # Stack the columns matching "^t\\d" into key column k and value column v.
  gather(k,v,matches("^t\\d")) %>%
  arrange(id) %>%
  group_by(id) %>%
  # Keep the first `trials` rows of each id group.
  # NOTE(review): presumably rows within an id are still ordered t1, t2, t3
  # at this point; confirm.
  slice(1:first(trials)) %>%
  # Replace the remaining NA values in v by 0.
  mutate_at("v",~replace(.,is.na(.),0)) %>%
  # Back to one column per key; judging by the printed result, cells sliced
  # away above show up as NA.
  spread(k,v)

# # A tibble: 10 x 5
# # Groups:   id [10]
#       id trials    t1    t2    t3
#    <int>  <int> <dbl> <dbl> <dbl>
#  1     1      1     3    NA    NA
#  2     2      2     2     2    NA
#  3     3      2     6     6    NA
#  4     4      3     0     1     2
#  5     5      1     5    NA    NA
#  6     6      3     7     0     0
#  7     7      3     8     7     0
#  8     8      2     4     5    NA
#  9     9      2     1     3    NA
# 10    10      1     9    NA    NA

df %>%
  gather(k,v,matches("^t\\d")) %>%
  arrange(id) %>%
  group_by(id) %>%
  slice(1:first(trials)) %>%
  mutate_at("v",~replace(.,is.na(.),0)) %>%
  spread(k,v)
# # A tibble: 10 x 5
# # Groups:   id [10]
#       id trials    t1    t2    t3
#    <int>  <int> <dbl> <dbl> <dbl>
#  1     1      1     3    NA    NA
#  2     2      2     2     2    NA
#  3     3      2     6     6    NA
#  4     4      3     0     1     2
#  5     5      1     5    NA    NA
#  6     6      3     7     0     0
#  7     7      3     8     7     0
#  8     8      2     4     5    NA
#  9     9      2     1     3    NA
# 10    10      1     9    NA    NA
第3行和第10行也是有问题的。
@Moody_Mudskipper，谢谢你的评论。我编辑了答案以反映这一点。
# Set every NA to 0, then blank out (set to NA) all cells that lie beyond
# each row's trial range.
#
# df: a data frame with a numeric `trials` column; cells of row r whose column
#     position exceeds df$trials[r] + 2 are set to NA (the offset of 2
#     presumably skips the leading id and trials columns).
# Returns the modified data frame.
#
# The function is self-contained: it derives the per-row limit from `df`
# itself instead of reading a matrix `m` from the calling environment, and it
# uses ncol(df) rather than a fixed width of 5 columns.
replace.with.NA.or.0 <- function(df) {
  # Step 1: every missing cell becomes 0.
  df[is.na(df)] <- 0

  # Step 2: logical mask, TRUE where the column position is past the row's
  # last allowed column (trials + 2).
  beyond <- outer(df$trials + 2, seq_len(ncol(df)), FUN = "<")
  df[beyond] <- NA

  df
}

   id trials t1 t2 t3
1   1      1  3 NA NA
2   2      2  2  2 NA
3   3      2  6  6 NA
4   4      3  0  1  2
5   5      1  5 NA NA
6   6      3  7  0  0
7   7      3  8  7  0
8   8      2  4  5 NA
9   9      2  1  3 NA
10 10      1  9 NA NA

library(tidyverse)
# Per-row treatment of the score columns: zero-fill NAs, then keep only the
# first `trials` of them.
df %>%
  # Pack the columns whose names match "^t\\d" into a list-column; the next
  # step reads it under the name `data`.
  nest(matches("^t\\d")) %>%
  # For each row: replace NA by 0 in every nested column, then select the
  # first `.y` (= trials) columns.
  mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
  # Expand the list-column back into regular columns; judging by the printed
  # result, columns dropped for a row come back as NA.
  unnest

#    id trials t1 t2 t3
# 1   1      1  3 NA NA
# 2   2      2  2  2 NA
# 3   3      2  6  6 NA
# 4   4      3  0  1  2
# 5   5      1  5 NA NA
# 6   6      3  7  0  0
# 7   7      3  8  7  0
# 8   8      2  4  5 NA
# 9   9      2  1  3 NA
# 10 10      1  9 NA NA

# Long-format variant: keep the first `trials` scores per id, zero-fill the
# NAs among them, then reshape back to wide.
df %>%
  # Stack the columns matching "^t\\d" into key column k and value column v.
  gather(k,v,matches("^t\\d")) %>%
  arrange(id) %>%
  group_by(id) %>%
  # Keep the first `trials` rows of each id group.
  # NOTE(review): presumably rows within an id are still ordered t1, t2, t3
  # at this point; confirm.
  slice(1:first(trials)) %>%
  # Replace the remaining NA values in v by 0.
  mutate_at("v",~replace(.,is.na(.),0)) %>%
  # Back to one column per key; judging by the printed result, cells sliced
  # away above show up as NA.
  spread(k,v)

# # A tibble: 10 x 5
# # Groups:   id [10]
#       id trials    t1    t2    t3
#    <int>  <int> <dbl> <dbl> <dbl>
#  1     1      1     3    NA    NA
#  2     2      2     2     2    NA
#  3     3      2     6     6    NA
#  4     4      3     0     1     2
#  5     5      1     5    NA    NA
#  6     6      3     7     0     0
#  7     7      3     8     7     0
#  8     8      2     4     5    NA
#  9     9      2     1     3    NA
# 10    10      1     9    NA    NA