R 循环遍历每一行并比较被迭代行的多列中的值
我有以下年度数据框 yearly:
ID Jan Feb March April May Jun Jul Aug Sept Oct Nov Dec
ABC 0 0 0 1 0 0 0 0 1 0 0 0
DEF 0 0 0 1 1 0 0 0 1 0 0 0
GHI 0 0 0 1 0 1 0 0 0 1 0 0
MNO 0 0 0 1 0 1 0 0 1 0 0 0
QAL 0 1 1 1 0 0 1 0 0 1 0 0
我希望遍历每一行,找出值为1且其后连续三列都为0的那一列。我想得到类似下面的结果,即列出其后至少连续3个月都为0的月份:
ID col1 col2
ABC April Sept
DEF May Sept
GHI Jun N/A
MNO Sept N/A
QAL N/A N/A
我已经知道了如何遍历向量并获得索引
# Column names of yearly, used to report which columns matched.
vec2 <- names(yearly)
# vec is each row of yearly
# Stop three positions before the end: indexing past the last element yields
# NA in R (not 0), and `if (NA)` raises an error instead of being skipped.
for (i in seq_len(max(length(vec) - 3L, 0L))) {
  if (vec[i] == 1) {
    # A 1 followed by three zeros: the next three values sum to 0.
    if (vec[i + 1] + vec[i + 2] + vec[i + 3] == 0) {
      print(vec2[1])
      print(vec2[i + 1])
    }
  }
}
由于每行的答案数量可变,我会选择列表。这种方法使用 rle
查找连续的零(游程),然后检查游程长度是否超过 2,最后返回这些游程之前那个月份的名称
# Data
df <- read.table(text = "ID Jan Feb March April May Jun Jul Aug Sept Oct Nov Dec
ABC 0 0 0 1 0 0 0 0 1 0 0 0
DEF 0 0 0 1 1 0 0 0 1 0 0 0
GHI 0 0 0 1 0 1 0 0 0 1 0 0
MNO 0 0 0 1 0 1 0 0 1 0 0 0
QAL 0 1 1 1 0 0 1 0 0 1 0 0",
  header = TRUE)
# Repackage as list: one single-row data frame per row of df (ID column
# removed), named by the ID values. Note that rownames(df$ID) would be NULL
# because df$ID is a plain column, which silently drops the names.
df_list <- setNames(split(df[, -1], seq_len(nrow(df))), as.character(df$ID))
# Count function
# Returns the names of the entries that immediately precede a run of three or
# more zeros in x. x must carry names (a named vector or a one-row data frame).
morpheus_count <- function(x) {
  # Run-length encode x; the names of the values are the names of the last
  # element of each run.
  runs <- rle(x)
  run_end_names <- names(runs$values)
  long_zero_run <- runs$values == 0 & runs$lengths > 2
  # Step back one run to reach the entry just before each long zero run; an
  # index of 0 (zero run at the very start) is dropped by the subsetting.
  run_end_names[which(long_zero_run) - 1]
}
# Run on list: apply morpheus_count to every element of df_list. lapply
# returns a list with one result per element.
lapply(df_list, morpheus_count)
由于每行的答案数量可变,所以我选择列表。这种方法使用 rle
查找连续的零(游程),然后检查游程长度是否超过 2,最后返回这些游程之前那个月份的名称
# Data
df <- read.table(text = "ID Jan Feb March April May Jun Jul Aug Sept Oct Nov Dec
ABC 0 0 0 1 0 0 0 0 1 0 0 0
DEF 0 0 0 1 1 0 0 0 1 0 0 0
GHI 0 0 0 1 0 1 0 0 0 1 0 0
MNO 0 0 0 1 0 1 0 0 1 0 0 0
QAL 0 1 1 1 0 0 1 0 0 1 0 0",
  header = TRUE)
# Repackage as list: one single-row data frame per row of df (ID column
# removed), named by the ID values. Note that rownames(df$ID) would be NULL
# because df$ID is a plain column, which silently drops the names.
df_list <- setNames(split(df[, -1], seq_len(nrow(df))), as.character(df$ID))
# Count function
# Returns the names of the entries that immediately precede a run of three or
# more zeros in x. x must carry names (a named vector or a one-row data frame).
morpheus_count <- function(x) {
  # Run-length encode x; the names of the values are the names of the last
  # element of each run.
  runs <- rle(x)
  run_end_names <- names(runs$values)
  long_zero_run <- runs$values == 0 & runs$lengths > 2
  # Step back one run to reach the entry just before each long zero run; an
  # index of 0 (zero run at the very start) is dropped by the subsetting.
  run_end_names[which(long_zero_run) - 1]
}
# Run on list: apply morpheus_count to every element of df_list. lapply
# returns a list with one result per element.
lapply(df_list, morpheus_count)
有不同的方法来解决这个问题:
字符串匹配
此方法使用字符串匹配,因此依赖于字符长度为1的值:
library(data.table)
library(magrittr)
# String-matching approach: for each ID, paste the selected columns into one
# string and locate every occurrence of "1000" (a 1 followed by three 0s).
yearly[,
{
# Reduce(paste0, .SD) concatenates the .SD columns element-wise into a string
Reduce(paste0, .SD) %>%
stringr::str_locate_all("1000") %>%
as.data.table()
},
# .SDcols = -"ID" keeps every column except ID in .SD; grouping is by ID
.SDcols = -"ID", by = "ID"][
# start is the match position within the pasted string; + 1L shifts it past
# the ID column (assumes ID is the first column of yearly -- TODO confirm)
, .(ID, month = names(yearly)[start + 1L])]
可根据OP的要求将其重塑为宽格式:
# String-matching approach, reshaped to wide format: for each ID, paste the
# selected columns into one string, locate every occurrence of "1000" (a 1
# followed by three 0s), then spread the matches into col1, col2, ...
yearly[,
{
# Reduce(paste0, .SD) concatenates the .SD columns element-wise into a string
Reduce(paste0, .SD) %>%
stringr::str_locate_all("1000") %>%
as.data.table()
},
# .SDcols = -"ID" keeps every column except ID in .SD; grouping is by ID
.SDcols = -"ID", by = "ID"][
# start is the match position within the pasted string; + 1L shifts it past
# the ID column (assumes ID is the first column of yearly -- TODO confirm)
, .(ID, month = names(yearly)[start + 1L])][
# one output column per match within an ID, named with the prefix "col"
, dcast(.SD, ID ~ rowid(ID, prefix = "col"))][
# join on the full ID column of yearly so IDs without any match are kept
yearly[, ID], on = "ID"]
ID col1 col2
1: ABC April Sept
2: DEF   May Sept
3: GHI   Jun <NA>
4: MNO  Sept <NA>
5: QAL  <NA> <NA>
数据
每年有不同的方法来解决这个问题:
字符串匹配
此方法使用字符串匹配,因此依赖于字符长度为1的值:
library(data.table)
library(magrittr)
# String-matching approach: for each ID, paste the selected columns into one
# string and locate every occurrence of "1000" (a 1 followed by three 0s).
yearly[,
{
# Reduce(paste0, .SD) concatenates the .SD columns element-wise into a string
Reduce(paste0, .SD) %>%
stringr::str_locate_all("1000") %>%
as.data.table()
},
# .SDcols = -"ID" keeps every column except ID in .SD; grouping is by ID
.SDcols = -"ID", by = "ID"][
# start is the match position within the pasted string; + 1L shifts it past
# the ID column (assumes ID is the first column of yearly -- TODO confirm)
, .(ID, month = names(yearly)[start + 1L])]
可根据OP的要求将其重塑为宽格式:
# String-matching approach, reshaped to wide format: for each ID, paste the
# selected columns into one string, locate every occurrence of "1000" (a 1
# followed by three 0s), then spread the matches into col1, col2, ...
yearly[,
{
# Reduce(paste0, .SD) concatenates the .SD columns element-wise into a string
Reduce(paste0, .SD) %>%
stringr::str_locate_all("1000") %>%
as.data.table()
},
# .SDcols = -"ID" keeps every column except ID in .SD; grouping is by ID
.SDcols = -"ID", by = "ID"][
# start is the match position within the pasted string; + 1L shifts it past
# the ID column (assumes ID is the first column of yearly -- TODO confirm)
, .(ID, month = names(yearly)[start + 1L])][
# one output column per match within an ID, named with the prefix "col"
, dcast(.SD, ID ~ rowid(ID, prefix = "col"))][
# join on the full ID column of yearly so IDs without any match are kept
yearly[, ID], on = "ID"]
ID col1 col2
1: ABC April Sept
2: DEF   May Sept
3: GHI   Jun <NA>
4: MNO  Sept <NA>
5: QAL  <NA> <NA>
数据
年度数据:
请注意:
- 确保数据类型为 data.frame
- 确保仅对 0/1 数据应用 fun1,这就是调用 df[,-1] 的原因
- 您可以在 fun1 内更改 n 以使用其他条件
数据:
请注意:
- 确保数据类型为 data.frame
- 确保仅对 0/1 数据应用 fun1,这就是调用 df[,-1] 的原因
- 您可以在 fun1 内更改 n 以使用其他条件
为什么 GHI 的结果不是 Jun、Oct?——因为 Oct 之后需要有 3 个 0,但实际只有 2 个 0,所以 Oct 不符合条件。
library(data.table)
library(magrittr)
# Join approach: slide a one-row pattern (1, 0, 0, 0) across consecutive
# columns of yearly and collect, per ID, the month at which the pattern starts.
# pattern to find matches
# (one row: values 1, 0, 0, 0 plus a month column that is filled in below)
tmp <- data.table(1L, 0L, 0L, 0L, month = "")
# column 1 is the ID column
# 2:10 are the candidate start columns; each window spans columns x to x + 3
lapply(2:10, function(x)
{ # rename col names for join of subsequent columns
# (setnames changes tmp itself, so tmp is re-labelled on every iteration)
setnames(tmp, 1:4, names(yearly)[x:(x+3)])
# append starting month of sequence
tmp[, month := names(yearly)[x]]
# inner join
# (joins on every tmp column except month; nomatch = 0L drops non-matches)
yearly[tmp, on = head(names(tmp), -1L), .(ID, month), nomatch = 0L]
}) %>%
# convert list to data.table
rbindlist() %>%
# reshape to wide format and append missing ID rows
dcast(ID ~ rowid(ID, prefix = "col")) %>%
.[yearly[, ID], on = "ID"]
ID col1 col2
1: ABC April Sept
2: DEF May Sept
3: GHI Jun <NA>
4: MNO Sept <NA>
5: QAL <NA> <NA>
# Sample data read with fread: an ID column followed by twelve 0/1 month
# columns (Jan ... Dec), one row per ID.
yearly <- fread(
"ID Jan Feb March April May Jun Jul Aug Sept Oct Nov Dec
ABC 0 0 0 1 0 0 0 0 1 0 0 0
DEF 0 0 0 1 1 0 0 0 1 0 0 0
GHI 0 0 0 1 0 1 0 0 0 1 0 0
MNO 0 0 0 1 0 1 0 0 1 0 0 0
QAL 0 1 1 1 0 0 1 0 0 1 0 0"
)
# magrittr must be attached before the first use of %>% below.
library(magrittr)
# Sample data: an ID column followed by twelve 0/1 month columns. setDF() is
# namespace-qualified because data.table is not attached in this snippet; it
# converts the fread() result to a plain data.frame.
df <- data.table::fread("
ID Jan Feb March April May Jun Jul Aug Sept Oct Nov Dec
ABC 0 0 0 1 0 0 0 0 1 0 0 0
DEF 0 0 0 1 1 0 0 0 1 0 0 0
GHI 0 0 0 1 0 1 0 0 0 1 0 0
MNO 0 0 0 1 0 1 0 0 1 0 0 0
QAL 0 1 1 1 0 0 1 0 0 1 0 0") %>% data.table::setDF()
# IDs (first column) as a plain vector, used later to name the results.
rowNames <- df[, 1, drop = TRUE]
# Names of the remaining columns, i.e. the month labels.
months <- names(df[, -1])
# Find the months that are followed by at least `n` consecutive zeros.
#
# x: numeric 0/1 vector holding the monthly flags of one row (no ID column).
# n: minimum number of zeros that must directly follow a 1 (default 3).
# Returns the matching entries of the `months` vector defined outside this
# function; character(0) when nothing matches.
fun1 <- function(x, n = 3) {
  running <- cumsum(x)
  # Position 1 plus every position where the running total changes, i.e. the
  # first position of each group of equal cumsum values.
  pos <- which(as.logical(diff(c(-1, running))))
  # A group's size is its leading 1 plus the zeros after it, so size > n means
  # at least n zeros. Groups that start with 0 (leading zeros) are excluded.
  counts <- as.numeric(table(running)) > n & as.logical(x[pos])
  months[pos[counts]]
}
# Apply fun1 to every row of the 0/1 columns. lapply() always returns a list,
# whereas apply() collapses the result to a vector or matrix whenever every
# row happens to yield the same number of months.
res <- lapply(seq_len(nrow(df)), function(i) fun1(unlist(df[i, -1])))
# Label each result with the ID of its row.
names(res) <- rowNames
$ABC
[1] "April" "Sept"
$DEF
[1] "May" "Sept"
$GHI
[1] "Jun"
$MNO
[1] "Sept"
$QAL
character(0)