R 如何基于(元素方向)选定的相邻列计算行重复计数
我有一个数据帧测试:R 如何基于(元素方向)选定的相邻列计算行重复计数,r,count,duplicates,elementwise-operations,rowwise,R,Count,Duplicates,Elementwise Operations,Rowwise,我有一个数据帧测试: group userID A_conf A_chall B_conf B_chall 1 220 1 1 1 2 1 222 4 6 4 4 2 223 6 5 3 2 1 224 1 5 4 4 2 228 4
group userID A_conf A_chall B_conf B_chall
1 220 1 1 1 2
1 222 4 6 4 4
2 223 6 5 3 2
1 224 1 5 4 4
2 228 4 4 4 4
数据包含每个用户的响应(由userID显示),其中每个用户可以为以下两个度量输入1到6之间的任何值:
- 形态
- 查尔
- A_conf和A_chall
- B_conf和B_chall
- dput(测试): 结构(列表)(组=c(1L,1L,2L,1L,2L) userID=c(220L、222L、223L、224L、228L) A_形态=c(1L,4L,6L,1L,4L) A_chall=c(1L,6L,5L,5L,4L) B_conf=c(1L,4L,3L,4L,4L) B_chall=c(2L,4L,2L,4L,4L)) class=“data.frame”,row.names=c(NA,-5L))
test$Final = as.integer(0) # add a column to keep counts
count_inc = as.integer(0) # counter variable to increment in steps of 1
for (i in 1:nrow(test)) {
count_inc = 0
if(!is.na(test$A_conf[i] == test$A_chall[i]))
{
count_inc = 1
test$Final[i] = count_inc
}#if
else if(!is.na(test$A_conf[i] != test$A_chall[i]))
{
count_inc = 0
test$Final[i] = count_inc
}#else if
}#for
上述代码仅用于A_conf和A_chall列。问题是,无论输入的值(由用户输入)是否相等,它都会用所有1填充Final列 使用
tidyverse
可以执行以下操作:
df %>%
select(-Final) %>%
rowid_to_column() %>% #Creating an unique row ID
gather(var, val, -c(group, userID, rowid)) %>% #Reshaping the data
arrange(rowid, var) %>% #Arranging by row ID and by variables
group_by(rowid) %>% #Grouping by row ID
mutate(temp = gl(n()/2, 2)) %>% #Creating a grouping variable for different "_chall" and "_conf" variables
group_by(rowid, temp) %>% #Grouping by row ID and the new grouping variables
mutate(res = ifelse(val == lag(val), 1, 0)) %>% #Comparing whether the different "_chall" and "_conf" have the same value
group_by(rowid) %>% #Grouping by row ID
mutate(res = sum(res, na.rm = TRUE)) %>% #Summing the occurrences of "_chall" and "_conf" being the same
select(-temp) %>%
spread(var, val) %>% #Returning the data to its original form
ungroup() %>%
select(-rowid)
group userID res A_chall A_conf B_chall B_conf
<int> <int> <dbl> <int> <int> <int> <int>
1 1 220 1. 1 1 2 1
2 1 222 1. 6 4 4 4
3 2 223 0. 5 6 2 3
4 1 224 1. 5 1 4 4
5 2 228 2. 4 4 4 4
df%>%
选择(-Final)%>%
rowid_to_column()%>%#创建唯一的行ID
聚集(var,val,-c(组,用户ID,行ID))%>%#重新格式化数据
排列(rowid,var)%>%#按行ID和变量排列
分组依据(行ID)%>%#分组依据行ID
mutate(temp=gl(n()/2,2))%>%#为不同的“_chall”和“_conf”变量创建分组变量
group_by(rowid,temp)%>%#按行ID和新分组变量分组
比较不同的“_chall”和“_conf”是否具有相同的值
分组依据(行ID)%>%#分组依据行ID
mutate(res=sum(res,na.rm=TRUE))%>%#将“_chall”和“_conf”的出现次数相加
选择(-temp)%>%
价差(var,val)%>%#将数据返回其原始形式
解组()%>%
选择(-rowid)
组用户ID resa_chall A_conf B_chall B_conf
1 1 220 1. 1 1 2 1
2 1 222 1. 6 4 4 4
3 2 223 0. 5 6 2 3
4 1 224 1. 5 1 4 4
5 2 228 2. 4 4 4 4
您也可以试试这款tidyverse。与另一个答案相比,少了几行;)
库(tidyverse)
d%>%
as.tible()%>%
聚集(k,v,-组,-用户ID)%>%
分开(k,分成=c(“字母”,“测试”))%>%
排列(测试,v)%>%
分组人(用户ID)%>%
突变(final=sum(chall==conf))%>%
不同的(用户ID,最终版)%>%
解组()%>%
右联合(d)
#一个tibble:5x7
用户ID最终组A_conf A_chall B_conf B_chall
1 220 1 1 1 1 1 2
2 222 1 1 4 6 4 4
3 223 0 2 6 5 3 2
4 224 1 1 1 5 4 4
5 228 2 2 4 4 4 4
假设“conf”和“chall”列的数量相等,则基本R解决方案
非常感谢,建议的解决方案很容易遵循。只是一个简单的问题,如何处理NAs。在我最初的问题中,我忘了提到一些条目丢失并且包含NAs。@Sadiaz
rowsumes
获得了na.rm
参数,您可以将其设置为TRUE
像rowSums(test[conf\u col]==test[chall\u col],na.rm=TRUE)
df %>%
select(-Final) %>%
rowid_to_column() %>% #Creating an unique row ID
gather(var, val, -c(group, userID, rowid)) %>% #Reshaping the data
arrange(rowid, var) %>% #Arranging by row ID and by variables
group_by(rowid) %>% #Grouping by row ID
mutate(temp = gl(n()/2, 2)) %>% #Creating a grouping variable for different "_chall" and "_conf" variables
group_by(rowid, temp) %>% #Grouping by row ID and the new grouping variables
mutate(res = ifelse(val == lag(val), 1, 0)) %>% #Comparing whether the different "_chall" and "_conf" have the same value
group_by(rowid) %>% #Grouping by row ID
mutate(res = sum(res, na.rm = TRUE)) %>% #Summing the occurrences of "_chall" and "_conf" being the same
select(-temp) %>%
spread(var, val) %>% #Returning the data to its original form
ungroup() %>%
select(-rowid)
group userID res A_chall A_conf B_chall B_conf
<int> <int> <dbl> <int> <int> <int> <int>
1 1 220 1. 1 1 2 1
2 1 222 1. 6 4 4 4
3 2 223 0. 5 6 2 3
4 1 224 1. 5 1 4 4
5 2 228 2. 4 4 4 4
library(tidyverse)
d %>%
as.tibble() %>%
gather(k, v, -group,-userID) %>%
separate(k, into = c("letters", "test")) %>%
spread(test, v) %>%
group_by(userID) %>%
mutate(final = sum(chall == conf)) %>%
distinct(userID, final) %>%
ungroup() %>%
right_join(d)
# A tibble: 5 x 7
userID final group A_conf A_chall B_conf B_chall
<int> <int> <int> <int> <int> <int> <int>
1 220 1 1 1 1 1 2
2 222 1 1 4 6 4 4
3 223 0 2 6 5 3 2
4 224 1 1 1 5 4 4
5 228 2 2 4 4 4 4
#Find indexes of "conf" column
conf_col <- grep("conf", names(test))
#Find indexes of "chall" column
chall_col <- grep("chall", names(test))
#compare element wise and take row wise sum
test$Final <- rowSums(test[conf_col] == test[chall_col])
test
# group userID A_conf A_chall B_conf B_chall Final
#1 1 220 1 1 1 2 1
#2 1 222 4 6 4 4 1
#3 2 223 6 5 3 2 0
#4 1 224 1 5 4 4 1
#5 2 228 4 4 4 4 2
rowSums(test[grep("conf", names(test))] == test[grep("chall", names(test))])