如何在R数据帧中获取独占计数
我在 R 中有如下两个数据框(DF 和 DF2):
# Sample data for the question, written column-wise.
# DF: one row per (ID, Check) pair; NA means the ID has no check recorded.
DF <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-2", "I-2", "I-3", "I-3",
    "I-4", "I-5", "I-6", "I-6", "I-7"
  ),
  Check = c(
    "A1", "A2", "OT", "LP", "A1", "A2",
    NA, NA, "A1", "OT", "A2"
  )
)
# DF2: one row per ID; Remarks is a brace-wrapped, comma-separated list or NA.
DF2 <- tibble::tibble(
  ID = c("I-1", "I-2", "I-3", "I-4", "I-5", "I-6", "I-7"),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}", "{X1}", "{XT}", "{X1,X2}"
  )
)
我想这能满足你的要求……写法可能不是最简洁的,但似乎可以得到想要的结果:
# Load Library
library("tidyverse")

### CHECK ###
# Builds FINAL_TABLE_CHECKS: for every distinct Check value, the number of IDs
# that carry ONLY that value ("exclusive count"), its share of all exclusive
# counts, and how often it co-occurs with every other Check on the same ID.

# Load Check Table (one row per ID/Check pair; NA = ID without a check)
DF <- tibble::tribble(
  ~ID,   ~Check,
  "I-1", "A1",
  "I-2", "A2",
  "I-2", "OT",
  "I-2", "LP",
  "I-3", "A1",
  "I-3", "A2",
  "I-4", NA,
  "I-5", NA,
  "I-6", "A1",
  "I-6", "OT",
  "I-7", "A2"
)

# Count by ID: `count` = number of rows recorded for that ID.
# ungroup() so the grouping by ID does not leak into the steps below.
DF <- DF %>%
  group_by(ID) %>%
  mutate(count = n()) %>%
  ungroup()

# Count by Check, keeping only IDs that have exactly one row (exclusive)
DF_X <- DF %>%
  dplyr::filter(count == 1) %>%
  group_by(Check) %>%
  dplyr::summarize("Count" = sum(count))

# Identify unique values of Check (NA included)
DF_UNIQUE <- unique(DF$Check)
DF_FIN <- data.frame("Check" = DF_UNIQUE, stringsAsFactors = FALSE)

# Join Counts by Check with unique list of Checks
# (dplyr joins match NA keys to each other by default)
DF_FIN <- left_join(x = DF_FIN, y = DF_X, by = "Check")

# Replace NA's with zeros: checks that are never exclusive
DF_FIN[is.na(DF_FIN$Count), 2] <- 0

# Calculate Percentages
DF_FIN <- DF_FIN %>%
  mutate("Check Percentage" = `Count` / sum(`Count`))

# Rename Columns
colnames(DF_FIN) <- c("Check", "Exclusive Count", "Check Percentage")

# Replace NA value with the word "BLANK"
DF_FIN[is.na(DF_FIN$Check), 1] <- "BLANK"

# Sort by Exclusive Count and then by Check (alphabetical)
DF_FIN <- DF_FIN %>%
  arrange(desc(`Exclusive Count`), Check)

# Join Checks to itself and count instances: every ordered pair of checks
# that share an ID (a check paired with itself lands on the diagonal).
DF_CHECKS <- full_join(x = DF, y = DF, by = "ID")
DF_CHECKS <- DF_CHECKS %>%
  group_by(Check.x, Check.y) %>%
  dplyr::summarize("N" = n(), .groups = "drop")

DF_CHECKS_SPREAD <- DF_CHECKS %>%
  tidyr::pivot_wider(names_from = Check.y, values_from = N)

# Put the value columns in the same order as the rows so that row i lines up
# with column i + 1. pivot_wider() names the column for a missing value "NA".
check_order <- DF_CHECKS_SPREAD$Check.x
check_order[is.na(check_order)] <- "NA"
# Keep the key column explicitly and select the external character vector via
# all_of() instead of relying on implicit grouping / bare-vector selection.
DF_CHECKS_SPREAD <- DF_CHECKS_SPREAD %>%
  select(Check.x, all_of(check_order))

# Set the diagonal to zeros (seq_len() is safe for a zero-row table)
for (i in seq_len(nrow(DF_CHECKS_SPREAD))) {
  DF_CHECKS_SPREAD[i, i + 1] <- 0
}

# Rename Columns
colnames(DF_CHECKS_SPREAD)[1] <- "Check"
colnames(DF_CHECKS_SPREAD)[colnames(DF_CHECKS_SPREAD) == "NA"] <- "BLANK"

# Drop the BLANK column
DF_CHECKS_SPREAD$BLANK <- NULL

# Replace NA value with the word "BLANK"
DF_CHECKS_SPREAD[is.na(DF_CHECKS_SPREAD$Check), 1] <- "BLANK"

# Replace all other NA's with zero (pairs that never co-occur)
DF_CHECKS_SPREAD[is.na(DF_CHECKS_SPREAD)] <- 0

# Join the two Checks data sets together & calculate grand totals
FINAL_TABLE_CHECKS <- left_join(x = DF_FIN, y = DF_CHECKS_SPREAD, by = "Check")
FINAL_TABLE_CHECKS <- FINAL_TABLE_CHECKS %>%
  bind_rows(
    summarise(
      .,
      across(where(is.numeric), sum),
      across(where(is.character), ~"Total")
    )
  )
### REMARKS ###
# Builds FINAL_TABLE_REMARKS: for every distinct remark, the number of IDs that
# carry ONLY that remark ("exclusive count"), its share of all exclusive
# counts, and how often it co-occurs with every other remark on the same ID.

# Load Remarks table (one row per ID; Remarks is "{a,b,...}" or NA)
DF2 <- tibble::tribble(
  ~ID,   ~Remarks,
  "I-1", "{X1,XR,XT}",
  "I-2", "{X2,XR}",
  "I-3", NA,
  "I-4", "{X1,XR,X2}",
  "I-5", "{X1}",
  "I-6", "{XT}",
  "I-7", "{X1,X2}"
)

# Remove the {} from the Remarks string
DF2$Remarks <- str_remove_all(DF2$Remarks, "[{}]")

# Expand string into rows (one row per ID/remark pair; NA stays a single row)
DF2 <- separate_rows(DF2, Remarks, convert = TRUE)

# Group and count by ID: `count` = number of remarks recorded for that ID.
# ungroup() so the grouping by ID does not leak into the steps below.
DF2 <- DF2 %>%
  group_by(ID) %>%
  mutate(count = n()) %>%
  ungroup()

# Count by Remarks, keeping only IDs that have exactly one remark (exclusive)
DF2_X <- DF2 %>%
  dplyr::filter(count == 1) %>%
  group_by(Remarks) %>%
  dplyr::summarize("Count" = sum(count))

# Identify unique Remarks (NA included)
DF2_UNIQUE <- unique(DF2$Remarks)
DF2_FIN <- data.frame("Remarks" = DF2_UNIQUE, stringsAsFactors = FALSE)

# Join count of Remarks with unique list of Remarks
DF2_FIN <- left_join(x = DF2_FIN, y = DF2_X, by = "Remarks")

# Replace NA's with zeros: remarks that are never exclusive
DF2_FIN[is.na(DF2_FIN$Count), 2] <- 0

# Calculate Percentages
DF2_FIN <- DF2_FIN %>%
  mutate("Remarks Percentage" = `Count` / sum(`Count`))

# Rename columns
colnames(DF2_FIN) <- c("Remarks", "Exclusive Count", "Remarks Percentage")

# Replace NA value with the word "BLANK"
DF2_FIN[is.na(DF2_FIN$Remarks), 1] <- "BLANK"

# Sort by Exclusive Count and then by Remarks (alphabetical)
DF2_FIN <- DF2_FIN %>%
  arrange(desc(`Exclusive Count`), Remarks)

# Join Remarks to itself and count instances: every ordered pair of remarks
# that share an ID (a remark paired with itself lands on the diagonal).
DF_REMARKS <- full_join(x = DF2, y = DF2, by = "ID")
DF_REMARKS <- DF_REMARKS %>%
  group_by(Remarks.x, Remarks.y) %>%
  dplyr::summarize("N" = n(), .groups = "drop")

DF_REMARKS_SPREAD <- DF_REMARKS %>%
  tidyr::pivot_wider(names_from = Remarks.y, values_from = N)

# Put the value columns in the same order as the rows so that row i lines up
# with column i + 1. pivot_wider() names the column for a missing value "NA".
check_order <- DF_REMARKS_SPREAD$Remarks.x
check_order[is.na(check_order)] <- "NA"
# Keep the key column explicitly and select the external character vector via
# all_of() instead of relying on implicit grouping / bare-vector selection.
DF_REMARKS_SPREAD <- DF_REMARKS_SPREAD %>%
  select(Remarks.x, all_of(check_order))

# Set the diagonal to zeros (seq_len() is safe for a zero-row table)
for (i in seq_len(nrow(DF_REMARKS_SPREAD))) {
  DF_REMARKS_SPREAD[i, i + 1] <- 0
}

# Rename Columns
colnames(DF_REMARKS_SPREAD)[1] <- "Remarks"
# The mask must be built from THIS table's column names. Building it from the
# Checks table (as before) never matched, so the "NA" column was neither
# renamed nor dropped and ended up in the final Remarks table.
colnames(DF_REMARKS_SPREAD)[colnames(DF_REMARKS_SPREAD) == "NA"] <- "BLANK"

# Drop the BLANK column
DF_REMARKS_SPREAD$BLANK <- NULL

# Replace NA value with the word "BLANK"
DF_REMARKS_SPREAD[is.na(DF_REMARKS_SPREAD$Remarks), 1] <- "BLANK"

# Replace all other NA's with zero (pairs that never co-occur)
DF_REMARKS_SPREAD[is.na(DF_REMARKS_SPREAD)] <- 0

# Join the two Remarks data sets together & calculate grand totals
FINAL_TABLE_REMARKS <- left_join(
  x = DF2_FIN, y = DF_REMARKS_SPREAD, by = "Remarks"
)
FINAL_TABLE_REMARKS <- FINAL_TABLE_REMARKS %>%
  bind_rows(
    summarise(
      .,
      across(where(is.numeric), sum),
      across(where(is.character), ~"Total")
    )
  )
# Count Rows in Check and Remarks dataframes and add rows in dataframe
# with less rows to match # of rows in other. The "Total" row (last row of
# each table) is moved to the new last row so that totals stay aligned.
checkRows <- nrow(FINAL_TABLE_CHECKS)
remarksRows <- nrow(FINAL_TABLE_REMARKS)
rowDiff <- abs(checkRows - remarksRows)

if (checkRows < remarksRows) {
  cat("Adding", rowDiff, "rows to the Checks dataframe.\n\n")
  # Assigning past the last row extends the data frame with all-NA rows.
  FINAL_TABLE_CHECKS[nrow(FINAL_TABLE_CHECKS) + rowDiff, ] <- NA
  FINAL_TABLE_CHECKS[nrow(FINAL_TABLE_CHECKS), ] <-
    FINAL_TABLE_CHECKS[checkRows, ]
  FINAL_TABLE_CHECKS[checkRows, ] <- NA
} else if (remarksRows < checkRows) {
  cat("Adding", rowDiff, "rows to the Remarks dataframe.\n\n")
  FINAL_TABLE_REMARKS[nrow(FINAL_TABLE_REMARKS) + rowDiff, ] <- NA
  FINAL_TABLE_REMARKS[nrow(FINAL_TABLE_REMARKS), ] <-
    FINAL_TABLE_REMARKS[remarksRows, ]
  FINAL_TABLE_REMARKS[remarksRows, ] <- NA
} else {
  # cat() interprets "\n"; print() would show the escape sequences literally.
  cat("There is no difference in number of rows between Checks and Remarks.\n\n")
}

# Combine columns from Checks and Remarks into one table.
RESULTS <- cbind(FINAL_TABLE_REMARKS, FINAL_TABLE_CHECKS)

# Format the proportions as percentages. Padding rows (NA) stay NA instead of
# being rendered as the string "NA%".
RESULTS$`Check Percentage` <- ifelse(
  is.na(RESULTS$`Check Percentage`),
  NA_character_,
  paste0(round(100 * RESULTS$`Check Percentage`, 2), "%")
)
RESULTS$`Remarks Percentage` <- ifelse(
  is.na(RESULTS$`Remarks Percentage`),
  NA_character_,
  paste0(round(100 * RESULTS$`Remarks Percentage`, 2), "%")
)
RESULTS
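# Load Library
library('tidyverse')
### CHECK ###
# Load Check Table
DF <- DF %>%
group_by(ID) %>%
mutate(count = n())
# Count by Check
DF_X <- DF %>% dplyr::filter(count == 1) %>%
group_by(Check) %>%
dplyr::summarize("Count" = sum(count))
# Identify unique values of Check
DF_UNIQUE <- unique(DF$Check)
OP 要求一个规范的答案。因此,我创建了一个函数 get_exclusive_counts(),它获取任何 tibble、data.frame 或 data.table 的前两列,其中第一列包含 ID,第二列包含有效负载(例如长格式的 Check)。
该函数独立于列名,可以处理有效负载列中任意数量的不同项。它为每个输入 tibble 返回一个 data.table: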
get_exclusive_counts(DF)
对于第二个用例DF2
,需要事先将有效负载拆分为单独的行:
library(magrittr)
# Strip the braces, split every Remarks string into one row per item,
# then compute the exclusive counts on the long-format table.
get_exclusive_counts(
  tidyr::separate_rows(
    dplyr::mutate(DF2, Remarks = stringr::str_remove_all(Remarks, "[{}]")),
    Remarks
  )
)
请注意,结果表第一列的名称已从input data.frame中保留
OP 提到,Remarks 和 Check 的数量可能不同。因此,直接用 cbind() 合并这两个结果表实际上没有意义,因为只有在行数相同的情况下才会得到合理的结果。
此外,OP 的预期结果中有一些列名重复(至少 Exclusive_Count 和 %,可能更多),这表明该结果可能不用于进一步处理,而仅用于显示/打印。
并排打印结果
不过,我还创建了一个函数 get_exclusive_counts_side_by_side(),它把各次调用 get_exclusive_counts() 的结果并排打印出来:
- 适用于任意数量的输入数据集
- 允许各数据集的行数不同,并且
- 将最后一行(Totals)对齐
# Remarks (braces stripped, one row per item) next to Checks.
get_exclusive_counts_side_by_side(
  tidyr::separate_rows(
    dplyr::mutate(DF2, Remarks = stringr::str_remove_all(Remarks, "[{}]")),
    Remarks
  ),
  DF
)
下面是另一个用例,演示它将处理不同的行和任意数量的输入数据集:
# Three inputs with different numbers of result rows: Checks, the longer
# Remarks table DF3 (braces stripped, one row per item), and Checks again.
get_exclusive_counts_side_by_side(
  DF,
  tidyr::separate_rows(
    dplyr::mutate(DF3, Remarks = stringr::str_remove_all(Remarks, "[{}]")),
    Remarks
  ),
  DF
)
函数定义
代码看起来相当庞大,但有一半的行是注释。因此,代码应该是相当自解释的
此外,大约一半的代码行是由于OP的附加要求,如%列或总计行
补充说明
如果我理解正确,排他计数指的是只分配了一个项目(或 NA)的 ID 的数量。这部分计算相当直接。其余大部分代码用于:
- 添加剩余非排他项的共现计数矩阵
- 在特定位置添加排他计数比例列,并将其格式化为百分比
- 添加 Totals 行
- 将 NA 分别替换为零或“Blank”
DF到目前为止您尝试了什么?顺便说一句,请注意堆栈推荐系统给您的标签:我在这里看不到任何关于图形的内容,因此似乎是多余的(如果我错了,请纠正我)。只有一个空白用于备注为什么您有独占计数用于空白是2
?@SinhNguyen:对不起,这是我的错误…纠正了它。什么规则定义Blank
remark与Blank
Check对齐,X1
remark与A1
Check 对齐,等等?@C Jeruzal - 谢谢,但在 spread 函数处出错。错误信息是:Error in DF_CHECKS %>% spread(Check.y, N) : could not find function "spread"
@user9211845 - 是的,我忘记了 spread 已从 tidyr 包中退役。我更新了代码,改为使用 tidyr::pivot_wider。现在应该可以了。由于列位置发生了变化,我还在 pivot_wider 之后添加了列排序。现在,各列应与 Remarks 或 Check 的描述顺序相匹配。另外还添加了百分比合计的计算,并调整了百分比列的名称。@C Jeruzal - 非常感谢,我在结果中遇到了一个错误
Remarks Exclusive_Count % X1 X2 XR XT
1: Blank 1 33.33% 0 0 0 0
2: X1 1 33.33% 0 2 2 1
3: XT 1 33.33% 1 0 1 0
4: X2 0 0.00% 2 0 2 0
5: XR 0 0.00% 2 2 0 1
6: Totals 3 100.00% 5 4 5 2
get_exclusive_counts_side_by_side(
DF2 %>%
dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>%
tidyr::separate_rows(Remarks),
DF)
Remarks Exclusive_Count % X1 X2 XR XT Check Exclusive_Count % A1 A2 LP OT
1: Blank 1 33.33% 0 0 0 0 Blank 2 50.00% 0 0 0 0
2: X1 1 33.33% 0 2 2 1 A1 1 25.00% 0 1 0 1
3: XT 1 33.33% 1 0 1 0 A2 1 25.00% 1 0 1 1
4: X2 0 0.00% 2 0 2 0 LP 0 0.00% 0 1 0 1
5: XR 0 0.00% 2 2 0 1 OT 0 0.00% 1 1 1 0
6: Totals 3 100.00% 5 4 5 2 Totals 4 100.00% 2 3 2 3
get_exclusive_counts_side_by_side(
DF,
DF3 %>%
dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>%
tidyr::separate_rows(Remarks),
DF)
Check Exclusive_Count % A1 A2 LP OT Remarks Exclusive_Count % X1 X2 XR XT Y2 Y3 Y4 Check Exclusive_Count % A1 A2 LP OT
1: Blank 2 50.00% 0 0 0 0 X1 2 50.00% 0 2 2 1 1 1 0 Blank 2 50.00% 0 0 0 0
2: A1 1 25.00% 0 1 0 1 Blank 1 25.00% 0 0 0 0 0 0 0 A1 1 25.00% 0 1 0 1
3: A2 1 25.00% 1 0 1 1 XT 1 25.00% 1 0 1 0 0 0 0 A2 1 25.00% 1 0 1 1
4: LP 0 0.00% 0 1 0 1 X2 0 0.00% 2 0 2 0 0 0 0 LP 0 0.00% 0 1 0 1
5: OT 0 0.00% 1 1 1 0 XR 0 0.00% 2 2 0 1 0 0 0 OT 0 0.00% 1 1 1 0
6: Y2 0 0.00% 1 0 0 0 0 1 1
7: Y3 0 0.00% 1 0 0 0 1 0 0
8: Y4 0 0.00% 0 0 0 0 1 0 0
9: Totals 4 100.00% 2 3 2 3 Totals 4 100.00% 7 4 5 2 3 2 1 Totals 4 100.00% 2 3 2 3
#' Exclusive counts and co-occurrence counts for an id/payload table
#'
#' Only the first two columns of `DF` are used: column 1 holds the id,
#' column 2 the payload in long format (one row per id/item pair).
#'
#' @param DF A tibble, data.frame or data.table with at least two columns.
#' @return A data.table with one row per distinct payload value plus a final
#'   "Totals" row. Columns: the payload column (original name kept, missing
#'   value shown as "Blank"), `Exclusive_Count` (number of ids whose only row
#'   carries that value), `%` (share of all exclusive counts, formatted as a
#'   percentage string) and one co-occurrence count column per payload value
#'   found among ids with more than one row.
get_exclusive_counts <- function(DF) {
  library(data.table)
  library(magrittr)

  # Work on a copy of the first two columns so DF itself is left untouched.
  pairs <- as.data.table(DF[, 1:2])
  # Remember the caller's column names; the payload name is restored at the end.
  orig_names <- colnames(pairs)[1:2]
  # Fixed internal names: col 1 is the id, col 2 the payload.
  setnames(pairs, c("id", "val"))

  # Ids that occur exactly once, i.e. ids with a single payload item.
  single_ids <- pairs[, .N, keyby = id][N == 1L]

  # Exclusive counts per payload value (join on the single-item ids, then
  # aggregate) plus the raw proportion; it is formatted after the Totals row
  # has been computed.
  excl <- pairs[single_ids, on = .(id)][, .(Exclusive_Count = .N), keyby = val]
  excl[, `%` := Exclusive_Count / sum(Exclusive_Count)]

  # Anti-join: rows of all ids that have more than one payload item.
  multi <- pairs[!single_ids, on = .(id)]

  # Self-join on id gives every ordered pair of items sharing an id; casting
  # to wide format counts the pairs, the subset drops the diagonal.
  paired <- multi[multi, on = .(id), allow.cartesian = TRUE]
  coocc <- dcast(paired, val ~ i.val, length, subset = .(val != i.val))

  # Merge both sub-results and replace missing counts by 0.
  out <- merge(excl, coocc, by = "val", all = TRUE)
  out <- out[, lapply(.SD, nafill, fill = 0L), by = val]
  # Order by decreasing exclusive count.
  out <- out[order(-Exclusive_Count)]

  # Append the Totals row (column sums of all numeric columns).
  totals <- out[, c(.(val = "Totals"), lapply(.SD, sum)), .SDcols = is.numeric]
  out <- rbind(out, totals)

  # Clean-up: proportion as percentage string, missing payload as "Blank",
  # payload column back to its original name.
  out[, `%` := sprintf("%3.2f%%", 100 * `%`)]
  out[is.na(val), val := "Blank"]
  setnames(out, "val", orig_names[2])

  # `[]` makes the result print when the function is called at top level.
  out[]
}
#' Exclusive-count tables for several inputs, side by side
#'
#' Calls `get_exclusive_counts()` on every input, pads the shorter results
#' with empty rows so that all have the same height (the last row of each
#' result is moved to the common last row), and binds them column-wise.
#'
#' @param ... One or more tables accepted by `get_exclusive_counts()`.
#' @return A data.table in which every column is character and missing cells
#'   are empty strings.
get_exclusive_counts_side_by_side <- function(...) {
  library(data.table)
  library(magrittr)

  # One result table per input.
  tables <- lapply(list(...), get_exclusive_counts)

  # Row index running up to the height of the tallest result.
  n_max <- Reduce(max, lapply(tables, nrow))
  rid <- data.table(.rowid = 1:n_max)

  # Pad every result to that height.
  padded <- lapply(tables, function(tbl) {
    # number the rows ...
    tbl[, .rowid := .I]
    # ... send the last row to the common last position ...
    tbl[.rowid == .N, .rowid := nrow(rid)]
    # ... and join onto the full index, which inserts all-NA rows where needed
    tbl <- tbl[rid, on = .(.rowid)]
    tbl[, .rowid := NULL]
    tbl
  })

  # All tables now have the same number of rows, so cbind() is safe.
  combined <- do.call(cbind, padded)

  # Convert every column to character and show NA as an empty string.
  blank_if_na <- function(col) {
    txt <- as.character(col)
    fifelse(is.na(txt), "", txt)
  }
  combined[, lapply(.SD, blank_if_na)]
}
# Sample data used by the examples, written column-wise.
# DF: one row per (ID, Check) pair; NA means the ID has no check recorded.
DF <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-2", "I-2", "I-3", "I-3",
    "I-4", "I-5", "I-6", "I-6", "I-7"
  ),
  Check = c(
    "A1", "A2", "OT", "LP", "A1", "A2",
    NA, NA, "A1", "OT", "A2"
  )
)
# DF2: one row per ID; Remarks is a brace-wrapped, comma-separated list or NA.
DF2 <- tibble::tibble(
  ID = c("I-1", "I-2", "I-3", "I-4", "I-5", "I-6", "I-7"),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}", "{X1}", "{XT}", "{X1,X2}"
  )
)
# DF3: DF2 plus three more IDs that introduce the remarks Y2, Y3 and Y4.
# NOTE(review): "I10" lacks the hyphen used by every other ID — confirm
# whether "I-10" was intended; the value is kept as written.
DF3 <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-3", "I-4", "I-5", "I-6", "I-7",
    "I-8", "I-9", "I10"
  ),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}", "{X1}", "{XT}", "{X1,X2}",
    "{X1,Y2,Y3}", "{Y2,Y4}", "{X1}"
  )
)