如何在R数据帧中获取独占计数_R_Dataframe

如何在R数据帧中获取独占计数

r dataframe

如何在R数据帧中获取独占计数,r,dataframe,R,Dataframe,我在下面提到了R中的数据帧： DF <- tibble::tribble( ~ID, ~Check, "I-1", "A1", "I-2", "A2", "I-2", "OT", "I-2", "LP", "I-3", "A1", "

我在下面提到了R中的数据帧：

# Check table: one row per (ID, Check) pair; Check is NA where no value is
# recorded for the ID.
DF <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-2", "I-2", "I-3", "I-3",
    "I-4", "I-5", "I-6", "I-6", "I-7"
  ),
  Check = c(
    "A1", "A2", "OT", "LP", "A1", "A2",
    NA, NA, "A1", "OT", "A2"
  )
)

# Remarks table: one row per ID; Remarks holds a brace-wrapped,
# comma-separated list of items, or NA.
DF2 <- tibble::tibble(
  ID = c("I-1", "I-2", "I-3", "I-4", "I-5", "I-6", "I-7"),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}",
    "{X1}", "{XT}", "{X1,X2}"
  )
)

我想这会满足你的要求。。。可能不是最简洁的，但似乎做到了

# Summarise two ID-keyed tables, Checks (DF) and Remarks (DF2). For each table
# the script computes
#   * the "exclusive count" of every value: the number of IDs that carry that
#     value and nothing else,
#   * each value's share of all exclusive counts,
#   * a co-occurrence matrix counting how often two different values share an
#     ID,
#   * a final "Total" row,
# and then binds both summaries side by side into RESULTS.

# Load Library
library("tidyverse")

### CHECK ###
# Load Check Table
DF <- tibble::tribble(
  ~ID, ~Check,
  "I-1",   "A1",
  "I-2",   "A2",
  "I-2",   "OT",
  "I-2",   "LP",
  "I-3",   "A1",
  "I-3",   "A2",
  "I-4",     NA,
  "I-5",     NA,
  "I-6",   "A1",
  "I-6",   "OT",
  "I-7",   "A2"
)

# Count by ID: number of rows recorded for each ID
DF <- DF %>%
  group_by(ID) %>%
  mutate(count = n()) %>%
  ungroup()

# Count by Check: only IDs with a single row contribute to the exclusive count
DF_X <- DF %>%
  dplyr::filter(count == 1) %>%
  group_by(Check) %>%
  dplyr::summarize("Count" = sum(count))

# Identify unique values of Check
DF_UNIQUE <- unique(DF$Check)
DF_FIN <- data.frame("Check" = DF_UNIQUE, stringsAsFactors = FALSE)

# Join Counts by Check with unique list of Checks
DF_FIN <- left_join(x = DF_FIN, y = DF_X, by = "Check")

# Replace NA's with zeros (values that never occur exclusively)
DF_FIN[is.na(DF_FIN$Count), 2] <- 0

# Calculate Percentages
DF_FIN <- DF_FIN %>%
  mutate("Check Percentage" = `Count` / sum(`Count`))

# Rename Columns
colnames(DF_FIN) <- c("Check", "Exclusive Count", "Check Percentage")

# Replace NA value with the word "BLANK"
DF_FIN[is.na(DF_FIN$Check), 1] <- "BLANK"

# Sort by Exclusive Count and then by Check (alphabetical)
DF_FIN <- DF_FIN %>%
  arrange(desc(`Exclusive Count`), Check)

# Join Checks to itself and count instances.
# Every pair of Check values sharing an ID becomes one row, so the join is
# many-to-many on purpose.
DF_CHECKS <- full_join(x = DF, y = DF, by = "ID")

DF_CHECKS <- DF_CHECKS %>%
  group_by(Check.x, Check.y) %>%
  dplyr::summarize("N" = n(), .groups = "drop")

DF_CHECKS_SPREAD <- DF_CHECKS %>%
  tidyr::pivot_wider(names_from = Check.y, values_from = N)

# Put the columns in the same order as the rows so that cell [i, i + 1] is the
# diagonal. The missing value shows up as a column literally named "NA".
check_order <- DF_CHECKS_SPREAD$Check.x
check_order[is.na(check_order)] <- "NA"
DF_CHECKS_SPREAD <- DF_CHECKS_SPREAD %>%
  select(all_of(c("Check.x", check_order)))

# Set the diagonal to zeros (a value does not co-occur with itself)
for (i in seq_len(nrow(DF_CHECKS_SPREAD))) {
  DF_CHECKS_SPREAD[i, i + 1] <- 0
}

# Rename Columns
colnames(DF_CHECKS_SPREAD)[1] <- "Check"
colnames(DF_CHECKS_SPREAD)[colnames(DF_CHECKS_SPREAD) == "NA"] <- "BLANK"

# Drop the BLANK column
DF_CHECKS_SPREAD$BLANK <- NULL

# Replace NA value with the word "BLANK"
DF_CHECKS_SPREAD[is.na(DF_CHECKS_SPREAD$Check), 1] <- "BLANK"

# Replace all other NA's with zero
DF_CHECKS_SPREAD[is.na(DF_CHECKS_SPREAD)] <- 0

# Join the two Checks data sets together & calculate grand totals
FINAL_TABLE_CHECKS <- left_join(x = DF_FIN, y = DF_CHECKS_SPREAD, by = "Check")
FINAL_TABLE_CHECKS <- FINAL_TABLE_CHECKS %>%
  bind_rows(summarise(.,
                      across(where(is.numeric), sum),
                      across(where(is.character), ~"Total")))


### REMARKS ###
# Load Remarks table
DF2 <- tibble::tribble(
  ~ID,     ~Remarks,
  "I-1", "{X1,XR,XT}",
  "I-2",    "{X2,XR}",
  "I-3",           NA,
  "I-4", "{X1,XR,X2}",
  "I-5",       "{X1}",
  "I-6",       "{XT}",
  "I-7",    "{X1,X2}"
)

# Remove the {} from the Remarks string
DF2$Remarks <- str_replace_all(string = DF2$Remarks, c("\\{" = "", "\\}" = ""))

# Expand string into rows (one row per remark item)
DF2 <- separate_rows(DF2, Remarks, convert = TRUE)

# Group and count by ID
DF2 <- DF2 %>%
  group_by(ID) %>%
  mutate(count = n()) %>%
  ungroup()

# Count by Remarks: only IDs with a single row contribute
DF2_X <- DF2 %>%
  dplyr::filter(count == 1) %>%
  group_by(Remarks) %>%
  dplyr::summarize("Count" = sum(count))

# Identify unique Remarks
DF2_UNIQUE <- unique(DF2$Remarks)
DF2_FIN <- data.frame("Remarks" = DF2_UNIQUE, stringsAsFactors = FALSE)

# Join count of Remarks with unique list of Remarks
DF2_FIN <- left_join(x = DF2_FIN, y = DF2_X, by = "Remarks")

# Replace NA's with zeros (values that never occur exclusively)
DF2_FIN[is.na(DF2_FIN$Count), 2] <- 0

# Calculate Percentages
DF2_FIN <- DF2_FIN %>%
  mutate("Remarks Percentage" = `Count` / sum(`Count`))

# Rename columns
colnames(DF2_FIN) <- c("Remarks", "Exclusive Count", "Remarks Percentage")

# Replace NA value with the word "BLANK"
DF2_FIN[is.na(DF2_FIN$Remarks), 1] <- "BLANK"

# Sort by Exclusive Count and then by Remarks (alphabetical)
DF2_FIN <- DF2_FIN %>%
  arrange(desc(`Exclusive Count`), Remarks)

# Join Remarks to itself and count instances (many-to-many on purpose)
DF_REMARKS <- full_join(x = DF2, y = DF2, by = "ID")
DF_REMARKS <- DF_REMARKS %>%
  group_by(Remarks.x, Remarks.y) %>%
  dplyr::summarize("N" = n(), .groups = "drop")
DF_REMARKS_SPREAD <- DF_REMARKS %>%
  tidyr::pivot_wider(names_from = Remarks.y, values_from = N)

# Same column ordering trick as for the Checks matrix
check_order <- DF_REMARKS_SPREAD$Remarks.x
check_order[is.na(check_order)] <- "NA"
DF_REMARKS_SPREAD <- DF_REMARKS_SPREAD %>%
  select(all_of(c("Remarks.x", check_order)))

# Set the diagonal to zeros
for (i in seq_len(nrow(DF_REMARKS_SPREAD))) {
  DF_REMARKS_SPREAD[i, i + 1] <- 0
}

# Rename Columns.
# The mask must come from DF_REMARKS_SPREAD itself; taking it from
# DF_CHECKS_SPREAD left the "NA" column in place, so it was never dropped.
colnames(DF_REMARKS_SPREAD)[1] <- "Remarks"
colnames(DF_REMARKS_SPREAD)[colnames(DF_REMARKS_SPREAD) == "NA"] <- "BLANK"

# Drop the BLANK column
DF_REMARKS_SPREAD$BLANK <- NULL

# Replace NA value with the word "BLANK"
DF_REMARKS_SPREAD[is.na(DF_REMARKS_SPREAD$Remarks), 1] <- "BLANK"

# Replace all other NA's with zero
DF_REMARKS_SPREAD[is.na(DF_REMARKS_SPREAD)] <- 0

# Join the two Remarks data sets together & calculate grand totals
FINAL_TABLE_REMARKS <- left_join(x = DF2_FIN, y = DF_REMARKS_SPREAD, by = "Remarks")
FINAL_TABLE_REMARKS <- FINAL_TABLE_REMARKS %>%
  bind_rows(summarise(.,
                      across(where(is.numeric), sum),
                      across(where(is.character), ~"Total")))

# Count Rows in Check and Remarks dataframes and add rows in dataframe
# with less rows to match # of rows in other. The "Total" row is moved to the
# new last row so both totals line up; the rows in between stay NA.
checkRows <- nrow(FINAL_TABLE_CHECKS)
remarksRows <- nrow(FINAL_TABLE_REMARKS)
rowDiff <- abs(checkRows - remarksRows)

if (checkRows < remarksRows) {
  cat("Adding", rowDiff, "rows to the Checks dataframe.\n\n")
  FINAL_TABLE_CHECKS[nrow(FINAL_TABLE_CHECKS) + rowDiff, ] <- NA
  FINAL_TABLE_CHECKS[nrow(FINAL_TABLE_CHECKS), ] <- FINAL_TABLE_CHECKS[checkRows, ]
  FINAL_TABLE_CHECKS[checkRows, ] <- NA
} else if (remarksRows < checkRows) {
  cat("Adding", rowDiff, "rows to the Remarks dataframe.\n\n")
  FINAL_TABLE_REMARKS[nrow(FINAL_TABLE_REMARKS) + rowDiff, ] <- NA
  FINAL_TABLE_REMARKS[nrow(FINAL_TABLE_REMARKS), ] <- FINAL_TABLE_REMARKS[remarksRows, ]
  FINAL_TABLE_REMARKS[remarksRows, ] <- NA
} else {
  # cat() rather than print(): print() would show the "\n\n" escapes literally
  cat("There is no difference in number of rows between Checks and Remarks.\n\n")
}


# Combine columns from Checks and Remarks into one table.
RESULTS <- cbind(FINAL_TABLE_REMARKS, FINAL_TABLE_CHECKS)
RESULTS$`Check Percentage` <- paste0(round(100 * RESULTS$`Check Percentage`, 2), "%")
RESULTS$`Remarks Percentage` <- paste0(round(100 * RESULTS$`Remarks Percentage`, 2), "%")
RESULTS

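# 加载库
library('tidyverse')
### 检查 ###
# 加载检查表（DF 的定义同上），然后按 ID 计数
DF <- DF %>%
  group_by(ID) %>%
  mutate(count = n())
# 按 Check 计数
DF_X <- DF %>% dplyr::filter(count ==  1) %>%
  group_by(Check) %>%
  dplyr::summarize("Count" = sum(count))
# 确定 Check 的唯一值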
DF_UNIQUE <- unique(DF$Check)

OP 要求一个规范的答案。因此，我创建了一个函数 get_exclusive_counts()，它获取任何 tibble、data.frame 或 data.table 的前两列，其中第一列包含 ID，第二列包含长格式的有效负载（例如 Check）。

该函数独立于列名，可以处理有效负载列中任意数量的不同项。它为每个输入 tibble 返回一个 data.table：
# Compute the exclusive-count table for DF (ID in column 1, Check in column 2)
get_exclusive_counts(DF)

对于第二个用例 DF2，需要事先将有效负载拆分为单独的行：

library(magrittr)
# Strip the curly braces from Remarks, split the items into one row each,
# then compute the exclusive counts on the resulting long-format table.
DF2 %>% 
  dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>% 
  tidyr::separate_rows(Remarks) %>% 
  get_exclusive_counts()

请注意，结果表第一列的名称已从input data.frame中保留

OP 提到，Remarks 和 Check 的数量可能不同。因此，用 cbind() 合并这两个结果表实际上没有意义，因为只有在行数相同的情况下才会得到合理的结果。

此外，OP 的预期结果中有一些列名是重复的（至少有 Exclusive Count 和 %，可能更多），这表明该结果可能不会用于进一步处理，而仅用于显示/打印。

并排打印结果

不过，我创建了一个函数 get_exclusive_counts_side_by_side()，它可以并排打印对任意数量的输入数据集调用 get_exclusive_counts() 的结果：各结果的行数可以不同，并且最后一行（Totals）保持对齐。

该函数返回带有字符列的data.table

下面的调用将重现OP的预期结果：

# Print the exclusive-count tables for Remarks (DF2, expanded to one row per
# item) and Check (DF) next to each other.
get_exclusive_counts_side_by_side(
  DF2 %>% 
    dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>% 
    tidyr::separate_rows(Remarks),
  DF)

下面是另一个用例，演示它将处理不同的行和任意数量的输入数据集：

# Three inputs: DF, the expanded Remarks of DF3, and DF again.
get_exclusive_counts_side_by_side(
  DF, 
  DF3 %>% 
    dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>% 
    tidyr::separate_rows(Remarks),
  DF)

函数定义代码看起来相当庞大，但有一半的行是注释。因此，代码应该是相当自解释的

此外，大约一半的代码行是由于OP的附加要求，如%列或总计行

补充说明

如果我理解正确，排他计数指的是只分配了一个项目（或 NA）的 ID。其计算方法相当直接：

1. 计算每个 ID 的项目数
2. 挑出只有一个项目的 ID
3. 在输入 data.frame 中选取属于这些 ID 的行（使用联接）
4. 统计各项目在这个独占行子集中出现的次数

此外，该函数还处理了 OP 的额外要求，这些要求超出了排他计数本身的范围：

- 添加其余非排他项的共现计数矩阵
- 在特定位置添加排他计数比例列，并将其格式化为百分比
- 添加总计行
- 将 NA 分别替换为零或 "Blank"

资料

DF到目前为止您尝试了什么？顺便说一句，请注意堆栈推荐系统给您的标签：我在这里看不到任何关于图形的内容，因此似乎是多余的（如果我错了，请纠正我）。只有一个空白用于备注为什么您有独占计数用于空白是2
？@SinhNguyen:对不起，这是我的错误…纠正了它。什么规则定义Blank
remark与Blank
Check对齐，X1
remark与A1
Check对齐，等等？@C Jeruzal-谢谢，但 spread 函数出错。错误信息是：DF_CHECKS %>% spread(Check.y, N) 中的错误：找不到函数“spread”
@user9211845-是的，我忘记了 spread 已从 tidyr 包中淘汰。我更新了代码，改为使用 tidyr::pivot_wider。现在应该可以了。由于列位置发生了变化，我还在 pivot_wider 之后添加了一个列排序。现在，各列应该与 Remarks 或 Check 的顺序相匹配。还添加了总百分比的计算，并调整了百分比列的名称。@C Jeruzal-非常感谢，我在结果中遇到了一个错误
   Remarks Exclusive_Count       % X1 X2 XR XT
1:   Blank               1  33.33%  0  0  0  0
2:      X1               1  33.33%  0  2  2  1
3:      XT               1  33.33%  1  0  1  0
4:      X2               0   0.00%  2  0  2  0
5:      XR               0   0.00%  2  2  0  1
6:  Totals               3 100.00%  5  4  5  2

# Print the exclusive-count tables for Remarks (DF2, expanded to one row per
# item) and Check (DF) next to each other; the output follows below.
get_exclusive_counts_side_by_side(
  DF2 %>% 
    dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>% 
    tidyr::separate_rows(Remarks),
  DF)

   Remarks Exclusive_Count       % X1 X2 XR XT  Check Exclusive_Count       % A1 A2 LP OT
1:   Blank               1  33.33%  0  0  0  0  Blank               2  50.00%  0  0  0  0
2:      X1               1  33.33%  0  2  2  1     A1               1  25.00%  0  1  0  1
3:      XT               1  33.33%  1  0  1  0     A2               1  25.00%  1  0  1  1
4:      X2               0   0.00%  2  0  2  0     LP               0   0.00%  0  1  0  1
5:      XR               0   0.00%  2  2  0  1     OT               0   0.00%  1  1  1  0
6:  Totals               3 100.00%  5  4  5  2 Totals               4 100.00%  2  3  2  3

# Three inputs: DF, the expanded Remarks of DF3, and DF again; the output
# follows below.
get_exclusive_counts_side_by_side(
  DF, 
  DF3 %>% 
    dplyr::mutate(Remarks = stringr::str_remove_all(Remarks, "[{}]")) %>% 
    tidyr::separate_rows(Remarks),
  DF)

    Check Exclusive_Count       % A1 A2 LP OT Remarks Exclusive_Count       % X1 X2 XR XT Y2 Y3 Y4  Check Exclusive_Count       % A1 A2 LP OT
1:  Blank               2  50.00%  0  0  0  0      X1               2  50.00%  0  2  2  1  1  1  0  Blank               2  50.00%  0  0  0  0
2:     A1               1  25.00%  0  1  0  1   Blank               1  25.00%  0  0  0  0  0  0  0     A1               1  25.00%  0  1  0  1
3:     A2               1  25.00%  1  0  1  1      XT               1  25.00%  1  0  1  0  0  0  0     A2               1  25.00%  1  0  1  1
4:     LP               0   0.00%  0  1  0  1      X2               0   0.00%  2  0  2  0  0  0  0     LP               0   0.00%  0  1  0  1
5:     OT               0   0.00%  1  1  1  0      XR               0   0.00%  2  2  0  1  0  0  0     OT               0   0.00%  1  1  1  0
6:                                                 Y2               0   0.00%  1  0  0  0  0  1  1                                         
7:                                                 Y3               0   0.00%  1  0  0  0  1  0  0                                         
8:                                                 Y4               0   0.00%  0  0  0  0  1  0  0                                         
9: Totals               4 100.00%  2  3  2  3  Totals               4 100.00%  7  4  5  2  3  2  1 Totals               4 100.00%  2  3  2  3

# Build the exclusive-count summary for a long-format table.
#
# DF: a tibble, data.frame or data.table. Column 1 holds the id, column 2 the
#     payload item; only these two columns are used.
#
# Returns a data.table with one row per distinct payload value plus a final
# "Totals" row. Columns: the payload column (under its original name),
# Exclusive_Count, `%` (share of all exclusive counts, formatted as text),
# and one co-occurrence column per payload value found among the
# non-exclusive ids. A missing payload value is shown as "Blank".
get_exclusive_counts <- function(DF) {
  library(data.table)
  library(magrittr)
  # work on a two-column copy so the attributes of DF stay untouched
  items <- as.data.table(DF[, 1:2])
  # remember the payload column's name; it is restored on the result
  payload_name <- colnames(items)[2]
  # fixed internal names: col 1 = id, col 2 = payload
  setnames(items, c("id", "val"))

  # ids that own exactly one row are the "exclusive" ones
  single_ids <- items[, .N, keyby = id][N == 1L]

  # exclusive count per payload value, plus its raw proportion
  # (the proportion is formatted only after the Totals row is computed)
  exclusive <- items[single_ids, on = .(id)][
    , .(Exclusive_Count = .N), keyby = val]
  exclusive[, `%` := Exclusive_Count / sum(Exclusive_Count)]

  # everything else (anti-join) feeds the co-occurrence matrix
  shared <- items[!single_ids, on = .(id)]
  # self-join on id yields all value pairs per id in long format
  pairs <- shared[shared, on = .(id), allow.cartesian = TRUE]
  # wide format, counting pairs of different values only (no diagonal)
  cooccurrence <- dcast(pairs, val ~ i.val, length, subset = .(val != i.val))

  # combine both sub-results, keeping values present in only one of them
  result <- merge(exclusive, cooccurrence, by = "val", all = TRUE)
  # missing counts become 0
  result <- result[, lapply(.SD, nafill, fill = 0L), by = val]
  # largest exclusive counts first
  result <- result[order(-Exclusive_Count)]
  # append the Totals row: column sums of all numeric columns
  totals <- result[, c(.(val = "Totals"), lapply(.SD, sum)),
                   .SDcols = is.numeric]
  result <- rbind(result, totals)
  # format the proportion as a percentage string
  result[, `%` := sprintf("%3.2f%%", 100 * `%`)]
  # a missing payload value is displayed as "Blank"
  result[is.na(val), val := "Blank"]
  # give the payload column its original name back
  setnames(result, "val", payload_name)
  # return the result visibly
  result[]
}

# Compute get_exclusive_counts() for every input and place the resulting
# tables next to each other.
#
# ...: any number of long-format tables accepted by get_exclusive_counts().
#
# Returns a data.table in which every column has been converted to character
# and missing cells are empty strings. Shorter tables are padded with empty
# rows, and each table's last row is moved to the common last row so that
# the last rows line up.
get_exclusive_counts_side_by_side <- function(...) {
  library(data.table)
  library(magrittr)
  # one exclusive-count table per input
  tables <- lapply(list(...), get_exclusive_counts)
  # row index covering the tallest table
  n_rows <- Reduce(max, lapply(tables, nrow))
  rid <- data.table(.rowid = seq_len(n_rows))

  # stretch one table to n_rows rows
  pad_table <- function(.x) {
    # number the rows ...
    indexed <- .x[, .rowid := .I]
    # ... but send the last row to the common last position
    indexed[.rowid == .N, .rowid := nrow(rid)]
    # joining on the full index inserts all-NA rows where needed
    padded <- indexed[rid, on = .(.rowid)]
    padded[, .rowid := NULL]
    padded
  }

  # character conversion with NA shown as an empty string
  blank_na <- function(column) {
    text <- as.character(column)
    fifelse(is.na(text), "", text)
  }

  # all tables now have the same number of rows, so they can be bound
  combined <- do.call(cbind, lapply(tables, pad_table))
  combined[, lapply(.SD, blank_na)]
}

# Check table: one row per (ID, Check) pair; Check is NA where no value is
# recorded for the ID.
DF <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-2", "I-2", "I-3", "I-3",
    "I-4", "I-5", "I-6", "I-6", "I-7"
  ),
  Check = c(
    "A1", "A2", "OT", "LP", "A1", "A2",
    NA, NA, "A1", "OT", "A2"
  )
)

# Remarks table: one row per ID; Remarks holds a brace-wrapped,
# comma-separated list of items, or NA.
DF2 <- tibble::tibble(
  ID = c("I-1", "I-2", "I-3", "I-4", "I-5", "I-6", "I-7"),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}",
    "{X1}", "{XT}", "{X1,X2}"
  )
)

# Extended Remarks table: the DF2 rows plus three more IDs that introduce
# the items Y2, Y3 and Y4.
DF3 <- tibble::tibble(
  ID = c(
    "I-1", "I-2", "I-3", "I-4", "I-5",
    "I-6", "I-7", "I-8", "I-9", "I10"
  ),
  Remarks = c(
    "{X1,XR,XT}", "{X2,XR}", NA, "{X1,XR,X2}", "{X1}",
    "{XT}", "{X1,X2}", "{X1,Y2,Y3}", "{Y2,Y4}", "{X1}"
  )
)