基于条件匹配R中多列中的值_R_If Statement_String Matching

基于条件匹配R中多列中的值

r if-statement

基于条件匹配R中多列中的值,r,if-statement,string-matching,R,If Statement,String Matching,假设我有一个数据源我想根据以下参数向这个数据帧添加一个列df$response，我想我需要一组嵌套的ifelse，但我很难正确执行它：对于给定的X行，如果df$match=1 在df$响应中打印1，如果： df$match=0的df$match中的任何行在df$caseID、df$faculty和df$phase中的内容与第X行相同。否则，打印0 所以输出应该是这样的： response 1 0 0 1 0 0 因为只有第一行和第四行包含df$caseID、df$faculty和df$p

假设我有一个数据源

我想根据以下参数向这个数据帧添加一个列df$response，我想我需要一组嵌套的ifelse，但我很难正确执行它：

对于给定的X行，如果df$match=1

在df$response中打印1，如果：

存在某个df$match=0的行，其df$caseID、df$faculty和df$phase的值与第X行完全相同。否则，打印0

所以输出应该是这样的：

response

1
0
0
1
0
0

因为只有第一行和第四行满足条件：它们是df$match=1的行，并且存在df$match=0的行在df$caseID、df$faculty和df$phase上与它们取值相同。

在您的机器上使用[]进行索引速度更快，成本更低

# Example data from the question.
df <- data.frame(
  resident = c("george", "george", "george", "jane", "jane", "jane"),
  faculty = c("sally", "sally", "sally", "carl", "carl", "carl"),
  submittedBy = c("george", "sally", "george", "jane", "carl", "carl"),
  match = c(1, 0, 1, 1, 0, 0),
  caseID = c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
  phase = c("pre", "pre", "intra", "pre", "pre", "intra"),
  stringsAsFactors = FALSE
)

# One composite key per row, built once. The separator does not appear in
# these values, so two different (caseID, faculty, phase) triples cannot
# paste to the same string (which sep = "" would allow).
key <- paste(df$caseID, df$faculty, df$phase, sep = "\r")

# response is 1 only for rows whose match is non-zero AND whose key also
# occurs on at least one match == 0 row; every other row gets 0.
# The expression is vectorised, so a zero-row data frame yields numeric(0)
# instead of the spurious NA that a 1:nrow(df) loop would produce.
response <- as.numeric(df$match != 0 & key %in% key[df$match == 0])

response
# [1] 1 0 0 1 0 0

在您的机器上，使用[]进行索引速度更快，成本更低

# Example data from the question.
df <- data.frame(
  resident = c("george", "george", "george", "jane", "jane", "jane"),
  faculty = c("sally", "sally", "sally", "carl", "carl", "carl"),
  submittedBy = c("george", "sally", "george", "jane", "carl", "carl"),
  match = c(1, 0, 1, 1, 0, 0),
  caseID = c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
  phase = c("pre", "pre", "intra", "pre", "pre", "intra"),
  stringsAsFactors = FALSE
)

# One composite key per row, built once. The separator does not appear in
# these values, so two different (caseID, faculty, phase) triples cannot
# paste to the same string (which sep = "" would allow).
key <- paste(df$caseID, df$faculty, df$phase, sep = "\r")

# response is 1 only for rows whose match is non-zero AND whose key also
# occurs on at least one match == 0 row; every other row gets 0.
# The expression is vectorised, so a zero-row data frame yields numeric(0)
# instead of the spurious NA that a 1:nrow(df) loop would produce.
response <- as.numeric(df$match != 0 & key %in% key[df$match == 0])

response
# [1] 1 0 0 1 0 0

我会这样做：

# Read the example data from the question.
test <- read.table(text = 'resident    faculty    submittedBy    match    caseID    phase
                   george      sally      george         1        george_1  pre
                   george      sally      sally          0        george_1  pre
                   george      sally      george         1        george_1  intra
                   jane        carl       jane           1        jane_1    pre
                   jane        carl       carl           0        jane_1    pre
                   jane        carl       carl           0        jane_1    intra', header = TRUE)

# Columns that must all agree for two rows to count as the same case.
key_cols <- c("faculty", "caseID", "phase")

# One composite key per row. The separator does not appear in these values,
# so different column combinations cannot paste to the same string.
row_keys <- do.call(paste, c(unname(as.list(test[key_cols])), sep = "\r"))

# Compare only against rows with match == 0. Testing for a duplicate among
# ALL other rows would also score 1 when two match == 1 rows share a key
# and no match == 0 row does, which is not what the question asks for.
zero_keys <- row_keys[test$match == 0]

# 1 when the row has a non-zero match and a match == 0 row shares its key,
# otherwise 0. Vectorised, so no per-row rbind()/duplicated() is needed.
resp <- as.numeric(test$match != 0 & row_keys %in% zero_keys)

我会这样做：

# Read the example data from the question.
test <- read.table(text = 'resident    faculty    submittedBy    match    caseID    phase
                   george      sally      george         1        george_1  pre
                   george      sally      sally          0        george_1  pre
                   george      sally      george         1        george_1  intra
                   jane        carl       jane           1        jane_1    pre
                   jane        carl       carl           0        jane_1    pre
                   jane        carl       carl           0        jane_1    intra', header = TRUE)

# Columns that must all agree for two rows to count as the same case.
key_cols <- c("faculty", "caseID", "phase")

# One composite key per row. The separator does not appear in these values,
# so different column combinations cannot paste to the same string.
row_keys <- do.call(paste, c(unname(as.list(test[key_cols])), sep = "\r"))

# Compare only against rows with match == 0. Testing for a duplicate among
# ALL other rows would also score 1 when two match == 1 rows share a key
# and no match == 0 row does, which is not what the question asks for.
zero_keys <- row_keys[test$match == 0]

# 1 when the row has a non-zero match and a match == 0 row shares its key,
# otherwise 0. Vectorised, so no per-row rbind()/duplicated() is needed.
resp <- as.numeric(test$match != 0 & row_keys %in% zero_keys)

假设match中只有1和0两种取值，使用dplyr的一种方法是：检查每个caseID、faculty和phase分组内的match是否同时包含1和0这两个不同的值，并在match为0时将response替换为0

我们可以使用data.table方法。用setDT(df1)将“data.frame”转换为“data.table”，按“caseID”、“faculty”、“phase”分组，检查每组中“match”的唯一元素个数是否等于2，据此创建二进制列“response”，并对“match”为0的行将“response”设为0

library(data.table)

# Convert df1 to a data.table by reference, then work group-wise on the three
# key columns: a row gets response = 1 when its own match is non-zero and its
# group contains exactly two distinct match values; otherwise it gets 0.
setDT(df1)
df1[
  ,
  response := as.integer(match != 0 & uniqueN(match) == 2),
  by = .(caseID, faculty, phase)
][]
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre        1
#2:   george   sally       sally     0 george_1   pre        0
#3:   george   sally      george     1 george_1 intra        0
#4:     jane    carl        jane     1   jane_1   pre        1
#5:     jane    carl        carl     0   jane_1   pre        0
#6:     jane    carl        carl     0   jane_1 intra        0

或者使用带ave的base R

数据。我们可以使用data.table方法。用setDT(df1)将“data.frame”转换为“data.table”，按“caseID”、“faculty”、“phase”分组，检查每组中“match”的唯一元素个数是否等于2，据此创建二进制列“response”，并对“match”为0的行将“response”设为0

library(data.table)

# Convert df1 to a data.table by reference, then work group-wise on the three
# key columns: a row gets response = 1 when its own match is non-zero and its
# group contains exactly two distinct match values; otherwise it gets 0.
setDT(df1)
df1[
  ,
  response := as.integer(match != 0 & uniqueN(match) == 2),
  by = .(caseID, faculty, phase)
][]
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre        1
#2:   george   sally       sally     0 george_1   pre        0
#3:   george   sally      george     1 george_1 intra        0
#4:     jane    carl        jane     1   jane_1   pre        1
#5:     jane    carl        carl     0   jane_1   pre        0
#6:     jane    carl        carl     0   jane_1 intra        0

或者使用带ave的base R

数据另一种数据表方法。连接键变量并检查值是否不在match==0集合中：

另一种data.table方法。按键变量进行连接，并检查各行是否不在match==0的集合中：

我不确定我是否遵守……你能把标准分成单独的要点，以便更好地区分它们，并且在措辞上更清楚一点吗？也许exlicity会解释为什么前几个回答是这样的。@你看到了吗？@RAB我已经根据第1/4行的匹配情况进行了调整？第1行：教员=sally，案例ID=george_1，阶段=pre。这些都不匹配吗？我听不懂你的逻辑。他们认为匹配的是什么？@RAB我正在寻找匹配=1的X行，然后查看匹配=0的任何行是否与第X行的相位、系数值和案例ID第1行和第2行的值相同，因为其中一行匹配=1，另一行匹配=0，但相位、系数值，caseID sameim不确定我是否遵循……你能把标准分成单独的要点，以便更好地区分它们，并且在措辞上更清楚一点吗？也许exlicity会解释为什么前几个回答是这样的。@你看到了吗？@RAB我已经根据第1/4行的匹配情况进行了调整？第1行：教员=sally，案例ID=george_1，阶段=pre。这些都不匹配吗？我听不懂你的逻辑。他们认为匹配的是什么？@RAB我正在寻找匹配=1的X行，然后查看匹配=0的任何行是否与第X行的相位、系数值和案例ID第1行和第2行的值相同，因为其中一行匹配=1，另一行匹配=0，但相位、系数值，和caseID是sameI不断得到一个错误：这个代码的错误我不断得到一个错误：这个代码的错误我爱ave解决方案，非常简洁。你能解释它的工作原理吗？我从没见过它用那个way@RABave可用于替换data.table中dplyr或[，：=，by=]的group by+突变。它的工作方式是我们将第一个参数作为要修改的列/向量传递，这里它是match，随后的参数都是分组变量，直到有趣为止。如果我们没有通过任何的乐趣，默认情况下采取的平均值。好的是它不会改变原来的顺序。在本例中，我们传递匿名函数调用以检查每个组中唯一匹配元素的长度。比赛0&是将值0替换为与0love匹配的ave解决方案，非常简洁。你能解释它的工作原理吗？我从没见过它用那个way@RABave可用于替换data.table中dplyr或[，：=，by=]的group by+突变。它的工作方式是我们将第一个参数作为要修改的列/向量传递，这里它是match，随后的参数都是分组变量，直到有趣为止。如果我们没有通过任何的乐趣，默认情况下采取的平均值。好的是它不会改变原来的顺序。在本例中，我们传递匿名函数调用以检查每个组中唯一匹配元素的长度。比赛0将替换与0匹配的值0（&I）

# Example data: six rows, with every text column stored as a factor (levels
# spelled out explicitly) and `match` stored as an integer 0/1 column.
df1 <- data.frame(
  resident = factor(
    c("george", "george", "george", "jane", "jane", "jane"),
    levels = c("george", "jane")
  ),
  faculty = factor(
    c("sally", "sally", "sally", "carl", "carl", "carl"),
    levels = c("carl", "sally")
  ),
  submittedBy = factor(
    c("george", "sally", "george", "jane", "carl", "carl"),
    levels = c("carl", "george", "jane", "sally")
  ),
  match = c(1L, 0L, 1L, 1L, 0L, 0L),
  caseID = factor(
    c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
    levels = c("george_1", "jane_1")
  ),
  phase = factor(
    c("pre", "pre", "intra", "pre", "pre", "intra"),
    levels = c("intra", "pre")
  )
)

library(data.table)
setDT(dat)

# Columns that identify a case for the join below.
key_cols <- c("caseID", "faculty", "phase")

# Step 1: flag every row whose match equals 1.
dat[, response := match == 1]

# Step 2: anti-join against the match == 0 rows on the key columns; rows of
# dat that have no match == 0 counterpart get response reset to FALSE.
zero_rows <- dat[match == 0]
dat[!zero_rows, on = key_cols, response := FALSE]

dat
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre     TRUE
#2:   george   sally       sally     0 george_1   pre    FALSE
#3:   george   sally      george     1 george_1 intra    FALSE
#4:     jane    carl        jane     1   jane_1   pre     TRUE
#5:     jane    carl        carl     0   jane_1   pre    FALSE
#6:     jane    carl        carl     0   jane_1 intra    FALSE