Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/72.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
如何基于应用于大量列的“不等于”标准对dataframe进行子集划分?_R_Dataframe_Filter_Subset - Fatal编程技术网

如何基于应用于大量列的“不等于”标准对dataframe进行子集划分?

如何基于应用于大量列的“不等于”标准对dataframe进行子集划分?,r,dataframe,filter,subset,R,Dataframe,Filter,Subset,我是R新手,目前正试图根据预定义的排除标准对数据进行子集分析。我目前正试图排除ICD-10编码的所有痴呆症病例。问题是,有多个变量包含每个个体疾病状态的信息~70个变量,尽管它们以相同的方式编码,相同的条件可以应用于所有这些变量 一些模拟数据: #Create dataframe containing simulated data df = data.frame(ID = c(1001, 1002, 1003, 1004, 1005,1006,1007,1008,1009,1010,1011),

我是R新手,目前正试图根据预定义的排除标准对数据取子集。我想排除所有按ICD-10编码的痴呆症病例。问题是,每个个体的疾病状态信息分布在多个变量中(约70个变量),不过这些变量的编码方式相同,因此同一个条件可以应用于所有这些变量。

一些模拟数据:

# Build the simulated data set: one row per individual, holding an ID and
# three disease-code columns. Absent codes are entered as the string "NA".
df <- data.frame(
  ID = c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011),
  disease_code_1 = c(
    "I802", "H356", "G560", "D235", "B178", "F011",
    "F023", "C761", "H653", "A049", "J679"
  ),
  disease_code_2 = c(
    "A071", "NA", "G20", "NA", "NA", "A049",
    "NA", "NA", "G300", "G308", "A045"
  ),
  disease_code_3 = c(
    "H250", "NA", "NA", "I802", "NA", "A481",
    "NA", "NA", "NA", "NA", "D352"
  )
)

#data is structured as below:

     ID disease_code_1 disease_code_2 disease_code_3
1  1001           I802           A071           H250
2  1002           H356             NA             NA
3  1003           G560            G20             NA
4  1004           D235             NA           I802
5  1005           B178             NA             NA
6  1006           F011           A049           A481
7  1007           F023             NA             NA
8  1008           C761             NA             NA
9  1009           H653           G300             NA
10 1010           A049           G308             NA
11 1011           J679           A045           D352


在这里,我试图删除任何疾病代码变量中有“痴呆代码”的病例

# Remove cases with dementia from dataframe (e.g. F023, G20)
# NOTE(review): this attempt does not work as written. `2:4` is the integer
# vector 2, 3, 4, not columns 2 to 4 of df, and the codes are chained with `|`,
# which only accepts logical, numeric or complex operands, so the character
# strings ("G20", "F009", ...) make the expression fail.
Newdata_df <- subset(df, (2:4 != "F023"|"G20"|"F009"|"F002"|"F001"|"F000"|"F00"|    
                    "G309"| "G308"|"G301"|"G300"|"G30"| "F01"|"F018"|"F013"|
                    "F012"| "F011"| "F010"|"F01"))
理想情况下,子集数据帧如下所示:

     ID disease_code_1 disease_code_2 disease_code_3
1  1001           I802           A071           H250
2  1002           H356             NA             NA
4  1004           D235             NA           I802
5  1005           B178             NA             NA
8  1008           C761             NA             NA
11 1011           J679           A045           D352
我知道我的代码中有一个错误,尽管我不知道如何准确地修复它。我已经尝试了其他一些使用dplyr的方法,尽管到目前为止还没有任何运气

非常感谢您的帮助

这个怎么样:

> dementia <- c("F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
+               "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010", "F01")
> 
> dementia <- apply(sapply(df[, -1], function(x) {x %in% dementia}), 1, any)
> 
> df[!dementia,]
     ID disease_code_1 disease_code_2 disease_code_3
1  1001           I802           A071           H250
2  1002           H356             NA             NA
4  1004           D235             NA           I802
5  1005           B178             NA             NA
8  1008           C761             NA             NA
11 1011           J679           A045           D352
> 

希望有帮助。

我们可以创建一个包含要删除代码的向量,并使用行和来删除,即

# Codes whose presence in any disease-code column excludes a row
# (each code listed once).
codes_to_remove <- c(
  "F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
  "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010"
)

# Build one logical column per non-ID column (TRUE where the cell holds one of
# the codes) and keep the rows whose row sum is zero, i.e. no column matched.
# cbind() over lapply() always yields a matrix; sapply() would collapse to a
# plain vector for a one-row data frame, which rowSums() rejects.
df[
  rowSums(do.call(cbind, lapply(df[-1], `%in%`, codes_to_remove))) == 0,
]
dplyr的一种可能性是:

# Keep only the rows in which none of columns 2 to 4 holds one of the listed
# codes: all_vars() requires the predicate to hold for every selected column.
# NOTE(review): filter_at()/all_vars() belong to dplyr's scoped-verb family;
# presumably filter(if_all(...)) is the current equivalent -- confirm against
# the installed dplyr version before changing it.
df %>%
 filter_at(vars(2:4), all_vars(! . %in% c("F023","G20","F009","F002","F001","F000","F00",    
            "G309", "G308","G301","G300","G30", "F01","F018","F013",
            "F012", "F011", "F010","F01")))

    ID disease_code_1 disease_code_2 disease_code_3
1 1001           I802           A071           H250
2 1002           H356             NA             NA
3 1004           D235             NA           I802
4 1005           B178             NA             NA
5 1008           C761             NA             NA
6 1011           J679           A045           D352
在本例中,它检查2:4列中的任何列是否包含任何给定代码

或:


在这种情况下,它会检查名称为disease_code的任何列是否包含任何给定的代码。

如@docendo discimus在评论中所述,我们可以使用gather将数据帧转换为长格式,按ID分组(group_by),只保留其中不含任何dementia_code的ID,然后用spread将其转换回宽格式

library(tidyverse)

# Drop every individual who has a dementia code in any disease-code column.
df %>%
  # Long format: one row per (ID, column name, code) combination.
  gather(key, value, -ID) %>%
  group_by(ID) %>%
  # Within each ID, keep its rows only if none of its codes is a dementia code.
  filter(!any(value %in% dementia_code)) %>%
  # Drop the grouping so the result is a plain, ungrouped tibble.
  ungroup() %>%
  # Back to wide format: one row per ID, one column per original code column.
  spread(key, value)

#   ID disease_code_1 disease_code_2 disease_code_3
#  <dbl> <chr>          <chr>          <chr>         
#1  1001 I802           A071           H250          
#2  1002 H356           NA             NA            
#3  1004 D235           NA             I802          
#4  1005 B178           NA             NA            
#5  1008 C761           NA             NA            
#6  1011 J679           A045           D352          
资料

如果您愿意,这里还有一个使用基础R(base R)的for循环版本

# Simulated data: one row per individual, holding an ID and three disease-code
# columns. Absent codes are stored as the literal string "NA", not as NA.
df <- data.frame(
  ID = c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011),
  disease_code_1 = c(
    "I802", "H356", "G560", "D235", "B178", "F011",
    "F023", "C761", "H653", "A049", "J679"
  ),
  disease_code_2 = c(
    "A071", "NA", "G20", "NA", "NA", "A049",
    "NA", "NA", "G300", "G308", "A045"
  ),
  disease_code_3 = c(
    "H250", "NA", "NA", "I802", "NA", "A481",
    "NA", "NA", "NA", "NA", "D352"
  ),
  stringsAsFactors = FALSE
)

# Codes that mark a row for exclusion (each code listed once).
dementia_codes <- c(
  "F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
  "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010"
)

# Only the disease-code columns take part in the comparison; ID is left out.
code_cols <- setdiff(names(df), "ID")

# Record the keep/drop decision per row and subset once at the end, instead of
# growing new_df with rbind() on every iteration.
keep_row <- logical(nrow(df))

# seq_len() iterates zero times for an empty data frame, unlike 1:nrow(df),
# which would visit the non-existent rows 1 and 0.
for (i in seq_len(nrow(df))) {
  row_codes <- unlist(df[i, code_cols], use.names = FALSE)
  keep_row[i] <- !any(row_codes %in% dementia_codes)
}

new_df <- df[keep_row, ]

new_df
#      ID disease_code_1 disease_code_2 disease_code_3
# 1  1001           I802           A071           H250
# 2  1002           H356             NA             NA
# 4  1004           D235             NA           I802
# 5  1005           B178             NA             NA
# 8  1008           C761             NA             NA
# 11 1011           J679           A045           D352
我们可以使用data.table中的melt/dcast

或者,也可以在基础R(base R)中更紧凑地完成,而无需重塑数据

# Flag a row when any non-ID column holds a dementia code (the per-column
# logical vectors are OR-ed together), then keep the unflagged rows.
df[!Reduce(
  "|",
  lapply(df[-1], function(codes) codes %in% dementia_codes)
), ]
 #   ID disease_code_1 disease_code_2 disease_code_3
#1  1001           I802           A071           H250
#2  1002           H356             NA             NA
#4  1004           D235             NA           I802
#5  1005           B178             NA             NA
#8  1008           C761             NA             NA
#11 1011           J679           A045           D352
数据
您应该将数据重塑为长格式,这会让您的处理和分析轻松得多。另外请记住CRAN上的icd软件包,它能帮您省去很多麻烦:许多类似的问题都可以借助(或需要)共病映射(comorbidity map)来解决,而icd使用经过验证、被广泛引用的疾病映射,既严谨又快速。这并不能直接回答您的问题,但采用这种方法也许可以避开这个问题,具体取决于您已经做了什么,以及您打算如何处理数据。 @Ronan Shah 不错!这是一个更优雅的解决方案,你应该把它作为答案发布。 为什么要加载整个tidyverse?这里用到的不就只是tidyr和dplyr吗? @Dunois 是的,没错。我习惯默认加载它 :P 我们也可以使用反连接(anti-join),例如 Newdata_df <- df %>% anti_join(df %>% gather(DiseaseCodeNumber, CodeValue, -ID) %>% filter(CodeValue %in% c("F023", "G20", "F009", "F002", "F001", "F00", "G309", "G308", "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010", "F01")), by = "ID") 谢谢大家的建议!也很感谢@tmfmnk解释了所建议代码的作用,真的很有用!
# Keep only the rows in which none of the columns whose name contains
# "disease_code" holds one of the listed codes: all_vars() requires the
# predicate to hold for every selected column.
# NOTE(review): filter_at()/all_vars() belong to dplyr's scoped-verb family;
# presumably filter(if_all(...)) is the current equivalent -- confirm against
# the installed dplyr version before changing it.
df %>%
 filter_at(vars(contains("disease_code")), all_vars(! . %in% c("F023","G20","F009","F002","F001","F000","F00",    
            "G309", "G308","G301","G300","G30", "F01","F018","F013",
            "F012", "F011", "F010","F01")))
library(tidyverse)

# Drop every individual who has a dementia code in any disease-code column.
df %>%
  # Long format: one row per (ID, column name, code) combination.
  gather(key, value, -ID) %>%
  group_by(ID) %>%
  # Within each ID, keep its rows only if none of its codes is a dementia code.
  filter(!any(value %in% dementia_code)) %>%
  # Drop the grouping so the result is a plain, ungrouped tibble.
  ungroup() %>%
  # Back to wide format: one row per ID, one column per original code column.
  spread(key, value)

#   ID disease_code_1 disease_code_2 disease_code_3
#  <dbl> <chr>          <chr>          <chr>         
#1  1001 I802           A071           H250          
#2  1002 H356           NA             NA            
#3  1004 D235           NA             I802          
#4  1005 B178           NA             NA            
#5  1008 C761           NA             NA            
#6  1011 J679           A045           D352          
# Codes that mark an individual for exclusion (each code listed once; the
# vector is only used for membership tests with %in%).
dementia_code <- c(
  "F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
  "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010"
)
# Simulated data: one row per individual, holding an ID and three disease-code
# columns. Absent codes are stored as the literal string "NA", not as NA.
df <- data.frame(
  ID = c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011),
  disease_code_1 = c(
    "I802", "H356", "G560", "D235", "B178", "F011",
    "F023", "C761", "H653", "A049", "J679"
  ),
  disease_code_2 = c(
    "A071", "NA", "G20", "NA", "NA", "A049",
    "NA", "NA", "G300", "G308", "A045"
  ),
  disease_code_3 = c(
    "H250", "NA", "NA", "I802", "NA", "A481",
    "NA", "NA", "NA", "NA", "D352"
  ),
  stringsAsFactors = FALSE
)

# Codes that mark a row for exclusion (each code listed once).
dementia_codes <- c(
  "F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
  "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010"
)

# Only the disease-code columns take part in the comparison; ID is left out.
code_cols <- setdiff(names(df), "ID")

# Record the keep/drop decision per row and subset once at the end, instead of
# growing new_df with rbind() on every iteration.
keep_row <- logical(nrow(df))

# seq_len() iterates zero times for an empty data frame, unlike 1:nrow(df),
# which would visit the non-existent rows 1 and 0.
for (i in seq_len(nrow(df))) {
  row_codes <- unlist(df[i, code_cols], use.names = FALSE)
  keep_row[i] <- !any(row_codes %in% dementia_codes)
}

new_df <- df[keep_row, ]

new_df
#      ID disease_code_1 disease_code_2 disease_code_3
# 1  1001           I802           A071           H250
# 2  1002           H356             NA             NA
# 4  1004           D235             NA           I802
# 5  1005           B178             NA             NA
# 8  1008           C761             NA             NA
# 11 1011           J679           A045           D352
library(data.table)

# Reshape to long format (one row per ID/column pair), keep the rows of an ID
# only when none of its values is in dementia_codes, then cast back to one row
# per ID with one column per original variable.
# NOTE: setDT() converts df to a data.table by reference, so df itself is
# changed by this call.
dcast(
  melt(setDT(df), id.vars = "ID")[
    , if (!any(value %in% dementia_codes)) .SD, by = .(ID)
  ],
  ID ~ variable
)
#    ID disease_code_1 disease_code_2 disease_code_3
#1: 1001           I802           A071           H250
#2: 1002           H356             NA             NA
#3: 1004           D235             NA           I802
#4: 1005           B178             NA             NA
#5: 1008           C761             NA             NA
#6: 1011           J679           A045           D352
# Flag a row when any non-ID column holds a dementia code (the per-column
# logical vectors are OR-ed together), then keep the unflagged rows.
df[!Reduce(
  "|",
  lapply(df[-1], function(codes) codes %in% dementia_codes)
), ]
 #   ID disease_code_1 disease_code_2 disease_code_3
#1  1001           I802           A071           H250
#2  1002           H356             NA             NA
#4  1004           D235             NA           I802
#5  1005           B178             NA             NA
#8  1008           C761             NA             NA
#11 1011           J679           A045           D352
# Codes that mark a row for exclusion (each code listed once; the vector is
# only used for membership tests with %in%).
dementia_codes <- c(
  "F023", "G20", "F009", "F002", "F001", "F000", "F00", "G309", "G308",
  "G301", "G300", "G30", "F01", "F018", "F013", "F012", "F011", "F010"
)