R:为dataframe中感兴趣的记录查找不相似的列

R:为dataframe中感兴趣的记录查找不相似的列,r,R,在所有记录中,我在一个名为test3333的数据框中有225条两列(parent_mol_chembl_id,parent_name)的重复记录(2859个观测值,4734列) 我想找出重复行中哪些列是不同的。 我这样做grep('CHEMBL1502',test3333[,1]),然后检查不同的列范围以识别差异 grep('CHEMBL1502',test3333[,1])的所需输出 grep('CHEMBL790',test3333[,1])的所需输出 我想找出特定 parent_mol_chembl_id……

在名为 test3333 的数据框(2859 个观测值,4734 列)中,有 225 条记录在前两列(parent_mol_chembl_id,parent_name)上是重复的。

我想找出这些重复行之间哪些列的取值不同。目前我的做法是先运行
grep('CHEMBL1502',test3333[,1])
找到对应的行号,然后逐段检查不同的列范围来识别差异。

grep('CHEMBL1502',test3333[,1])的所需输出

grep('CHEMBL790',test3333[,1])的所需输出


对于某个特定的 parent_mol_chembl_id,我想找出数据框中哪些列的取值不同。请问检查这些不同列的语法是什么?

我们可以按前两列进行拆分(split),然后根据每列中唯一(unique)元素的个数(length)对列进行筛选(Filter)。我们删除只有一个唯一元素的数值列(非数值列始终保留)。

# Split the rows into one data frame per (parent_mol_chembl_id, parent_name)
# pair, dropping empty combinations. Within each group keep every non-numeric
# column plus the numeric columns that hold more than one distinct value.
# The resulting list is auto-printed at top level.
lapply(
  split(df1, df1[1:2], drop = TRUE),
  function(grp) {
    keep_column <- function(col) {
      !is.numeric(col) || length(unique(col)) > 1
    }
    Filter(keep_column, grp)
  }
)
#$CHEMBL790.DZOLE
#  parent_mol_chembl_id parent_name D000152
#3            CHEMBL790       DZOLE       1
#4            CHEMBL790       DZOLE       0

#$CHEMBL1502.PANTOPRAZOLE
#   parent_mol_chembl_id  parent_name D000166
#1           CHEMBL1502 PANTOPRAZOLE       0
#2           CHEMBL1502 PANTOPRAZOLE       1
数据
df1

我们可以按前两列进行拆分(split),然后根据每列中唯一(unique)元素的个数(length)对列进行筛选(Filter)。我们删除只有一个唯一元素的数值列(非数值列始终保留)。

# Split the rows into one data frame per (parent_mol_chembl_id, parent_name)
# pair, dropping empty combinations. Within each group keep every non-numeric
# column plus the numeric columns that hold more than one distinct value.
# The resulting list is auto-printed at top level.
lapply(
  split(df1, df1[1:2], drop = TRUE),
  function(grp) {
    keep_column <- function(col) {
      !is.numeric(col) || length(unique(col)) > 1
    }
    Filter(keep_column, grp)
  }
)
#$CHEMBL790.DZOLE
#  parent_mol_chembl_id parent_name D000152
#3            CHEMBL790       DZOLE       1
#4            CHEMBL790       DZOLE       0

#$CHEMBL1502.PANTOPRAZOLE
#   parent_mol_chembl_id  parent_name D000166
#1           CHEMBL1502 PANTOPRAZOLE       0
#2           CHEMBL1502 PANTOPRAZOLE       1
数据
df1

我针对第一列中重复的ID编写了如下代码:

  # For every parent_mol_chembl_id that occurs on more than one row, print the
  # columns whose non-missing values differ between those rows.
  mydata <- test3333
  allIds <- as.character(mydata$parent_mol_chembl_id)
  # duplicated() flags the 2nd, 3rd, ... occurrence of a value, so an id with
  # three rows would otherwise be listed (and processed) twice; unique() makes
  # each id appear once. NA ids are skipped because `==` can never match them.
  uniqueIds <- unique(allIds[duplicated(allIds) & !is.na(allIds)])
  # seq_along() gives an empty sequence when there are no duplicated ids,
  # whereas 1:length(uniqueIds) would iterate over c(1, 0).
  for (i in seq_along(uniqueIds)) {
    testIds <- uniqueIds[i]
    print(testIds)
    tmp <- mydata[which(allIds == testIds), , drop = FALSE]
    # Count distinct non-NA values column by column. vapply() inspects each
    # column in its own type; apply() would first coerce the whole data frame
    # to a character matrix of formatted values.
    countX <- vapply(
      tmp,
      function(x) length(unique(x[!is.na(x)])),
      integer(1)
    )
    indexDiff <- which(countX > 1)
    # drop = FALSE keeps the result a data frame (with its column name) even
    # when exactly one column differs, so no special case is needed.
    print(tmp[, indexDiff, drop = FALSE])
  }
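mydata <- test3333
uniqueIds = as.character(mydata[duplicated(as.character(mydata$parent_mol_chembl_id)),1])#unique(as.character(mydata$parent_mol_chembl_id))
for(i in 1:length(uniqueIds))
{
print(uniqueIds[i])
testIds = uniqueIds[i]
index = which(testIds==mydata[,1])
tmp=mydata[index,]
countX = apply(tmp,2,function(x) length(unique(x[!is.na(x)])))
length(which(countX!=1))
table(countX)
indexDiff=which(countX>1)
if(length(indexDiff)==1)
{
t=as.matrix(tmp[,indexDiff])
colnames(t)=colnames(tmp)[indexDiff]
print(t)
}else{
print(tmp[,indexDiff])
}
}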

我针对第一列中重复的ID编写了如下代码:

  # For every parent_mol_chembl_id that occurs on more than one row, print the
  # columns whose non-missing values differ between those rows.
  mydata <- test3333
  allIds <- as.character(mydata$parent_mol_chembl_id)
  # duplicated() flags the 2nd, 3rd, ... occurrence of a value, so an id with
  # three rows would otherwise be listed (and processed) twice; unique() makes
  # each id appear once. NA ids are skipped because `==` can never match them.
  uniqueIds <- unique(allIds[duplicated(allIds) & !is.na(allIds)])
  # seq_along() gives an empty sequence when there are no duplicated ids,
  # whereas 1:length(uniqueIds) would iterate over c(1, 0).
  for (i in seq_along(uniqueIds)) {
    testIds <- uniqueIds[i]
    print(testIds)
    tmp <- mydata[which(allIds == testIds), , drop = FALSE]
    # Count distinct non-NA values column by column. vapply() inspects each
    # column in its own type; apply() would first coerce the whole data frame
    # to a character matrix of formatted values.
    countX <- vapply(
      tmp,
      function(x) length(unique(x[!is.na(x)])),
      integer(1)
    )
    indexDiff <- which(countX > 1)
    # drop = FALSE keeps the result a data frame (with its column name) even
    # when exactly one column differs, so no special case is needed.
    print(tmp[, indexDiff, drop = FALSE])
  }
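mydata <- test3333
uniqueIds = as.character(mydata[duplicated(as.character(mydata$parent_mol_chembl_id)),1])#unique(as.character(mydata$parent_mol_chembl_id))
for(i in 1:length(uniqueIds))
{
print(uniqueIds[i])
testIds = uniqueIds[i]
index = which(testIds==mydata[,1])
tmp=mydata[index,]
countX = apply(tmp,2,function(x) length(unique(x[!is.na(x)])))
length(which(countX!=1))
table(countX)
indexDiff=which(countX>1)
if(length(indexDiff)==1)
{
t=as.matrix(tmp[,indexDiff])
colnames(t)=colnames(tmp)[indexDiff]
print(t)
}else{
print(tmp[,indexDiff])
}
}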
# Example data: two rows for each of two (parent_mol_chembl_id, parent_name)
# pairs, followed by four integer 0/1 indicator columns.
df1 <- data.frame(
  parent_mol_chembl_id = rep(c("CHEMBL1502", "CHEMBL790"), each = 2L),
  parent_name = rep(c("PANTOPRAZOLE", "DZOLE"), each = 2L),
  D000022 = integer(4),
  D000038 = integer(4),
  D000152 = c(0L, 0L, 1L, 0L),
  D000166 = c(0L, 1L, 1L, 1L),
  stringsAsFactors = FALSE
)
  # For every parent_mol_chembl_id that occurs on more than one row, print the
  # columns whose non-missing values differ between those rows.
  mydata <- test3333
  allIds <- as.character(mydata$parent_mol_chembl_id)
  # duplicated() flags the 2nd, 3rd, ... occurrence of a value, so an id with
  # three rows would otherwise be listed (and processed) twice; unique() makes
  # each id appear once. NA ids are skipped because `==` can never match them.
  uniqueIds <- unique(allIds[duplicated(allIds) & !is.na(allIds)])
  # seq_along() gives an empty sequence when there are no duplicated ids,
  # whereas 1:length(uniqueIds) would iterate over c(1, 0).
  for (i in seq_along(uniqueIds)) {
    testIds <- uniqueIds[i]
    print(testIds)
    tmp <- mydata[which(allIds == testIds), , drop = FALSE]
    # Count distinct non-NA values column by column. vapply() inspects each
    # column in its own type; apply() would first coerce the whole data frame
    # to a character matrix of formatted values.
    countX <- vapply(
      tmp,
      function(x) length(unique(x[!is.na(x)])),
      integer(1)
    )
    indexDiff <- which(countX > 1)
    # drop = FALSE keeps the result a data frame (with its column name) even
    # when exactly one column differs, so no special case is needed.
    print(tmp[, indexDiff, drop = FALSE])
  }