R:为dataframe中感兴趣的记录查找不相似的列
在名为 test3333 的数据框(2859 个观测值,4734 列)中,有 225 条记录在 parent_mol_chembl_id 和 parent_name 这两列上是重复的。我想找出这些重复行在哪些列上取值不同。我目前的做法是先运行
grep('CHEMBL1502',test3333[,1])
,然后检查不同的列范围以识别差异
grep('CHEMBL1502',test3333[,1])的所需输出
grep('CHEMBL790',test3333[,1])的所需输出
我想找出对于某个特定的 parent_mol_chembl_id,数据框中哪些列的取值不同。请问检查这些不同列的语法是什么?

我们可以按前两列对数据框进行拆分,然后根据每列中唯一值的个数对列进行筛选:只含一个唯一值的数值列会被删除,非数值列(即两个标识列)始终保留。
# Split the rows by the combination of the first two columns, then within
# each group keep every non-numeric column plus those numeric columns that
# hold more than one distinct value.
lapply(
  split(df1, df1[1:2], drop = TRUE),
  function(group_rows) {
    varies_or_label <- function(column) {
      !is.numeric(column) | length(unique(column)) > 1
    }
    Filter(varies_or_label, group_rows)
  }
)
#$CHEMBL790.DZOLE
# parent_mol_chembl_id parent_name D000152
#3 CHEMBL790 DZOLE 1
#4 CHEMBL790 DZOLE 0
#$CHEMBL1502.PANTOPRAZOLE
# parent_mol_chembl_id parent_name D000166
#1 CHEMBL1502 PANTOPRAZOLE 0
#2 CHEMBL1502 PANTOPRAZOLE 1
数据
df1
我们可以按前两列对数据框进行拆分,然后根据每列中唯一值的个数对列进行筛选:只含一个唯一值的数值列会被删除,非数值列(即两个标识列)始终保留。
# Split the rows by the combination of the first two columns, then within
# each group keep every non-numeric column plus those numeric columns that
# hold more than one distinct value.
lapply(
  split(df1, df1[1:2], drop = TRUE),
  function(group_rows) {
    varies_or_label <- function(column) {
      !is.numeric(column) | length(unique(column)) > 1
    }
    Filter(varies_or_label, group_rows)
  }
)
#$CHEMBL790.DZOLE
# parent_mol_chembl_id parent_name D000152
#3 CHEMBL790 DZOLE 1
#4 CHEMBL790 DZOLE 0
#$CHEMBL1502.PANTOPRAZOLE
# parent_mol_chembl_id parent_name D000166
#1 CHEMBL1502 PANTOPRAZOLE 0
#2 CHEMBL1502 PANTOPRAZOLE 1
数据
df1
我针对第一列中重复出现的 ID 做了如下处理:
# For every ID that occurs more than once in the first column of `test3333`,
# print the ID followed by the columns whose non-missing values differ
# between the rows sharing that ID.
# `test3333` is not created here; it must already exist in the session.
mydata <- test3333
# NOTE(review): duplicates are detected on `parent_mol_chembl_id` while rows
# are matched on column 1, as before; this presumes they are the same
# column -- confirm.
idColumn <- as.character(mydata[[1]])
isRepeat <- duplicated(as.character(mydata$parent_mol_chembl_id))
# unique(): duplicated() flags every repeat, so an ID present three or more
# times would otherwise be reported more than once.
uniqueIds <- unique(idColumn[isRepeat])
# A missing ID never compares equal with `==`, so it cannot select any rows.
uniqueIds <- uniqueIds[!is.na(uniqueIds)]
# seq_along() yields an empty sequence when nothing is duplicated, whereas
# 1:length(uniqueIds) would iterate over c(1, 0).
for (i in seq_along(uniqueIds)) {
  print(uniqueIds[i])
  index <- which(idColumn == uniqueIds[i])
  tmp <- mydata[index, , drop = FALSE]
  # Count distinct non-missing values per column. vapply() inspects each
  # column in its own type; apply() would first coerce the whole data frame
  # to a character matrix.
  countX <- vapply(tmp, function(x) length(unique(x[!is.na(x)])), integer(1))
  indexDiff <- which(countX > 1)
  # drop = FALSE keeps a data frame, with its column name and original row
  # names, even when a single column differs.
  print(tmp[, indexDiff, drop = FALSE])
}
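mydata <- test3333
uniqueIds = as.character(mydata[duplicated(as.character(mydata$parent_mol_chembl_id)),1])#unique(as.character(mydata$parent_mol_chembl_id))
for(i in 1:length(uniqueIds))
{
print(uniqueIds[i])
testIds = uniqueIds[i]
index = which(testIds==mydata[,1])
tmp=mydata[index,]
countX = apply(tmp,2,function(x) length(unique(x[!is.na(x)])))
length(which(countX!=1))
table(countX)
indexDiff=which(countX>1)
if(length(indexDiff)==1)
{
t=as.matrix(tmp[,indexDiff])
colnames(t)=colnames(tmp)[indexDiff]
print(t)
}else{
print(tmp[,indexDiff])
}
}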
我针对第一列中重复出现的 ID 做了如下处理:
# For every ID that occurs more than once in the first column of `test3333`,
# print the ID followed by the columns whose non-missing values differ
# between the rows sharing that ID.
# `test3333` is not created here; it must already exist in the session.
mydata <- test3333
# NOTE(review): duplicates are detected on `parent_mol_chembl_id` while rows
# are matched on column 1, as before; this presumes they are the same
# column -- confirm.
idColumn <- as.character(mydata[[1]])
isRepeat <- duplicated(as.character(mydata$parent_mol_chembl_id))
# unique(): duplicated() flags every repeat, so an ID present three or more
# times would otherwise be reported more than once.
uniqueIds <- unique(idColumn[isRepeat])
# A missing ID never compares equal with `==`, so it cannot select any rows.
uniqueIds <- uniqueIds[!is.na(uniqueIds)]
# seq_along() yields an empty sequence when nothing is duplicated, whereas
# 1:length(uniqueIds) would iterate over c(1, 0).
for (i in seq_along(uniqueIds)) {
  print(uniqueIds[i])
  index <- which(idColumn == uniqueIds[i])
  tmp <- mydata[index, , drop = FALSE]
  # Count distinct non-missing values per column. vapply() inspects each
  # column in its own type; apply() would first coerce the whole data frame
  # to a character matrix.
  countX <- vapply(tmp, function(x) length(unique(x[!is.na(x)])), integer(1))
  indexDiff <- which(countX > 1)
  # drop = FALSE keeps a data frame, with its column name and original row
  # names, even when a single column differs.
  print(tmp[, indexDiff, drop = FALSE])
}
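mydata <- test3333
uniqueIds = as.character(mydata[duplicated(as.character(mydata$parent_mol_chembl_id)),1])#unique(as.character(mydata$parent_mol_chembl_id))
for(i in 1:length(uniqueIds))
{
print(uniqueIds[i])
testIds = uniqueIds[i]
index = which(testIds==mydata[,1])
tmp=mydata[index,]
countX = apply(tmp,2,function(x) length(unique(x[!is.na(x)])))
length(which(countX!=1))
table(countX)
indexDiff=which(countX>1)
if(length(indexDiff)==1)
{
t=as.matrix(tmp[,indexDiff])
colnames(t)=colnames(tmp)[indexDiff]
print(t)
}else{
print(tmp[,indexDiff])
}
}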
# Example data: four rows, i.e. two IDs that each appear twice, with two
# identifier columns followed by four integer indicator columns.
df1 <- data.frame(
  parent_mol_chembl_id = rep(c("CHEMBL1502", "CHEMBL790"), each = 2),
  parent_name = rep(c("PANTOPRAZOLE", "DZOLE"), each = 2),
  D000022 = rep(0L, 4),
  D000038 = rep(0L, 4),
  D000152 = c(0L, 0L, 1L, 0L),
  D000166 = c(0L, 1L, 1L, 1L),
  # Keep the identifier columns as character on R versions before 4.0 too.
  stringsAsFactors = FALSE
)
# For every ID that occurs more than once in the first column of `test3333`,
# print the ID followed by the columns whose non-missing values differ
# between the rows sharing that ID.
# `test3333` is not created here; it must already exist in the session.
mydata <- test3333
# NOTE(review): duplicates are detected on `parent_mol_chembl_id` while rows
# are matched on column 1, as before; this presumes they are the same
# column -- confirm.
idColumn <- as.character(mydata[[1]])
isRepeat <- duplicated(as.character(mydata$parent_mol_chembl_id))
# unique(): duplicated() flags every repeat, so an ID present three or more
# times would otherwise be reported more than once.
uniqueIds <- unique(idColumn[isRepeat])
# A missing ID never compares equal with `==`, so it cannot select any rows.
uniqueIds <- uniqueIds[!is.na(uniqueIds)]
# seq_along() yields an empty sequence when nothing is duplicated, whereas
# 1:length(uniqueIds) would iterate over c(1, 0).
for (i in seq_along(uniqueIds)) {
  print(uniqueIds[i])
  index <- which(idColumn == uniqueIds[i])
  tmp <- mydata[index, , drop = FALSE]
  # Count distinct non-missing values per column. vapply() inspects each
  # column in its own type; apply() would first coerce the whole data frame
  # to a character matrix.
  countX <- vapply(tmp, function(x) length(unique(x[!is.na(x)])), integer(1))
  indexDiff <- which(countX > 1)
  # drop = FALSE keeps a data frame, with its column name and original row
  # names, even when a single column differs.
  print(tmp[, indexDiff, drop = FALSE])
}