R 根据列对数据帧进行子集设置
我正在尝试从一个大的数据帧中生成一个较小的数据帧,筛选依据是两个数据帧的“row.ID”列中相互匹配的行。
标签:r, merge, dataframe, match, rows
我一直在尝试使用 match()、subset() 和 merge(),但始终没有得到我需要的结果。
下面是我的数据帧的样子:
# Example data frame `file1`, written out as a literal structure() call:
# 23 rows and 17 columns (row.ID, Col0, V1 ... V15).
# row.ID is the key column that is matched against file2$row.ID further down.
file1<- structure(list(row.ID = c(1, 22, 51, 31, 231, 21, 551, 13, 10,
11, 12, 83, 84, 86, 87, 89, 120, 91, 92, 311, 94, 187, 98), Col0 = c(1,
2, 3, 4, 5, 6, 12, 13, 15, 16, 18, 126, 128, 131, 132, 135, 136,
137, 139, 140, 141, 143, 148), V1 = c(238.27, 294.39, 413.3,
853.24, 7.06, 8.987, 41.73, 39.232, 11.151, 13.472, 8.041, 8.057,
7.961, 7.43, 8.047, 334.54, 229.03, 265.36, 354.49, 151.25, 237.75,
901.24, 280.27), V2 = c(7.686, 6.846, 10.08, 6.666, 26.741, 29.358,
12.885, 11.982, 22.898, 12.75, 21.041, 28.87, 12.316, 19.778,
71.023, 11.151, 13.472, 8.041, 8.057, 7.961, 7.43, 8.047, 9.342
), V3 = c(19.063, 25.17, 29.626, 79.233, 38.952, 42.658, 13.015,
12.244, 30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
22.898, 12.75, 21.041, 28.87, 12.316, 19.778, 71.023, 21.963),
V4 = c(31.814, 43.349, 42.989, 125.904, 28.853, 30.392, 16.483,
16.335, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
30.905), V5 = c(19.398, 26.443, 29.687, 85.433, 43.737, 46.906,
12.413, 12.409, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973,
124.988, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
21.904), V6 = c(35.325, 48.986, 45.76, 334.54, 0.75, 12,
241.34, 258.34, 282.4, 377.46, 30.392, 16.483, 0.648, 0.618,
0.634, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973, 124.988,
33.416), V7 = c(0.615, 294.39, 413.3, 1.001, 1.051, 17, 1.011,
0.985, 0.974, 1.016, 46.906, 12.413, 377.46, 500.76, 470.78,
334.54, 0.75, 0.638, 0.656, 0.648, 0.618, 0.634, 0.732),
V8 = c(1.026, 1.008, 1.049, 10, 21, 12, 227.31, 241.34, 258.34,
282.4, 377.46, 500.76, 1.016, 1.085, 1.02, 1.001, 1.051,
1.01, 1.001, 0.985, 0.994, 1.011, 1.03), V9 = c(0.626, 46.906,
12.413, 12.409, 32.337, 18.715, 17, 0.678, 0.664, 0.656,
0.723, 0.721, 0.724, 1.374, 1.361, 0.855, 0.765, 0.677, 0.698,
0.721, 0.669, 0.677, 0.73), V10 = c(1.14, 377.46, 500.76,
470.78, 334.54, 0.75, 12, 241.34, 258.34, 282.4, 377.46,
30.392, 16.483, 16.335, 25.648, 13, 0.648, 0.618, 0.634,
32.337, 18.715, 36.953, 49.575), V11 = c(31, 1.016, 1.085,
1.02, 1.001, 1.051, 17, 1.011, 0.985, 0.974, 1.016, 46.906,
12.413, 12.409, 32.337, 18.715, 377.46, 500.76, 470.78, 334.54,
0.75, 0.638, 0.656), V12 = c(17, 32, 30, 12, 10, 21, 12,
227.31, 241.34, 258.34, 282.4, 377.46, 500.76, 470.78, 334.54,
0.75, 1.016, 1.085, 1.02, 1.001, 1.051, 1.01, 1.001), V13 = c(31,
43, 43, 132, 21, 0.99, 1, 1.016, 1.011, 0.985, 0.974, 1.016,
1.085, 1.02, 1.001, 1.051, 0.724, 1.374, 1.361, 0.855, 0.765,
0.677, 0.698), V14 = c(17, 21, 31, 0.985, 0.974, 1.016, 1.085,
9, 16, 17, 23, 32, 30, 12, 10, 21, 16.483, 16.335, 25.648,
13, 1.101, 1.12, 1.127), V15 = c(9, 9, 25, 17, 23, 32, 30,
0, 8, 8, 21, 6, 21, 6, 6, 7, 12.413, 12.409, 32.337, 18.715,
17, 33, 44)), .Names = c("row.ID", "Col0", "V1", "V2", "V3",
"V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13",
"V14", "V15"), row.names = c(NA, 23L), class = "data.frame")
# Example data frame `file2`: 3 rows keyed by row.ID, plus six further columns.
file2 <- structure(
  list(
    row.ID = c(83, 94, 98),
    X = c(1077, 1171, 1205),
    V1 = c(-1.278147106, -0.895961572, -1.491168551),
    # Single-level factors: every row is "NO" and "YES" respectively.
    NO.YES = structure(rep(1L, 3L), .Label = "NO", class = "factor"),
    YES.NO = structure(rep(1L, 3L), .Label = "YES", class = "factor"),
    P.Y = c(0.168275205, 0.264104166, 0.128155717),
    P.NO = c(0.831724795, 0.735895834, 0.871844283)
  ),
  .Names = c("row.ID", "X", "V1", "NO.YES", "YES.NO", "P.Y", "P.NO"),
  row.names = c(NA, 3L),
  class = "data.frame"
)
我主要是尝试merge(),比如
# Join file1 and file2 on their shared row.ID column; the result carries the
# columns of both inputs for the matching rows.
mylist <- merge(x = file1, y = file2, by = "row.ID")
下面的第一段代码返回的是 file1 中 row.ID 出现在 file2 的 row.ID 里的那些行。因此,它不会像 merge 命令那样把 file2 的列也一并给出。您可以像下面这样,对 k2 使用类似的代码,得到 file2 中 row.ID 与 file1 相匹配的行。
# Rows of file1 whose row.ID also occurs in file2 (all file1 columns kept).
k1 <- file1[file1[["row.ID"]] %in% file2[["row.ID"]], , drop = FALSE]
> k1
row.ID Col0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
12 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6
21 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17
23 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44
# Rows of file2 whose row.ID also occurs in file1 (all file2 columns kept).
k2 <- file2[file2[["row.ID"]] %in% file1[["row.ID"]], , drop = FALSE]
> k2
row.ID X V1 NO.YES YES.NO P.Y P.NO
1 83 1077 -1.2781471 NO YES 0.1682752 0.8317248
2 94 1171 -0.8959616 NO YES 0.2641042 0.7358958
3 98 1205 -1.4911686 NO YES 0.1281557 0.8718443
下面是@James Pringle使用data.tables的回答(在上面的评论中):
# Keyed-join approach with data.table: look up the row.ID values of file2 in
# file1 and return the corresponding file1 rows.
library(data.table)  # provides data.table(), setkey() and J() used below

file1dt <- data.table(file1)
file2dt <- data.table(file2)
# Key file1dt on row.ID so that i = J(...) is matched against that column.
setkey(file1dt, row.ID)
# NOTE(review): with the default `nomatch`, IDs that are absent from file1dt
# would come back as NA rows -- confirm that is acceptable for your data.
file1dt[J(file2dt$row.ID)]
请解释“很多麻烦”的含义。该命令对我有效。
看起来您只是用 file2 按 row.ID 来选择要保留的行。所以,可以用一点 R 魔法:merge(file1, data.frame(row.ID = file2[, 1]), by = "row.ID")(魔法在于让数据帧的单独一列仍然保持为数据帧)。
哇,你用 merge 做到了!真有趣!我会继续使用 @Metrics 提供的简单命令,但这对学习非常有帮助。谢谢mateoh,我从来都没弄懂 %in% 这个运算符,每次需要用它时总是失败!谢谢 :)
列名被更改,并且匹配行中同时出现两个文件的列,这是一个很大的麻烦,正如您在答案底部的注释中提到的那样。我的数据要与其他数据文件反复比较,所以我需要列名保持相同。
您可以查看 %in% 的用法。您可以使用 Map 函数一次比较所有数据文件。这会很容易,但如果您把它作为另一个问题发布,也可能从其他人那里得到答案。我的意思是,您不需要一次又一次地运行此命令。
我会去读一读有关该函数的内容,我还不知道它的作用 :D Cheers :)
> merge(file1,file2,"row.ID")
row.ID Col0 V1.x V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 X V1.y NO.YES
1 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6 1077 -1.2781471 NO
2 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17 1171 -0.8959616 NO
3 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44 1205 -1.4911686 NO
YES.NO P.Y P.NO
1 YES 0.1682752 0.8317248
2 YES 0.2641042 0.7358958
3 YES 0.1281557 0.8718443
# Keyed-join approach with data.table: look up the row.ID values of file2 in
# file1 and return the corresponding file1 rows.
library(data.table)  # provides data.table(), setkey() and J() used below

file1dt <- data.table(file1)
file2dt <- data.table(file2)
# Key file1dt on row.ID so that i = J(...) is matched against that column.
setkey(file1dt, row.ID)
# NOTE(review): with the default `nomatch`, IDs that are absent from file1dt
# would come back as NA rows -- confirm that is acceptable for your data.
file1dt[J(file2dt$row.ID)]