在R中从另一个数据框查询数据以添加新列
好的,我是R中数据框管理和操作的新手,在查询数据并向数据框添加信息以进行分析时遇到了困难。我有3个数据框:df1、df2、df3。在R中从另一个数据框查询数据以添加新列,r,database,merge,R,Database,Merge,好的,我是R中数据框管理和操作的新手,在查询数据并向数据框添加信息以进行分析时遇到了困难。我有3个数据框:df1、df2、df3。我想向df1添加以下列:用County与df2匹配,添加相应的面积(AREA);然后用ResZIP与df3的zip匹配,取得相应的纬度和经度。我知道需要在必要的地方更改列名,但不确定如何执行这些查询。我习惯使用Access,但不知道如何把这些操作转换到R中。我尝试过merge(合并),但得到了太多多余的信息。非常感谢您的帮助。这个查询可以运行,只是在您提供的示例数据中,df1里的县和邮编在df3中都没有对应的记录(FIPS和zip都匹配不上)。如果您了解SQL,那
我想向df1添加以下列:用County与df2匹配,添加相应的面积(AREA);然后用ResZIP与df3的zip匹配,取得相应的纬度和经度。我知道需要在必要的地方更改列名,但不确定如何执行这些查询。我习惯使用Access,但不知道如何把这些操作转换到R中。我尝试过merge(合并),但得到了太多多余的信息。非常感谢您的帮助。这个查询可以运行,只是在您提供的示例数据中,df1里的县和邮编在df3中都没有对应的记录(FIPS和zip都匹配不上)。如果您了解SQL,可以先在Access的SQL视图中查看对应的SQL语句,稍作转换后在R中通过sqldf执行。另外,如果您用dput函数导出数据并把输出贴出来,别人会更容易帮助您。
library(sqldf)
# Sample data for the lookup example.
# df1: one row per record (date parts, county, state, residence ZIP).
# Character-like columns are factors, numeric codes are integers.
df1 <- data.frame(
  Year = rep(2005L, 12L),
  HNo = rep(c(218050003L, 218050008L), times = c(2L, 10L)),
  Month = c(10L, 10L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L),
  Day = c(8L, 10L, 1L, 10L, 12L, 16L, 23L, 28L, 12L, 18L, 22L, 11L),
  County = factor(
    rep(c("MIDDLESEX", "NEWLONDON"), times = c(2L, 10L)),
    levels = c("MIDDLESEX", "NEWLONDON")
  ),
  ST = factor(rep("CT", 12L), levels = "CT"),
  ResState = factor(rep("CT", 12L), levels = "CT"),
  ResZIP = rep(c(6037L, 6355L), times = c(2L, 10L))
)

# df2: county lookup table (mixed-case county names, FIPS codes, AREA).
df2 <- data.frame(
  FID = c(590L, 591L, 593L, 594L, 642L, 647L, 651L, 652L),
  County = factor(
    c(
      "Litchfield", "Hartford", "Tolland", "Windham",
      "NewLondon", "Fairfield", "Middlesex", "NewHaven"
    ),
    levels = c(
      "Fairfield", "Hartford", "Litchfield", "Middlesex",
      "NewHaven", "NewLondon", "Tolland", "Windham"
    )
  ),
  STATE_NAME = factor(rep("Connecticut", 8L), levels = "Connecticut"),
  STATE_FIPS = rep(9L, 8L),
  CNTY_FIPS = c(5L, 3L, 13L, 15L, 11L, 1L, 7L, 9L),
  FIPS = c(9005L, 9003L, 9013L, 9015L, 9011L, 9001L, 9007L, 9009L),
  AREA = c(
    929.3449, 742.8998, 411.904, 522.058,
    706.352, 661.2935, 379.362, 623.3514
  )
)

# df3: ZIP lookup table (city, state, coordinates, county FIPS).
df3 <- data.frame(
  zip = c(6001L, 6002L, 6006L, 6010L, 6011L, 6030L, 6034L, 6045L, 6049L),
  city = factor(
    c(
      "Avon", "Bloomfield", "Windsor", "Bristol", "Bristol",
      "Farmington", "Farmington", "Manchester", "Melrose"
    ),
    levels = c(
      "Avon", "Bloomfield", "Bristol", "Farmington",
      "Manchester", "Melrose", "Windsor"
    )
  ),
  state = factor(rep("CT", 9L), levels = "CT"),
  latitude = c(41.7897, 41.8328, 41.87964, 41.68225, rep(41.79178, 5L)),
  longitude = c(-72.86431, -72.72642, -72.73427, -72.93365, rep(-72.71883, 5L)),
  fips = rep(9003L, 9L)
)
# Add AREA (from df2) and latitude/longitude (from df3) to every row of df1.
# - LEFT JOINs keep all df1 rows; a lookup with no match yields NA instead of
#   dropping the row, which is what the original INNER JOINs did.
# - County is compared case-insensitively because df1 stores upper-case names
#   ("MIDDLESEX") while df2 stores mixed case ("Middlesex").
# - df3 is matched on ZIP code (df1.ResZIP = df3.zip). Joining on FIPS instead
#   would return one row per ZIP that shares the county's FIPS code.
sqldf("
  SELECT tbl1.*, tbl2.AREA, tbl3.latitude, tbl3.longitude
  FROM df1 tbl1
  LEFT JOIN df2 tbl2 ON upper(tbl1.County) = upper(tbl2.County)
  LEFT JOIN df3 tbl3 ON tbl1.ResZIP = tbl3.zip
")
这个查询可以运行,只是在您提供的示例数据中,df1里的县和邮编在df3中都没有对应的记录(FIPS和zip都匹配不上)。如果您了解SQL,可以先在Access的SQL视图中查看对应的SQL语句,稍作转换后在R中通过sqldf执行。另外,如果您用dput函数导出数据并把输出贴出来,别人会更容易帮助您。
library(sqldf)
# Sample data for the lookup example.
# df1: one row per record (date parts, county, state, residence ZIP).
# Character-like columns are factors, numeric codes are integers.
df1 <- data.frame(
  Year = rep(2005L, 12L),
  HNo = rep(c(218050003L, 218050008L), times = c(2L, 10L)),
  Month = c(10L, 10L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L),
  Day = c(8L, 10L, 1L, 10L, 12L, 16L, 23L, 28L, 12L, 18L, 22L, 11L),
  County = factor(
    rep(c("MIDDLESEX", "NEWLONDON"), times = c(2L, 10L)),
    levels = c("MIDDLESEX", "NEWLONDON")
  ),
  ST = factor(rep("CT", 12L), levels = "CT"),
  ResState = factor(rep("CT", 12L), levels = "CT"),
  ResZIP = rep(c(6037L, 6355L), times = c(2L, 10L))
)

# df2: county lookup table (mixed-case county names, FIPS codes, AREA).
df2 <- data.frame(
  FID = c(590L, 591L, 593L, 594L, 642L, 647L, 651L, 652L),
  County = factor(
    c(
      "Litchfield", "Hartford", "Tolland", "Windham",
      "NewLondon", "Fairfield", "Middlesex", "NewHaven"
    ),
    levels = c(
      "Fairfield", "Hartford", "Litchfield", "Middlesex",
      "NewHaven", "NewLondon", "Tolland", "Windham"
    )
  ),
  STATE_NAME = factor(rep("Connecticut", 8L), levels = "Connecticut"),
  STATE_FIPS = rep(9L, 8L),
  CNTY_FIPS = c(5L, 3L, 13L, 15L, 11L, 1L, 7L, 9L),
  FIPS = c(9005L, 9003L, 9013L, 9015L, 9011L, 9001L, 9007L, 9009L),
  AREA = c(
    929.3449, 742.8998, 411.904, 522.058,
    706.352, 661.2935, 379.362, 623.3514
  )
)

# df3: ZIP lookup table (city, state, coordinates, county FIPS).
df3 <- data.frame(
  zip = c(6001L, 6002L, 6006L, 6010L, 6011L, 6030L, 6034L, 6045L, 6049L),
  city = factor(
    c(
      "Avon", "Bloomfield", "Windsor", "Bristol", "Bristol",
      "Farmington", "Farmington", "Manchester", "Melrose"
    ),
    levels = c(
      "Avon", "Bloomfield", "Bristol", "Farmington",
      "Manchester", "Melrose", "Windsor"
    )
  ),
  state = factor(rep("CT", 9L), levels = "CT"),
  latitude = c(41.7897, 41.8328, 41.87964, 41.68225, rep(41.79178, 5L)),
  longitude = c(-72.86431, -72.72642, -72.73427, -72.93365, rep(-72.71883, 5L)),
  fips = rep(9003L, 9L)
)
# Add AREA (from df2) and latitude/longitude (from df3) to every row of df1.
# - LEFT JOINs keep all df1 rows; a lookup with no match yields NA instead of
#   dropping the row, which is what the original INNER JOINs did.
# - County is compared case-insensitively because df1 stores upper-case names
#   ("MIDDLESEX") while df2 stores mixed case ("Middlesex").
# - df3 is matched on ZIP code (df1.ResZIP = df3.zip). Joining on FIPS instead
#   would return one row per ZIP that shares the county's FIPS code.
sqldf("
  SELECT tbl1.*, tbl2.AREA, tbl3.latitude, tbl3.longitude
  FROM df1 tbl1
  LEFT JOIN df2 tbl2 ON upper(tbl1.County) = upper(tbl2.County)
  LEFT JOIN df3 tbl3 ON tbl1.ResZIP = tbl3.zip
")
您可以试试 dplyr。它的"动词"(实际上是函数)很容易理解和使用。不过请注意,在所提供的示例数据中,df1 的邮编在 df3 中没有匹配的记录,而且 df1 与 df2 的县名大小写不一致。
library(dplyr)
# Add AREA (looked up by county) and latitude/longitude (looked up by ZIP)
# to df1. left_join() keeps every df1 row; lookups with no match stay NA.

# df1 stores county names in upper case ("MIDDLESEX") while df2 uses mixed
# case ("Middlesex"). Both keys are converted to upper-case character vectors
# before joining; without this no county matches and AREA is NA everywhere.
# As a consequence, County in the result is a character column.
res1 <- left_join(
  df1 %>% mutate(County = toupper(as.character(County))),
  df2 %>%
    mutate(County = toupper(as.character(County))) %>%
    select(County, AREA),
  by = "County"
)

# Match the residence ZIP against df3's zip column to pick up coordinates.
res2 <- left_join(
  res1,
  df3 %>% select(zip, latitude, longitude),
  by = c("ResZIP" = "zip")
)
res2
# Expected output for the sample data (ZIPs 6037 and 6355 are not present in
# df3, so latitude/longitude remain NA):
#    Year       HNo Month Day    County ST ResState ResZIP    AREA latitude longitude
# 1  2005 218050003    10   8 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 2  2005 218050003    10  10 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 3  2005 218050008     9   1 NEWLONDON CT       CT   6355 706.352       NA        NA
# 4  2005 218050008     9  10 NEWLONDON CT       CT   6355 706.352       NA        NA
# 5  2005 218050008     9  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 6  2005 218050008     9  16 NEWLONDON CT       CT   6355 706.352       NA        NA
# 7  2005 218050008     9  23 NEWLONDON CT       CT   6355 706.352       NA        NA
# 8  2005 218050008     9  28 NEWLONDON CT       CT   6355 706.352       NA        NA
# 9  2005 218050008    10  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 10 2005 218050008    10  18 NEWLONDON CT       CT   6355 706.352       NA        NA
# 11 2005 218050008    10  22 NEWLONDON CT       CT   6355 706.352       NA        NA
# 12 2005 218050008    11  11 NEWLONDON CT       CT   6355 706.352       NA        NA
您可以试试 dplyr。它的"动词"(实际上是函数)很容易理解和使用。不过请注意,在所提供的示例数据中,df1 的邮编在 df3 中没有匹配的记录,而且 df1 与 df2 的县名大小写不一致。
library(dplyr)
# Add AREA (looked up by county) and latitude/longitude (looked up by ZIP)
# to df1. left_join() keeps every df1 row; lookups with no match stay NA.

# df1 stores county names in upper case ("MIDDLESEX") while df2 uses mixed
# case ("Middlesex"). Both keys are converted to upper-case character vectors
# before joining; without this no county matches and AREA is NA everywhere.
# As a consequence, County in the result is a character column.
res1 <- left_join(
  df1 %>% mutate(County = toupper(as.character(County))),
  df2 %>%
    mutate(County = toupper(as.character(County))) %>%
    select(County, AREA),
  by = "County"
)

# Match the residence ZIP against df3's zip column to pick up coordinates.
res2 <- left_join(
  res1,
  df3 %>% select(zip, latitude, longitude),
  by = c("ResZIP" = "zip")
)
res2
# Expected output for the sample data (ZIPs 6037 and 6355 are not present in
# df3, so latitude/longitude remain NA):
#    Year       HNo Month Day    County ST ResState ResZIP    AREA latitude longitude
# 1  2005 218050003    10   8 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 2  2005 218050003    10  10 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 3  2005 218050008     9   1 NEWLONDON CT       CT   6355 706.352       NA        NA
# 4  2005 218050008     9  10 NEWLONDON CT       CT   6355 706.352       NA        NA
# 5  2005 218050008     9  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 6  2005 218050008     9  16 NEWLONDON CT       CT   6355 706.352       NA        NA
# 7  2005 218050008     9  23 NEWLONDON CT       CT   6355 706.352       NA        NA
# 8  2005 218050008     9  28 NEWLONDON CT       CT   6355 706.352       NA        NA
# 9  2005 218050008    10  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 10 2005 218050008    10  18 NEWLONDON CT       CT   6355 706.352       NA        NA
# 11 2005 218050008    10  22 NEWLONDON CT       CT   6355 706.352       NA        NA
# 12 2005 218050008    11  11 NEWLONDON CT       CT   6355 706.352       NA        NA
太好了,谢谢。有办法在两列上匹配吗?是的,有。建议将其作为单独的问题与代表性的样本数据一起发布。太好了,谢谢。有办法在两列上匹配吗?是的,有。但建议将其作为单独的问题发布,并提供具有代表性的样本数据。
library(dplyr)
# Add AREA (looked up by county) and latitude/longitude (looked up by ZIP)
# to df1. left_join() keeps every df1 row; lookups with no match stay NA.

# df1 stores county names in upper case ("MIDDLESEX") while df2 uses mixed
# case ("Middlesex"). Both keys are converted to upper-case character vectors
# before joining; without this no county matches and AREA is NA everywhere.
# As a consequence, County in the result is a character column.
res1 <- left_join(
  df1 %>% mutate(County = toupper(as.character(County))),
  df2 %>%
    mutate(County = toupper(as.character(County))) %>%
    select(County, AREA),
  by = "County"
)

# Match the residence ZIP against df3's zip column to pick up coordinates.
res2 <- left_join(
  res1,
  df3 %>% select(zip, latitude, longitude),
  by = c("ResZIP" = "zip")
)
res2
# Expected output for the sample data (ZIPs 6037 and 6355 are not present in
# df3, so latitude/longitude remain NA):
#    Year       HNo Month Day    County ST ResState ResZIP    AREA latitude longitude
# 1  2005 218050003    10   8 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 2  2005 218050003    10  10 MIDDLESEX CT       CT   6037 379.362       NA        NA
# 3  2005 218050008     9   1 NEWLONDON CT       CT   6355 706.352       NA        NA
# 4  2005 218050008     9  10 NEWLONDON CT       CT   6355 706.352       NA        NA
# 5  2005 218050008     9  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 6  2005 218050008     9  16 NEWLONDON CT       CT   6355 706.352       NA        NA
# 7  2005 218050008     9  23 NEWLONDON CT       CT   6355 706.352       NA        NA
# 8  2005 218050008     9  28 NEWLONDON CT       CT   6355 706.352       NA        NA
# 9  2005 218050008    10  12 NEWLONDON CT       CT   6355 706.352       NA        NA
# 10 2005 218050008    10  18 NEWLONDON CT       CT   6355 706.352       NA        NA
# 11 2005 218050008    10  22 NEWLONDON CT       CT   6355 706.352       NA        NA
# 12 2005 218050008    11  11 NEWLONDON CT       CT   6355 706.352       NA        NA