R 基于来自不同数据帧的分组值更新数据帧
我有一个dataframe,它的列需要根据查找表进行更新。查找表基于设置的分组。如果未找到匹配项,则要更新的值将保留为空 这是我的输入数据:R 基于来自不同数据帧的分组值更新数据帧,r,data.table,dplyr,R,Data.table,Dplyr,我有一个dataframe,它的列需要根据查找表进行更新。查找表基于设置的分组。如果未找到匹配项,则要更新的值将保留为空 这是我的输入数据: dput(DF_Generated) structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", "P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456", "P3456", NA, NA), SO_ID = c("S1",
dput(DF_Generated)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
NA), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", "N2",
NA, "N3", NA, "N4", NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), .Names = c("PO_ID", "SO_ID", "F_Year",
"Product_ID", "Revenue", "Quantity", "Location1", "Name"))
dput(DF_Lookup_2)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P2345", "P2345", "P3456", NA), SO_ID = c("S1", "S1",
"S1", "S2", "S2", "S3", "S4", "S7", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2011, 2011, 2014, 2015), Location1 = c("MA",
"NY", "WA", "NY", "WA", "IL", "IL", "MN", "CA"), Name = c("N1",
"N1", "N1", "N1", "N1", "N2", "N2", "N3", "N4")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L), .Names = c("PO_ID",
"SO_ID", "F_Year", "Location1", "Name"))
dput(DFO)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
"CA"), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2",
"N2", NA, "N3", NA, "N4", "N4")), .Names = c("PO_ID", "SO_ID",
"F_Year", "Product_ID", "Revenue", "Quantity", "Location1", "Name"
), row.names = c(NA, 14L), class = "data.frame")
这是我的查找表:
dput(DF_Generated)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
NA), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", "N2",
NA, "N3", NA, "N4", NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), .Names = c("PO_ID", "SO_ID", "F_Year",
"Product_ID", "Revenue", "Quantity", "Location1", "Name"))
dput(DF_Lookup_2)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P2345", "P2345", "P3456", NA), SO_ID = c("S1", "S1",
"S1", "S2", "S2", "S3", "S4", "S7", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2011, 2011, 2014, 2015), Location1 = c("MA",
"NY", "WA", "NY", "WA", "IL", "IL", "MN", "CA"), Name = c("N1",
"N1", "N1", "N1", "N1", "N2", "N2", "N3", "N4")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L), .Names = c("PO_ID",
"SO_ID", "F_Year", "Location1", "Name"))
dput(DFO)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
"CA"), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2",
"N2", NA, "N3", NA, "N4", "N4")), .Names = c("PO_ID", "SO_ID",
"F_Year", "Product_ID", "Revenue", "Quantity", "Location1", "Name"
), row.names = c(NA, 14L), class = "data.frame")
预期输出为:
dput(DF_Generated)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
NA), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", "N2",
NA, "N3", NA, "N4", NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), .Names = c("PO_ID", "SO_ID", "F_Year",
"Product_ID", "Revenue", "Quantity", "Location1", "Name"))
dput(DF_Lookup_2)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P2345", "P2345", "P3456", NA), SO_ID = c("S1", "S1",
"S1", "S2", "S2", "S3", "S4", "S7", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2011, 2011, 2014, 2015), Location1 = c("MA",
"NY", "WA", "NY", "WA", "IL", "IL", "MN", "CA"), Name = c("N1",
"N1", "N1", "N1", "N1", "N2", "N2", "N3", "N4")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L), .Names = c("PO_ID",
"SO_ID", "F_Year", "Location1", "Name"))
dput(DFO)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
"CA"), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2",
"N2", NA, "N3", NA, "N4", "N4")), .Names = c("PO_ID", "SO_ID",
"F_Year", "Product_ID", "Revenue", "Quantity", "Location1", "Name"
), row.names = c(NA, 14L), class = "data.frame")
逻辑:
查找基于三列:PO_ID
,SO_ID
,F_Year
。如果找到匹配项,则仅当数据帧需要更新时才会覆盖条目。例如,列Location1
和Name
的行不应被覆盖,因为它们的值已经存在于查找表中。但是,Location1
和Name
列中的行条目,例如PO_ID
=NA
,SO_ID
=S10
和F_Year
=2015
需要使用查找表中的有效值进行更新,分别为CA
和N4
。我尝试使用data.table
,但我下面的代码覆盖了所有条目,这是不正确的
我读了线程,并试图这样做,但代码覆盖了我不需要查找的现有条目
这是我的代码:
# Convert both data frames to data.table by reference.
data.table::setDT(DF_Generated)
data.table::setDT(DF_Lookup_2)
# Key both tables on the three lookup columns.
data.table::setkey(DF_Generated,PO_ID,SO_ID,F_Year)
data.table::setkey(DF_Lookup_2,PO_ID,SO_ID,F_Year)
# Update join: every DF_Generated row that matches a DF_Lookup_2 row on the
# three key columns gets Location1 and Name assigned from the lookup table
# (i.Location1, i.Name). The assignment is unconditional, so matched rows that
# already hold a value are overwritten too, not only the rows that are NA.
DF_Generated[DF_Lookup_2,on=c("PO_ID","SO_ID","F_Year"),c("Location1","Name"):=list(i.Location1,i.Name)]
我有两个问题:
问题1)我使用的是data.table
,因为我的实际数据很大。因此,我正在寻找基于data.table
的解决方案。如何修复我的data.table代码?
问题2)如果建议这样做,我也愿意接受dplyr
不过,为了便于学习,如果你能同时给出这两种解决方案,我将不胜感激。我是一名初学者,仍在学习这两个软件包。我们可以在按'PO_ID'、'SO_ID'和'F_Year'进行连接(join on)时创建两个新列,然后使用set
# Fill only the missing Location1 / Name values of DF_Generated from
# DF_Lookup_2, matching on PO_ID, SO_ID and F_Year.
# Assumes data.table is attached (library(data.table)) and that DF_Generated,
# DF_Lookup_2 and DFO exist as defined earlier.

# Update join: store the looked-up values in temporary columns (suffix "N")
# instead of writing them straight into Location1 / Name.
setDT(DF_Generated)[
  setDT(DF_Lookup_2),
  c("Location1N", "NameN") := list(i.Location1, i.Name),
  on = .(PO_ID, SO_ID, F_Year)
]

nm1 <- c("Location1", "Name")  # columns to fill
nm2 <- paste0(nm1, "N")        # matching temporary lookup columns

# Copy the looked-up value into the original column only where the original
# is NA, so existing entries are never overwritten.
for (j in seq_along(nm1)) {
  # Compute the NA rows once and use the same index on both sides. The
  # original indexed the RHS with an undefined object (`df1`).
  na_rows <- which(is.na(DF_Generated[[nm1[j]]]))
  # Nothing to fill: skip, so set() is never handed a zero-length RHS.
  if (length(na_rows) > 0L) {
    set(DF_Generated, i = na_rows, j = nm1[j],
        value = DF_Generated[[nm2[j]]][na_rows])
  }
}

# Drop the temporary columns and print the result.
DF_Generated[, (nm2) := NULL][]

# Compare with the expected output.
identical(setDT(DFO), DF_Generated)
#[1] TRUE
setDT(DF_Generated)[setDT(DF_Lookup_2), c("Location1N", "NameN") := list(i.Location1,
i.Name), on = .(PO_ID, SO_ID, F_Year)]
nm1这样做的另一个方法是(根据条件)连接到DF_Lookup_2
,然后分配回DF_Generated
。这样做的另一个原因是,在X[Y]
join中,生成的join的长度为Y
,因此,类型为DF_Lookup_2[DF_Generated]
的join将为我们提供所需的长度。然后,我们可以按原样将其放回DF_Generated
# Assumes DF_Generated and DF_Lookup_2 are already data.tables.

# Rows in which at least one of the two target columns is missing.
needs_fill <- DF_Generated[, is.na(Location1) | is.na(Name)]

# Look those rows up in DF_Lookup_2 on the three key columns; the join is
# driven by the selected DF_Generated rows.
lookup_vals <- DF_Lookup_2[
  DF_Generated[needs_fill],
  .(Location1, Name),
  on = .(PO_ID, SO_ID, F_Year)
]

# Assign the looked-up pair back by reference. Both columns of every selected
# row are replaced, including by NA when the lookup has no match.
DF_Generated[needs_fill, c("Location1", "Name") := lookup_vals]

identical(DF_Generated, setDT(DFO))
## [1] TRUE
@akrun - 谢谢你的帮助。如果运行上述代码,您将看到Location1
将全部变为WA
(与原始数据集DF_Generated
不同)。请在Location1
和Name
列中查看预期输出。这有帮助吗?可能我没有理解你的问题。非常感谢你的帮助。如果你不介意的话,我有两个后续问题:a)你认为dplyr
会更简单吗?b)你认为这段代码的性能会好吗?我这样问是因为在我的原始数据集中,我有大约100万行,28列。我还没有在原始数据集上运行您的代码,因为定制上述代码需要时间。我是一个初学者,没有太多的头绪。因此,我很感激您的想法。@watchtower 在tidyverse
中有一个coalesce
函数,但是如果有重复项,那么左连接将不起作用。谢谢。恕我直言,我不知道为什么会出错:“data.table::set(DF_Generated, i = is.na(DF_Generated[[nm1[j]]]), ...) 中的错误:分配给现有列'Location1'的RHS的长度为零,但不为NULL。如果要删除该列,请使用NULL。否则,RHS的长度必须大于0;例如,NA_integer_。如果试图将列类型更改为空列表列,则与所有列类型更改一样,请提供完整长度的RHS向量,如vector('list',nrow(DT));即将其'plonk'到新列中。”
知道为什么会发生这种情况吗?@watchtower 这是基于您帖子中的示例还是原始数据集?这是由于软件包版本吗?我使用的是data.table_1.10.0。@watchtower 如果在新版本中出现问题,请使用David Arenburg的代码