R 使用查找表向数据帧添加多列_R

R 使用查找表向数据帧添加多列

R 使用查找表向数据帧添加多列,r,R,我有一个数据表，希望使用查找表对其进行修改。我想循环数据中的代码列，并根据datayear列匹配和代码列名称，为每个列添加一个新的对应value列，该列与查找表的字段列中的正确行值相匹配我已经尝试使用lappy和left_join，但我无法确定如何使用数据列名称引用查找的字段中的右侧值。我还考虑了查找表在宽格式中是否更好，这样至少可以有匹配的列名，但我仍然无法生成可行的函数示例数据和所需输出：数据（编辑：实际数据将包含更多的代码列）：查找表： structure(list(datayea

我有一个数据表，希望使用查找表对其进行修改。我想循环数据中的代码列，并根据

datayear

列匹配和代码列名称，为每个列添加一个新的对应

value

列，该列与查找表的

字段

列中的正确行值相匹配

我已经尝试使用 lapply 和 left_join，但我无法确定如何用数据的列名去引用查找表

field 列中对应的正确值。我还考虑过把查找表转换为宽格式是否更好，这样至少可以有匹配的列名，但我仍然无法写出可行的函数。
示例数据和所需输出：
数据（编辑：实际数据将包含更多的代码列）：
查找表：
# Lookup table in long form: one row per (datayear, field, code) giving the
# label that the code stands for in that year.
data.frame(
  datayear = rep(c(2007L, 2008L), each = 10L),
  field = factor(
    rep(rep(c("nationalitycode", "subjectcode"), each = 5L), times = 2L),
    levels = c("nationalitycode", "subjectcode")
  ),
  code = rep(1:5, times = 4L),
  lookupvalue = factor(
    c(
      "Irish", "Scottish", "Indian", "Chinese", "English",
      "Art", "Geography", "Maths", "Algebra", "P.E.",
      "English", "Scottish", "Chinese", "French", "Italian",
      "Sewing", "Beekeeping", "Rivetting", "H.E.", "Latin"
    ),
    levels = c(
      "Algebra", "Art", "Beekeeping", "Chinese", "English", "French",
      "Geography", "H.E.", "Indian", "Irish", "Italian", "Latin", "Maths",
      "P.E.", "Rivetting", "Scottish", "Sewing"
    )
  )
)

   datayear           field code lookupvalue
1      2007 nationalitycode    1       Irish
2      2007 nationalitycode    2    Scottish
3      2007 nationalitycode    3      Indian
4      2007 nationalitycode    4     Chinese
5      2007 nationalitycode    5     English
6      2007     subjectcode    1         Art
7      2007     subjectcode    2   Geography
8      2007     subjectcode    3       Maths
9      2007     subjectcode    4     Algebra
10     2007     subjectcode    5        P.E.
11     2008 nationalitycode    1     English
12     2008 nationalitycode    2    Scottish
13     2008 nationalitycode    3     Chinese
14     2008 nationalitycode    4      French
15     2008 nationalitycode    5     Italian
16     2008     subjectcode    1      Sewing
17     2008     subjectcode    2  Beekeeping
18     2008     subjectcode    3   Rivetting
19     2008     subjectcode    4        H.E.
20     2008     subjectcode    5       Latin

期望输出：
   id datayear nationalitycode subjectcode nationalityvalue subjectvalue
1   1     2007               1           2            Irish    Geography
2   2     2007               1           5            Irish         P.E.
3   3     2007               1           5            Irish         P.E.
4   4     2007               2           5         Scottish         P.E.
5   5     2007               3           2           Indian    Geography
6   6     2008               5           5          Italian        Latin
7   7     2008               4           4           French         H.E.
8   8     2008               3           2          Chinese   Beekeeping
9   9     2008               2           1         Scottish       Sewing
10 10     2008               1           4          English         H.E.

非常感谢您的帮助
 诀窍是基于查找表的适当子集进行连接。也就是说，先按正确的 field 值筛选出子集，再进行连接
library(dplyr)

# Sample records: ten ids across two data years, each carrying the integer
# nationality and subject codes that are decoded through the lookup table.
dt1 <- data.frame(
  id = 1:10,
  datayear = rep(c(2007L, 2008L), each = 5L),
  nationalitycode = c(1L, 1L, 1L, 2L, 3L, 5L, 4L, 3L, 2L, 1L),
  subjectcode = c(2L, 5L, 5L, 5L, 2L, 5L, 4L, 2L, 1L, 4L)
)


# Lookup table in long form: one row per (datayear, field, code) giving the
# label that the code stands for in that year. `field` names the code column
# of dt1 that the row applies to.
dt2 <- data.frame(
  datayear = rep(c(2007L, 2008L), each = 10L),
  field = factor(
    rep(rep(c("nationalitycode", "subjectcode"), each = 5L), times = 2L),
    levels = c("nationalitycode", "subjectcode")
  ),
  code = rep(1:5, times = 4L),
  lookupvalue = factor(
    c(
      "Irish", "Scottish", "Indian", "Chinese", "English",
      "Art", "Geography", "Maths", "Algebra", "P.E.",
      "English", "Scottish", "Chinese", "French", "Italian",
      "Sewing", "Beekeeping", "Rivetting", "H.E.", "Latin"
    ),
    levels = c(
      "Algebra", "Art", "Beekeeping", "Chinese", "English", "French",
      "Geography", "H.E.", "Indian", "Irish", "Italian", "Latin", "Maths",
      "P.E.", "Rivetting", "Scottish", "Sewing"
    )
  )
)


# Decode each code column by left-joining dt1 against the slice of the lookup
# table that belongs to that column. The lookup columns are renamed up front
# so the join keys line up by name and no .x/.y suffixes need cleaning up.
dt1 %>%
  left_join(
    dt2 %>%
      filter(field == "nationalitycode") %>%
      select(datayear, nationalitycode = code, nationalityvalue = lookupvalue),
    by = c("datayear", "nationalitycode")
  ) %>%
  left_join(
    dt2 %>%
      filter(field == "subjectcode") %>%
      select(datayear, subjectcode = code, subjectvalue = lookupvalue),
    by = c("datayear", "subjectcode")
  )

#    id datayear nationalitycode subjectcode nationalityvalue subjectvalue
# 1   1     2007               1           2            Irish    Geography
# 2   2     2007               1           5            Irish         P.E.
# 3   3     2007               1           5            Irish         P.E.
# 4   4     2007               2           5         Scottish         P.E.
# 5   5     2007               3           2           Indian    Geography
# 6   6     2008               5           5          Italian        Latin
# 7   7     2008               4           4           French         H.E.
# 8   8     2008               3           2          Chinese   Beekeeping
# 9   9     2008               2           1         Scottish       Sewing
# 10 10     2008               1           4          English         H.E.

对于使用循环要求的更一般的情况，我需要重新调整查找表的形状，以便使用列名。该过程将自动检测查找表中有多少唯一字段，并使用for循环（顺序）执行连接
library(dplyr)
library(tidyr)

# Sample records: ten ids across two data years, each carrying the integer
# nationality and subject codes that are decoded through the lookup table.
dt1 <- data.frame(
  id = 1:10,
  datayear = rep(c(2007L, 2008L), each = 5L),
  nationalitycode = c(1L, 1L, 1L, 2L, 3L, 5L, 4L, 3L, 2L, 1L),
  subjectcode = c(2L, 5L, 5L, 5L, 2L, 5L, 4L, 2L, 1L, 4L)
)


# Lookup table in long form: one row per (datayear, field, code) giving the
# label that the code stands for in that year. `field` names the code column
# of dt1 that the row applies to.
dt2 <- data.frame(
  datayear = rep(c(2007L, 2008L), each = 10L),
  field = factor(
    rep(rep(c("nationalitycode", "subjectcode"), each = 5L), times = 2L),
    levels = c("nationalitycode", "subjectcode")
  ),
  code = rep(1:5, times = 4L),
  lookupvalue = factor(
    c(
      "Irish", "Scottish", "Indian", "Chinese", "English",
      "Art", "Geography", "Maths", "Algebra", "P.E.",
      "English", "Scottish", "Chinese", "French", "Italian",
      "Sewing", "Beekeeping", "Rivetting", "H.E.", "Latin"
    ),
    levels = c(
      "Algebra", "Art", "Beekeeping", "Chinese", "English", "French",
      "Geography", "H.E.", "Indian", "Irish", "Italian", "Latin", "Maths",
      "P.E.", "Rivetting", "Scottish", "Sewing"
    )
  )
)


# Reshape the lookup table to wide form so that every field (e.g.
# "nationalitycode") becomes a column holding that field's codes. Rows that
# belong to a different field hold NA in that column.
# pivot_wider() replaces the superseded spread().
dt2_reshaped <- dt2 %>%
  pivot_wider(names_from = field, values_from = code)

# Running result: starts as the raw data and gains one value column per field.
dt_temp <- dt1

# Join once for every field present in the lookup table.
for (fld in as.character(unique(dt2$field))) {
  # Keep only the rows that really belong to this field. Without the NA
  # filter, a missing code in the data would match the NA placeholder rows of
  # the other fields (dplyr joins treat NA == NA as a match by default),
  # duplicating rows and attaching labels from the wrong field.
  # select(all_of()) replaces the deprecated select_().
  fld_lookup <- dt2_reshaped %>%
    select(all_of(c("datayear", "lookupvalue", fld))) %>%
    filter(!is.na(.data[[fld]]))

  dt_temp <- left_join(dt_temp, fld_lookup, by = c("datayear", fld))

  # Name the freshly joined column after the field,
  # e.g. nationalitycode -> nationalityvalue.
  names(dt_temp)[names(dt_temp) == "lookupvalue"] <- gsub("code", "value", fld)
}


dt_temp

#    id datayear nationalitycode subjectcode nationalityvalue subjectvalue
# 1   1     2007               1           2            Irish    Geography
# 2   2     2007               1           5            Irish         P.E.
# 3   3     2007               1           5            Irish         P.E.
# 4   4     2007               2           5         Scottish         P.E.
# 5   5     2007               3           2           Indian    Geography
# 6   6     2008               5           5          Italian        Latin
# 7   7     2008               4           4           French         H.E.
# 8   8     2008               3           2          Chinese   Beekeeping
# 9   9     2008               2           1         Scottish       Sewing
# 10 10     2008               1           4          English         H.E.

诀窍是基于查找表的适当子集进行连接。也就是说，先按正确的 field 值筛选出子集，再进行连接
library(dplyr)

# Sample records: ten ids across two data years, each carrying the integer
# nationality and subject codes that are decoded through the lookup table.
dt1 <- data.frame(
  id = 1:10,
  datayear = rep(c(2007L, 2008L), each = 5L),
  nationalitycode = c(1L, 1L, 1L, 2L, 3L, 5L, 4L, 3L, 2L, 1L),
  subjectcode = c(2L, 5L, 5L, 5L, 2L, 5L, 4L, 2L, 1L, 4L)
)


# Lookup table in long form: one row per (datayear, field, code) giving the
# label that the code stands for in that year. `field` names the code column
# of dt1 that the row applies to.
dt2 <- data.frame(
  datayear = rep(c(2007L, 2008L), each = 10L),
  field = factor(
    rep(rep(c("nationalitycode", "subjectcode"), each = 5L), times = 2L),
    levels = c("nationalitycode", "subjectcode")
  ),
  code = rep(1:5, times = 4L),
  lookupvalue = factor(
    c(
      "Irish", "Scottish", "Indian", "Chinese", "English",
      "Art", "Geography", "Maths", "Algebra", "P.E.",
      "English", "Scottish", "Chinese", "French", "Italian",
      "Sewing", "Beekeeping", "Rivetting", "H.E.", "Latin"
    ),
    levels = c(
      "Algebra", "Art", "Beekeeping", "Chinese", "English", "French",
      "Geography", "H.E.", "Indian", "Irish", "Italian", "Latin", "Maths",
      "P.E.", "Rivetting", "Scottish", "Sewing"
    )
  )
)


# Decode each code column by left-joining dt1 against the slice of the lookup
# table that belongs to that column. The lookup columns are renamed up front
# so the join keys line up by name and no .x/.y suffixes need cleaning up.
dt1 %>%
  left_join(
    dt2 %>%
      filter(field == "nationalitycode") %>%
      select(datayear, nationalitycode = code, nationalityvalue = lookupvalue),
    by = c("datayear", "nationalitycode")
  ) %>%
  left_join(
    dt2 %>%
      filter(field == "subjectcode") %>%
      select(datayear, subjectcode = code, subjectvalue = lookupvalue),
    by = c("datayear", "subjectcode")
  )

#    id datayear nationalitycode subjectcode nationalityvalue subjectvalue
# 1   1     2007               1           2            Irish    Geography
# 2   2     2007               1           5            Irish         P.E.
# 3   3     2007               1           5            Irish         P.E.
# 4   4     2007               2           5         Scottish         P.E.
# 5   5     2007               3           2           Indian    Geography
# 6   6     2008               5           5          Italian        Latin
# 7   7     2008               4           4           French         H.E.
# 8   8     2008               3           2          Chinese   Beekeeping
# 9   9     2008               2           1         Scottish       Sewing
# 10 10     2008               1           4          English         H.E.

对于使用循环要求的更一般的情况，我需要重新调整查找表的形状，以便使用列名。该过程将自动检测查找表中有多少唯一字段，并使用for循环（顺序）执行连接
library(dplyr)
library(tidyr)

# Sample records: ten ids across two data years, each carrying the integer
# nationality and subject codes that are decoded through the lookup table.
dt1 <- data.frame(
  id = 1:10,
  datayear = rep(c(2007L, 2008L), each = 5L),
  nationalitycode = c(1L, 1L, 1L, 2L, 3L, 5L, 4L, 3L, 2L, 1L),
  subjectcode = c(2L, 5L, 5L, 5L, 2L, 5L, 4L, 2L, 1L, 4L)
)


# Lookup table in long form: one row per (datayear, field, code) giving the
# label that the code stands for in that year. `field` names the code column
# of dt1 that the row applies to.
dt2 <- data.frame(
  datayear = rep(c(2007L, 2008L), each = 10L),
  field = factor(
    rep(rep(c("nationalitycode", "subjectcode"), each = 5L), times = 2L),
    levels = c("nationalitycode", "subjectcode")
  ),
  code = rep(1:5, times = 4L),
  lookupvalue = factor(
    c(
      "Irish", "Scottish", "Indian", "Chinese", "English",
      "Art", "Geography", "Maths", "Algebra", "P.E.",
      "English", "Scottish", "Chinese", "French", "Italian",
      "Sewing", "Beekeeping", "Rivetting", "H.E.", "Latin"
    ),
    levels = c(
      "Algebra", "Art", "Beekeeping", "Chinese", "English", "French",
      "Geography", "H.E.", "Indian", "Irish", "Italian", "Latin", "Maths",
      "P.E.", "Rivetting", "Scottish", "Sewing"
    )
  )
)


# Reshape the lookup table to wide form so that every field (e.g.
# "nationalitycode") becomes a column holding that field's codes. Rows that
# belong to a different field hold NA in that column.
# pivot_wider() replaces the superseded spread().
dt2_reshaped <- dt2 %>%
  pivot_wider(names_from = field, values_from = code)

# Running result: starts as the raw data and gains one value column per field.
dt_temp <- dt1

# Join once for every field present in the lookup table.
for (fld in as.character(unique(dt2$field))) {
  # Keep only the rows that really belong to this field. Without the NA
  # filter, a missing code in the data would match the NA placeholder rows of
  # the other fields (dplyr joins treat NA == NA as a match by default),
  # duplicating rows and attaching labels from the wrong field.
  # select(all_of()) replaces the deprecated select_().
  fld_lookup <- dt2_reshaped %>%
    select(all_of(c("datayear", "lookupvalue", fld))) %>%
    filter(!is.na(.data[[fld]]))

  dt_temp <- left_join(dt_temp, fld_lookup, by = c("datayear", fld))

  # Name the freshly joined column after the field,
  # e.g. nationalitycode -> nationalityvalue.
  names(dt_temp)[names(dt_temp) == "lookupvalue"] <- gsub("code", "value", fld)
}


dt_temp

#    id datayear nationalitycode subjectcode nationalityvalue subjectvalue
# 1   1     2007               1           2            Irish    Geography
# 2   2     2007               1           5            Irish         P.E.
# 3   3     2007               1           5            Irish         P.E.
# 4   4     2007               2           5         Scottish         P.E.
# 5   5     2007               3           2           Indian    Geography
# 6   6     2008               5           5          Italian        Latin
# 7   7     2008               4           4           French         H.E.
# 8   8     2008               3           2          Chinese   Beekeeping
# 9   9     2008               2           1         Scottish       Sewing
# 10 10     2008               1           4          English         H.E.

如果 X 是第一个 data.frame，LU 是第二个，那么使用 data.table 和 merge 可以让这件事变得简单明了：
library(data.table)

# Convert the data.frames into data.tables (setDT converts in place).
setDT(X)
setDT(LU)

# Join the tables on datayear and the appropriate code, for the
# nationality data only. all.x = TRUE makes this a left join: merge()
# defaults to an inner join, which would silently drop every row of X whose
# code has no entry in the lookup table.
X1 <- merge(X, LU[field == "nationalitycode"],
            by.x = c("datayear", "nationalitycode"),
            by.y = c("datayear", "code"),
            all.x = TRUE)

# Now join the resulting table by subjectcode, again keeping all rows.
# Both lookups contribute `field` and `lookupvalue`, so the clashing columns
# come out with .x (nationality) and .y (subject) suffixes.
X2 <- merge(X1, LU[field == "subjectcode"],
            by.x = c("datayear", "subjectcode"),
            by.y = c("datayear", "code"),
            all.x = TRUE)

# Now subset the data.table to the columns you want, set the key
# (order) by id, and rename some columns.
M <- X2[, c("id", "datayear", "nationalitycode", "subjectcode",
            "lookupvalue.x", "lookupvalue.y"), with = FALSE]
setkey(M, "id")
setnames(M, c("lookupvalue.x", "lookupvalue.y"),
         c("nationalityvalue", "subjectvalue"))

M
#     id datayear nationalitycode subjectcode nationalityvalue subjectvalue
#  1:  1     2007               1           2            Irish    Geography
#  2:  2     2007               1           5            Irish         P.E.
#  3:  3     2007               1           5            Irish         P.E.
#  4:  4     2007               2           5         Scottish         P.E.
#  5:  5     2007               3           2           Indian    Geography
#  6:  6     2008               5           5          Italian        Latin
#  7:  7     2008               4           4           French         H.E.
#  8:  8     2008               3           2          Chinese   Beekeeping
#  9:  9     2008               2           1         Scottish       Sewing
# 10: 10     2008               1           4          English         H.E.

library(data.table)
# 将 data.frame 转换为 data.table
setDT(X)
setDT(LU)
# 按 datayear 和相应的代码连接两张表，
# 仅针对国籍数据。
X1 ……（代码片段在此处被截断）
如果 X 是您的第一个 data.frame，LU 是您的第二个，那么使用 data.table 和 merge 可以让这件事变得简单明了：
library(data.table)

# Convert the data.frames into data.tables (setDT converts in place).
setDT(X)
setDT(LU)

# Join the tables on datayear and the appropriate code, for the
# nationality data only. all.x = TRUE makes this a left join: merge()
# defaults to an inner join, which would silently drop every row of X whose
# code has no entry in the lookup table.
X1 <- merge(X, LU[field == "nationalitycode"],
            by.x = c("datayear", "nationalitycode"),
            by.y = c("datayear", "code"),
            all.x = TRUE)

# Now join the resulting table by subjectcode, again keeping all rows.
# Both lookups contribute `field` and `lookupvalue`, so the clashing columns
# come out with .x (nationality) and .y (subject) suffixes.
X2 <- merge(X1, LU[field == "subjectcode"],
            by.x = c("datayear", "subjectcode"),
            by.y = c("datayear", "code"),
            all.x = TRUE)

# Now subset the data.table to the columns you want, set the key
# (order) by id, and rename some columns.
M <- X2[, c("id", "datayear", "nationalitycode", "subjectcode",
            "lookupvalue.x", "lookupvalue.y"), with = FALSE]
setkey(M, "id")
setnames(M, c("lookupvalue.x", "lookupvalue.y"),
         c("nationalityvalue", "subjectvalue"))

M
#     id datayear nationalitycode subjectcode nationalityvalue subjectvalue
#  1:  1     2007               1           2            Irish    Geography
#  2:  2     2007               1           5            Irish         P.E.
#  3:  3     2007               1           5            Irish         P.E.
#  4:  4     2007               2           5         Scottish         P.E.
#  5:  5     2007               3           2           Indian    Geography
#  6:  6     2008               5           5          Italian        Latin
#  7:  7     2008               4           4           French         H.E.
#  8:  8     2008               3           2          Chinese   Beekeeping
#  9:  9     2008               2           1         Scottish       Sewing
# 10: 10     2008               1           4          English         H.E.

library(data.table)
# 将 data.frame 转换为 data.table
setDT(X)
setDT(LU)
# 按 datayear 和相应的代码连接两张表，
# 仅针对国籍数据。
谢谢@AntoniosK。有没有一种方法可以使它适应我的实际数据，在许多列上循环？我应该说得更清楚：将会有几十个代码列需要处理。这就是我最初尝试使用 lapply 的原因。@peter_w 我提供的 data.table
解决方案很容易扩展到一组列。您只需要编写一个函数来包装合并步骤，再加上一些列子集选取和重命名，就可以了。谢谢@AntoniosK。有没有一种方法可以使它适应我的实际数据，在许多列上循环？我应该说得更清楚：将会有几十个代码列需要处理。这就是我最初尝试使用 lapply 的原因。@peter_w 我提供的 data.table
解决方案很容易扩展到一组列。您只需要编写一个函数来包装合并步骤，再加上一些列子集选取和重命名，就可以了。谢谢@Jason Moragn，我不熟悉 data.table 包，但会看看这个。@peter_w 我添加了一个函数，可以帮助您入门。谢谢@Jason Moragn，我不熟悉 data.table 包，但会看看它。@peter_w 我添加了一个函数，可以帮助您入门。