R:合并后创建后缀的自动连接变量

R:合并后创建后缀的自动连接变量,r,join,merge,R,Join,Merge,我正在寻找一种方法,在合并数据帧之后,将具有不同x和y后缀的相同变量名的值连接到一列中。例如:“name.x”和“name.y”将合并到一个名为“name”的列中,其中“name.x”值将取代“name.y”值,但“name.x”没有值的情况除外,在这种情况下将使用“name.y”。我希望对dataframe中带有后缀的所有列执行此操作 下面是我正在寻找的一个例子: df <- data.frame(ID=c(1,2,3,4,5), variable1.x=c('5.0',"",'7.9'

我正在寻找一种方法,在合并数据帧之后,将具有不同x和y后缀的相同变量名的值连接到一列中。例如:“name.x”和“name.y”将合并到一个名为“name”的列中,其中“name.x”值将取代“name.y”值,但“name.x”没有值的情况除外,在这种情况下将使用“name.y”。我希望对dataframe中带有后缀的所有列执行此操作

下面是我正在寻找的一个例子:

# Toy input: one ID column plus an '.x' / '.y' pair holding the same variable.
# Note that the fourth '.x' entry is the literal string "NA", not a missing
# value, and that empty strings stand in for absent entries.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1.x = c("5.0", "", "7.9", "NA", "12"),
  variable1.y = c("1.5", "3.0", "", "8.9", "3.9")
)

ID variable1.x variable1.y
1  5.0         1.5
2              3.0
3  7.9
4  NA          8.9
5  12          3.9
我的数据示例如下:

# dput() representation of a data frame with 10 rows and 25 columns.
# The first five columns (cikcode ... financialsdate) carry no suffix.
# They are followed by ten '.x' columns and the ten matching '.y' columns.
# The '.x' columns are filled on odd rows and NA on even rows; the '.y'
# columns are the reverse. Most '.x' columns are factors, while the '.y'
# columns are plain character or numeric vectors.
# The expression is not assigned to a name here.
structure(list(cikcode = c(20, 20, 20, 20, 20, 20, 20, 20, 20, 
20), yearendeddate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L, 
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28", 
"2004-01-03", "2005-01-01"), class = "factor"), source = structure(c(1L, 
3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L), .Label = c("10-K", "10-K405", 
"DEF 14A"), class = "factor"), sourcedate = structure(c(5L, 6L, 
1L, 2L, 3L, 4L, 7L, 8L, 9L, 10L), .Label = c("2001-03-26", "2001-03-28", 
"2002-03-20", "2002-03-25", "2003-03-27", "2003-03-31", "2004-04-01", 
"2004-04-06", "2005-03-31", "2005-04-04"), class = "factor"), 
financialsdate = structure(c(3L, 3L, 1L, 1L, 2L, 2L, 4L, 
4L, 5L, 5L), .Label = c("2000-12-30", "2001-12-29", "2002-12-28", 
"2004-01-03", "2005-01-01"), class = "factor"), ticker.x =        structure(c(1L, 
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "", class = "factor"), 
statecode.x = structure(c(1L, NA, 1L, NA, 1L, NA, 1L, NA, 
1L, NA), .Label = "NJ", class = "factor"), statename.x = structure(c(1L, 
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA), .Label = "NEW JERSEY", class =   "factor"), 
siccode.x = c(3823, NA, 3823, NA, 3823, NA, 3823, NA, 3823, 
NA), naicscode.x = c(334513, NA, 334513, NA, 334513, NA, 
334513, NA, 334513, NA), auditor.x = structure(c(3L, NA, 
1L, NA, 1L, NA, 2L, NA, 2L, NA), .Label = c("Arthur Andersen LLP", 
"Grant Thornton LLP", "KPMG LLP"), class = "factor"), auditfees.x =   structure(c(5L, 
NA, 3L, NA, 1L, NA, 4L, NA, 2L, NA), .Label = c("185,000", 
"225,000", "200,000", "137,100", "123,700"), class = "factor"), 
revenue.x = structure(c(3L, NA, 1L, NA, 4L, NA, 5L, NA, 2L, 
NA), .Label = c("84,912,000", "112,494,000", "68,231,000", 
"71,819,000", "94,676,000"), class = "factor"), earnings.x = structure(c(3L, 
NA, 4L, NA, 2L, NA, 1L, NA, 1L, NA), .Label = c("", "1,048,000", 
"3,284,000", "5,838,000"), class = "factor"), assets.x = structure(c(2L, 
NA, 3L, NA, 1L, NA, 4L, NA, 5L, NA), .Label = c("47,644,000", 
"50,459,000", "54,421,000", "83,081,000", "93,016,000"), class = "factor"), 
ticker.y = c(NA, "", NA, "", NA, "", NA, "", NA, ""), statecode.y = c(NA, 
"NJ", NA, "NJ", NA, "NJ", NA, "NJ", NA, "NJ"), statename.y = c(NA, 
"NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY", NA, "NEW JERSEY", 
NA, "NEW JERSEY"), siccode.y = c(NA, 3823, NA, 3823, NA, 
3823, NA, 3823, NA, 3823), naicscode.y = c(NA, "334513", 
NA, "334513", NA, "334513", NA, "334513", NA, "334513"), 
auditor.y = c(NA, "KPMG LLP", NA, "Arthur Andersen LLP", 
NA, "Arthur Andersen LLP", NA, "Grant Thornton LLP", NA, 
"Grant Thornton LLP"), auditfees.y = c(NA, "123,700", NA, 
"200,000", NA, "185,000", NA, "137,100", NA, "225,000"), 
revenue.y = c(NA, "68,231,000", NA, "84,912,000", NA, "71,819,000", 
NA, "94,676,000", NA, "112,494,000"), earnings.y = c(NA, 
"3,284,000", NA, "5,838,000", NA, "1,048,000", NA, "", NA, 
""), assets.y = c(NA, "50,459,000", NA, "54,421,000", NA, 
"47,644,000", NA, "83,081,000", NA, "93,016,000")), .Names = c("cikcode", 
"yearendeddate", "source", "sourcedate", "financialsdate", "ticker.x", 
"statecode.x", "statename.x", "siccode.x", "naicscode.x", "auditor.x", 
"auditfees.x", "revenue.x", "earnings.x", "assets.x", "ticker.y", 
"statecode.y", "statename.y", "siccode.y", "naicscode.y", "auditor.y", 
"auditfees.y", "revenue.y", "earnings.y", "assets.y"), row.names = c(NA, 
10L), class = "data.frame")

根据所示示例,我认为您需要 pmax。如果只要“variable1.x”的值不是空字符串 '' 或 NA 就保留它,可以使用 ifelse。在本例中,NA 并不是真正的 NA(它是带引号的字符串 'NA');NA 值不需要加引号。如果它是真正的 NA,我们可以使用 is.na(df[,2])。

如果有多个列,例如“df2”(这里我使用的是真正的 NA),我们先用 sub 去掉后缀部分(即“.x”、“.y”),再按得到的名称拆分“variable”列。使用 lapply 循环列表元素,并像上面那样使用 ifelse 获得结果。另外请注意,我在这里使用 stringsAsFactors=FALSE 来创建“character”列。

更新:基于OP帖子中的dput输出(记为“df2”)

 # Collapse every '<name>.x' / '<name>.y' column pair of 'df2' into a single
 # '<name>' column: the '.x' value is kept unless it is NA or an empty
 # string, in which case the '.y' value is used. The result is 'df2N'.

 # names of the columns that end in '.x' or '.y'
 v1 <- grep('\\.(x|y)$', names(df2), value=TRUE)
 # start from the columns that carry no suffix
 df2N <- df2[setdiff(names(df2), v1)]

 # turn the factor columns among 'v1' into character so the comparison and
 # the merged result use the labels; vapply() always yields a logical vector
 indx <- vapply(df2[v1], is.factor, logical(1))
 df2[v1][indx] <- lapply(df2[v1][indx], as.character)

 # strip only the trailing '.x' / '.y', so a base name that itself
 # contains a dot (e.g. 'a.b.x') is not truncated
 nm1 <- sub('\\.(x|y)$', '', v1)
 # sorted unique base names, used to name the new columns
 nm2 <- sort(unique(nm1))
 # look the two members of each pair up by name instead of by position,
 # so the result does not depend on the column order of 'df2'
 df2N[nm2] <- lapply(nm2, function(nm) {
   x <- df2[[paste0(nm, '.x')]]
   y <- df2[[paste0(nm, '.y')]]
   # a suffixed column without a partner: keep the one that exists
   if (is.null(x)) return(y)
   if (is.null(y)) return(x)
   ifelse(is.na(x) | x == '', y, x)
 })
 head(df2N,3)
 #  cikcode yearendeddate  source sourcedate financialsdate     assets auditfees
 #1      20    2002-12-28    10-K 2003-03-27     2002-12-28 50,459,000   123,700
 #2      20    2002-12-28 DEF 14A 2003-03-31     2002-12-28 50,459,000   123,700
 #3      20    2000-12-30    10-K 2001-03-26     2000-12-30 54,421,000   200,000
 #              auditor  earnings naicscode    revenue siccode statecode
 #1            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #2            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #3 Arthur Andersen LLP 5,838,000    334513 84,912,000    3823        NJ
 #   statename ticker
 #1 NEW JERSEY   <NA>
 #2 NEW JERSEY       
 #3 NEW JERSEY   <NA>
数据
处理此问题的另一种方法是在合并时处理它

我的包 safejoin 提供了可以处理这类冲突的函数

# Resolve the name clash while joining instead of cleaning up afterwards.
# The safejoin package is installed from GitHub:
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)

# Two inputs that share the key 'ID' and the conflicting column 'variable1'.
df1 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1 = c("5.0", "", "7.9", "NA", "12")
)
df2 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1 = c("1.5", "3.0", "", "8.9", "3.9")
)

# The 'conflict' formula converts both conflicting columns to numeric and
# takes the first non-missing value of the pair via dplyr::coalesce().
# NOTE(review): '.x' / '.y' presumably refer to the df1 / df2 versions of
# the column -- confirm against the safejoin documentation.
safe_left_join(
  df1, df2,
  by = "ID",
  conflict = ~ dplyr::coalesce(
    as.numeric(as.character(.x)),
    as.numeric(as.character(.y))
  )
)
#   ID variable1
# 1  1       5.0
# 2  2       3.0
# 3  3       7.9
# 4  4       8.9
# 5  5      12.0

嘿,akrun,一个例子刚刚添加了你现在在线的书。更新的任何反馈?我一直在尝试将其中3个变量转换为字符、因子或数字,但没有任何效果。在转换了前面的6个变量之后,R对我来说只是冻结了。这太令人沮丧了。好吧,我又运行了一次。它仍然会给我相同的错误,未定义的列被选中,NAs被强制引入使用您的dput输出,但我没有收到任何错误。谢谢,但我正在尝试在一个数据帧中合并多个列相同的名称,只是不同的后缀,而不是在多个数据帧中。我不确定这是如何正确应用的。我还需要在不仅仅是ID的基础上进行合并。现在我有了单独的行,我只希望合并每行的值,忽略没有后缀的任何其他变量,即。无需在此处使用ID,因为ID不一定对每行唯一,这会影响结果吗?将我的变量名输入到您的代码中我运行了以下nm1我意识到这需要先测试,然后才能测试其他变量
# Merge each '<name>.x' / '<name>.y' pair of 'df1' (every column except the
# first, which is the ID) into one numeric '<name>' column of 'df2'. The
# first column of a pair wins unless it is NA or '', then the second is used.

# strip only the trailing '.x' / '.y' so dotted base names stay intact
nm1 <- sub('\\.(x|y)$', '', names(df1)[-1])
df2 <- data.frame(ID= df1$ID)
# split() returns its groups in sorted order of the group names, so the
# target column names must be sorted too; plain unique(nm1) follows the
# order of appearance and could attach a result to the wrong name
df2[sort(unique(nm1))] <-  lapply(split(names(df1)[-1],nm1), function(x) {
           x1 <- df1[x]
          as.numeric(ifelse(is.na(x1[,1])|x1[,1]=='', x1[,2], x1[,1]))})
df2
#   ID variable1 variable2
#1  1       5.0       4.2
#2  2       3.0       3.5
#3  3       7.9       3.2
#4  4       8.9       1.2
#5  5      12.0       4.0
  # Merge each column pair of 'df1' with matrix indexing: for every row pick
  # column 1 unless it is NA or '', otherwise column 2, then convert the
  # picked values to numeric.
  # sort(unique(nm1)) matches the sorted group order that split() produces,
  # and seq_len() yields an empty row index when 'x1' has zero rows
  # (1:nrow(x1) would give c(1, 0) instead).
  df2[sort(unique(nm1))] <-  lapply(split(names(df1)[-1], nm1), function(x) {
      x1 <- df1[x]
      # 1L selects the first column, 2L the second
      pick <- (is.na(x1[,1])|x1[,1]=='')+1L
      as.numeric(x1[cbind(seq_len(nrow(x1)), pick)])})
 # Collapse every '<name>.x' / '<name>.y' column pair of 'df2' into a single
 # '<name>' column: the '.x' value is kept unless it is NA or an empty
 # string, in which case the '.y' value is used. The result is 'df2N'.

 # names of the columns that end in '.x' or '.y'
 v1 <- grep('\\.(x|y)$', names(df2), value=TRUE)
 # start from the columns that carry no suffix
 df2N <- df2[setdiff(names(df2), v1)]

 # turn the factor columns among 'v1' into character so the comparison and
 # the merged result use the labels; vapply() always yields a logical vector
 indx <- vapply(df2[v1], is.factor, logical(1))
 df2[v1][indx] <- lapply(df2[v1][indx], as.character)

 # strip only the trailing '.x' / '.y', so a base name that itself
 # contains a dot (e.g. 'a.b.x') is not truncated
 nm1 <- sub('\\.(x|y)$', '', v1)
 # sorted unique base names, used to name the new columns
 nm2 <- sort(unique(nm1))
 # look the two members of each pair up by name instead of by position,
 # so the result does not depend on the column order of 'df2'
 df2N[nm2] <- lapply(nm2, function(nm) {
   x <- df2[[paste0(nm, '.x')]]
   y <- df2[[paste0(nm, '.y')]]
   # a suffixed column without a partner: keep the one that exists
   if (is.null(x)) return(y)
   if (is.null(y)) return(x)
   ifelse(is.na(x) | x == '', y, x)
 })
 head(df2N,3)
 #  cikcode yearendeddate  source sourcedate financialsdate     assets auditfees
 #1      20    2002-12-28    10-K 2003-03-27     2002-12-28 50,459,000   123,700
 #2      20    2002-12-28 DEF 14A 2003-03-31     2002-12-28 50,459,000   123,700
 #3      20    2000-12-30    10-K 2001-03-26     2000-12-30 54,421,000   200,000
 #              auditor  earnings naicscode    revenue siccode statecode
 #1            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #2            KPMG LLP 3,284,000    334513 68,231,000    3823        NJ
 #3 Arthur Andersen LLP 5,838,000    334513 84,912,000    3823        NJ
 #   statename ticker
 #1 NEW JERSEY   <NA>
 #2 NEW JERSEY       
 #3 NEW JERSEY   <NA>
# Example data with two '.x' / '.y' pairs. Mixing numbers with '' inside c()
# coerces the first three value columns to character, so they are written as
# strings here; 'variable2.y' has no '' and therefore stays numeric.
set.seed(24)
df1 <- data.frame(
  ID = 1:5,
  variable1.x = c("5", "", "7.9", NA, "12"),
  variable1.y = c("1.5", "3", "", "8.9", "3.9"),
  variable2.x = c("4.2", "3.5", "", NA, "4"),
  variable2.y = c(1.2, 1.5, 3.2, 1.2, NA),
  stringsAsFactors = FALSE
)
# Join-time conflict handling with the safejoin package (GitHub install):
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)

# Both tables share the key 'ID' and a clashing 'variable1' column.
df1 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1 = c("5.0", "", "7.9", "NA", "12")
)
df2 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  variable1 = c("1.5", "3.0", "", "8.9", "3.9")
)

# First call: convert both clashing columns to numeric, then keep the first
# non-missing value of each pair.
safe_left_join(
  df1, df2,
  by = "ID",
  conflict = ~ dplyr::coalesce(
    as.numeric(as.character(.x)),
    as.numeric(as.character(.y))
  )
)
#   ID variable1
# 1  1       5.0
# 2  2       3.0
# 3  3       7.9
# 4  4       8.9
# 5  5      12.0

# Second call: hand dplyr::coalesce itself over as the conflict resolver,
# without the numeric conversion.
safe_left_join(
  df1, df2,
  by = "ID",
  conflict = dplyr::coalesce
)