Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/72.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
使用grep通过多列连接数据帧_R_Join_Dplyr_Grepl - Fatal编程技术网

使用grep通过多列连接数据帧

使用grep通过多列连接数据帧,r,join,dplyr,grepl,R,Join,Dplyr,Grepl,我想基于两列对两个数据帧进行完全联接,其中一列包含在另一列中找到的字符串。下面是我的两个数据帧: date<-as.Date(c('2010-11-1','2008-3-25','2007-3-14')) site<-c("abcejams.com", "reitimes.com", "posehbc") desc1<-c("alpha", "beta", "gamma" df1<-data.frame(date, site, desc1) df1 da

我想基于两列对两个数据帧进行完全联接,其中一列包含在另一列中找到的字符串。下面是我的两个数据帧:

# First data frame: one row per (date, full site name, description).
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
# The closing parenthesis was missing here, which made the snippet unparsable.
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date, site, desc1)
df1

        date         site    desc1
1 2010-11-01 abcejams.com    alpha
2 2008-03-25 reitimes.com     beta
3 2007-03-14      posehbc    gamma

# Second data frame: site2 holds short strings that appear inside df1$site;
# metric2 and metric3 are the values to bring across in the join.
date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2018-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
df2

       date2 site2 metric2 metric3
1 2010-11-01  jams       1      10
2 2008-03-25  time       2      20
3 2007-03-14  pose       3      30
4 2018-02-09  abce       4      40

任何人都有这方面的经验吗?

您可以使用 `fuzzyjoin` 包中的 `regex_full_join` 函数。我认为它目前不在 CRAN 上,所以请查看它的安装说明。

library(fuzzyjoin)

# Left table. Strings are kept as character (not factor) columns.
date <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
df1 <- data.frame(date = date, site = site, stringsAsFactors = FALSE)

# Right table. site2 holds the short strings searched for inside df1$site.
date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2018-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3,
  stringsAsFactors = FALSE
)

# Full join on both column pairs; rows of df2 without a partner in df1 are
# kept and padded with NA.
regex_full_join(df1, df2, by = c(site = "site2", date = "date2"))


            date         site      date2 site2 metric2 metric3
1 2010-11-01 abcejams.com 2010-11-01  jams       1      10
2 2008-03-25 reitimes.com 2008-03-25  time       2      20
3 2007-03-14      posehbc 2007-03-14  pose       3      30
4       <NA>         <NA> 2018-02-09  abce       4      40
库(fuzzyjoin)
日期
另一个例子

# New data: df1 now has two sites on 2007-03-14 ("posehbc" and "poseur"),
# and the last date in df2 is 2007-02-09.
date <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2007-03-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc", "poseur")
desc1 <- c("alpha", "beta", "gamma", "epsilon")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2007-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)

custom_join(df1, df2)

       # date2 site2 metric2 metric3         site   desc1
# 1 2010-11-01  jams       1      10 abcejams.com   alpha
# 2 2008-03-25  time       2      20 reitimes.com    beta
# 3 2007-03-14  pose       3      30      posehbc   gamma
# 4 2007-03-14  pose       3      30       poseur epsilon
# 5 2007-02-09  abce       4      40         <NA>    <NA>
#新数据

谢谢,我收到了一个错误,提示:
error:只能修改普通字符向量。
我猜您没有遇到同样的错误吧?请注意创建数据帧时新增的
stringsAsFactors=FALSE
参数。
        date         site     desc1     metric2    metric3
1 2010-11-01 abcejams.com     alpha           1         10 
2 2008-03-25 reitimes.com      beta           2         20
3 2007-03-14      posehbc     gamma           3         30
4 2018-02-09         abce        NA           4         40
library(fuzzyjoin)

# Left table. Strings are kept as character (not factor) columns.
date <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
df1 <- data.frame(date = date, site = site, stringsAsFactors = FALSE)

# Right table. site2 holds the short strings searched for inside df1$site.
date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2018-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3,
  stringsAsFactors = FALSE
)

# Full join on both column pairs; rows of df2 without a partner in df1 are
# kept and padded with NA.
regex_full_join(df1, df2, by = c(site = "site2", date = "date2"))


            date         site      date2 site2 metric2 metric3
1 2010-11-01 abcejams.com 2010-11-01  jams       1      10
2 2008-03-25 reitimes.com 2008-03-25  time       2      20
3 2007-03-14      posehbc 2007-03-14  pose       3      30
4       <NA>         <NA> 2018-02-09  abce       4      40
# Original data from the question.
date <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2018-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
library(tidyverse)
library(stringr)

#' Collect, for every row of a match matrix, the column indices that matched.
#'
#' @param bicond Matrix in which cell `[i, j]` equals 1 when row `i` matches
#'   column `j`.
#' @return A list with exactly one element per row of `bicond`: the integer
#'   indices of the matching columns, or a single `NA` when the row has none.
make_groups <- function(bicond) {
  # Walk the row indices instead of using apply(): apply() simplifies its
  # result to a vector or matrix whenever every row yields the same number of
  # matches, so the returned list could end up with a length other than
  # nrow(bicond) (e.g. two matches in every row, or no match anywhere).
  lapply(seq_len(nrow(bicond)), function(r) {
    hits <- which(bicond[r, ] == 1)
    if (length(hits) == 0L) NA_integer_ else hits
  })
}

#' Full join of `df2` onto `df1` on equal dates and site pattern matches.
#'
#' A row of `df2` pairs with a row of `df1` when `date2` equals `date` and
#' `site2`, used as a stringr pattern, is detected in `site`.
#'
#' @param df1 Data frame with columns `date` and `site`.
#' @param df2 Data frame with columns `date2` and `site2`.
#' @return The full join of the two tables, keyed on `date2`, with the
#'   helper index column removed.
custom_join <- function(df1, df2) {
  # Both matrices have one row per df2 row and one column per df1 row.
  same_date <- outer(df2$date2, df1$date, "==")
  site_hit <- outer(
    as.character(df2$site2),
    as.character(df1$site),
    function(pattern, string) str_detect(string, pattern)
  )
  # Product of the two logical matrices is 1 only where both conditions hold.
  both <- same_date * site_hit

  # G is the df1 row index; df2 gets one row per matched df1 index.
  left <- mutate(df1, G = row_number())
  right <- unnest(mutate(df2, G = make_groups(both)), G)

  joined <- full_join(right, left, by = c("G" = "G", "date2" = "date"))
  select(joined, -G)
}

custom_join(df1, df2)

       # date2 site2 metric2 metric3         site desc1
# 1 2010-11-01  jams       1      10 abcejams.com alpha
# 2 2008-03-25  time       2      20 reitimes.com  beta
# 3 2007-03-14  pose       3      30      posehbc gamma
# 4 2018-02-09  abce       4      40         <NA>  <NA>
# New data: df1 now has two sites on 2007-03-14 ("posehbc" and "poseur"),
# and the last date in df2 is 2007-02-09.
date <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2007-03-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc", "poseur")
desc1 <- c("alpha", "beta", "gamma", "epsilon")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-01", "2008-03-25", "2007-03-14", "2007-02-09"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)

custom_join(df1, df2)

       # date2 site2 metric2 metric3         site   desc1
# 1 2010-11-01  jams       1      10 abcejams.com   alpha
# 2 2008-03-25  time       2      20 reitimes.com    beta
# 3 2007-03-14  pose       3      30      posehbc   gamma
# 4 2007-03-14  pose       3      30       poseur epsilon
# 5 2007-02-09  abce       4      40         <NA>    <NA>