R中嵌套For循环的替代方法

R中嵌套For循环的替代方法,r,for-loop,dplyr,R,For Loop,Dplyr,我有两个数据集: 竞争对手数据-包含给定产品的竞争对手以及收集竞争对手价格的价格和日期 产品价格-每次价格变动的日期 competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'), crawl_date=c("2014-0

我有两个数据集: 竞争对手数据-包含给定产品的竞争对手以及收集竞争对手价格的价格和日期

产品价格-每次价格变动的日期

# Competitor observations: one row per product and crawl date, holding the
# competitor's name and the price recorded on that crawl. The crawl dates are
# converted to Date while the frame is built.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  crawl_date = as.Date(c(
    "2014-04-05", "2014-04-22", "2014-05-05",
    "2014-05-22", "2014-06-05", "2014-06-22",
    "2014-05-08", "2014-06-17", "2014-06-09",
    "2014-06-14", "2014-07-01", "2014-08-04"
  )),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)

competitor_data

下面的解决方案使用 dplyr 连接进行匹配。(注意:我将“crawl_date”更改为“date”,这样 dplyr join 将自动选择匹配的列。也可以保留原来的列名,并将

by = c('productId' = 'productId', 'date' = 'crawl_date')

作为联接的 by 参数传入,以显式指定匹配的列。)

# Competitor observations. The crawl date is stored in a column named `date`
# so that it shares a name with the join key in `product_price`.
competitor_data <- data.frame(
  productId = c("banana", "banana", "banana", "banana", "banana", "banana",
                "fig", "fig", "fig", "fig", "fig", "fig"),
  date = as.Date(c("2014-04-05", "2014-04-22", "2014-05-05",
                   "2014-05-22", "2014-06-05", "2014-06-22",
                   "2014-05-08", "2014-06-17", "2014-06-09",
                   "2014-06-14", "2014-07-01", "2014-08-04")),
  # "gamespot" previously carried a pasted markdown artifact
  # ("**strong text**") inside the string literal.
  competitor = c("amazon", "apple", "google", "facebook", "alibaba",
                 "tencent", "ebay", "bestbuy", "gamespot", "louis vuitton",
                 "gucci", "tesla"),
  competitor_price = c(2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
                       5.32, 5.56, 5.01, 6.01, 5.86, 5.96),
  stringsAsFactors = FALSE
)

# Our own price history: one row per product and price-change date.
product_price <- data.frame(
  productId = c("banana", "banana", "banana", "banana", "banana", "banana",
                "fig", "fig", "fig", "fig", "fig", "fig"),
  date = as.Date(c("2014-05-05", "2014-06-22", "2014-07-05",
                   "2014-08-31", "2014-05-03", "2014-02-22",
                   "2014-05-21", "2014-06-19", "2014-03-09",
                   "2014-06-22", "2014-07-03", "2014-09-08")),
  price = c(2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
            5.22, 5.36, 5.21, 5.91, 5.36, 5.56),
  stringsAsFactors = FALSE
)

# library() fails loudly if dplyr is missing; require() would only return
# FALSE and let the script fail later at the pipe.
library(dplyr)

# Exact-date match only: price changes with no competitor crawl on the same
# day keep NA in the competitor columns, and therefore NA in `leader`.
# The keys are spelled out; they are the same two columns the implicit
# natural join picked.
joined <- product_price %>%
  left_join(competitor_data, by = c("productId", "date"))

# 1 when our price is at or below the competitor's price, 0 when it is above.
joined$leader <- as.integer(joined$price <= joined$competitor_price)

joined

competitor_data

下面的解决方案使用 dplyr join 进行匹配。(注意:我将“crawl_date”更改为“date”,以便 dplyr join 自动选择匹配的列。也可以保留原来的列名,并将

by = c('productId' = 'productId', 'date' = 'crawl_date')

作为联接的 by 参数传入,以显式指定匹配的列。)

# Competitor observations. The crawl date is stored in a column named `date`
# so that it shares a name with the join key in `product_price`.
competitor_data <- data.frame(
  productId = c("banana", "banana", "banana", "banana", "banana", "banana",
                "fig", "fig", "fig", "fig", "fig", "fig"),
  date = as.Date(c("2014-04-05", "2014-04-22", "2014-05-05",
                   "2014-05-22", "2014-06-05", "2014-06-22",
                   "2014-05-08", "2014-06-17", "2014-06-09",
                   "2014-06-14", "2014-07-01", "2014-08-04")),
  # "gamespot" previously carried a pasted markdown artifact
  # ("**strong text**") inside the string literal.
  competitor = c("amazon", "apple", "google", "facebook", "alibaba",
                 "tencent", "ebay", "bestbuy", "gamespot", "louis vuitton",
                 "gucci", "tesla"),
  competitor_price = c(2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
                       5.32, 5.56, 5.01, 6.01, 5.86, 5.96),
  stringsAsFactors = FALSE
)

# Our own price history: one row per product and price-change date.
product_price <- data.frame(
  productId = c("banana", "banana", "banana", "banana", "banana", "banana",
                "fig", "fig", "fig", "fig", "fig", "fig"),
  date = as.Date(c("2014-05-05", "2014-06-22", "2014-07-05",
                   "2014-08-31", "2014-05-03", "2014-02-22",
                   "2014-05-21", "2014-06-19", "2014-03-09",
                   "2014-06-22", "2014-07-03", "2014-09-08")),
  price = c(2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
            5.22, 5.36, 5.21, 5.91, 5.36, 5.56),
  stringsAsFactors = FALSE
)

# library() fails loudly if dplyr is missing; require() would only return
# FALSE and let the script fail later at the pipe.
library(dplyr)

# Exact-date match only: price changes with no competitor crawl on the same
# day keep NA in the competitor columns, and therefore NA in `leader`.
# The keys are spelled out; they are the same two columns the implicit
# natural join picked.
joined <- product_price %>%
  left_join(competitor_data, by = c("productId", "date"))

# 1 when our price is at or below the competitor's price, 0 when it is above.
joined$leader <- as.integer(joined$price <= joined$competitor_price)

joined

competitor_data
competitor_data
competitor_data

为什么不按产品 id 和日期合并这两个数据集,然后比较这两个价格列?因为爬网日期不一定映射到日期。请查看我的代码中的 if 语句。因此,您是在下一个最近的日期选择价格,因此在合并后,使用“末次观测值结转”(last observation carried forward)函数来填写 NA。这仍然使用 for 循环,对吗?您可以发布您的解决方案吗?为什么不按产品 id 和日期合并这两个数据集,然后比较两个价格列?因为爬网日期不一定映射到日期。请查看我的代码中的 if 语句。因此,您将在下一个最近的日期选择价格,所以在合并之后使用“末次观测值结转”函数来填写 NA。这仍然使用 for 循环,对吗?你能发布你的解决方案吗?缺少的是我的 if 语句。日期和爬网日期不一定相同。对于给定的日期,我们采用最接近的爬网日期(日期之前的爬网日期)。请查看我的 if 语句。我输入了逻辑。缺少的是我的 if 语句。日期和爬网日期不一定相同。对于给定的日期,我们使用最接近的爬网日期(日期之前的爬网日期)。请查看我的 if 语句。我输入了逻辑。
   productId       date price competitor competitor_price leader
1     banana 2014-05-05  2.12     google             1.99      0
2     banana 2014-06-22  2.31    tencent             2.52      1
3     banana 2014-07-05  2.29       <NA>               NA     NA
4     banana 2014-08-31  2.01       <NA>               NA     NA
5     banana 2014-05-03  2.04       <NA>               NA     NA
6     banana 2014-02-22  2.09       <NA>               NA     NA
7        fig 2014-05-21  5.22       <NA>               NA     NA
8        fig 2014-06-19  5.36       <NA>               NA     NA
9        fig 2014-03-09  5.21       <NA>               NA     NA
10       fig 2014-06-22  5.91       <NA>               NA     NA
11       fig 2014-07-03  5.36       <NA>               NA     NA
12       fig 2014-09-08  5.56       <NA>               NA     NA
> 
# Competitor observations: one row per product and crawl date, with the
# competitor's name and the price recorded on that crawl.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  crawl_date = as.Date(c(
    "2014-04-05", "2014-04-22", "2014-05-05",
    "2014-05-22", "2014-06-05", "2014-06-22",
    "2014-05-08", "2014-06-17", "2014-06-09",
    "2014-06-14", "2014-07-01", "2014-08-04"
  )),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)

# Our own price history: one row per product and price-change date.
product_price <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = as.Date(c(
    "2014-05-05", "2014-06-22", "2014-07-05",
    "2014-08-31", "2014-05-03", "2014-02-22",
    "2014-05-21", "2014-06-19", "2014-03-09",
    "2014-06-22", "2014-07-03", "2014-09-08"
  )),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)
## fill in NAs
#' Fill missing values from neighbouring observations.
#'
#' The values in `...` are prefixed with `lead`, filled forward (last
#' observation carried forward), and any NAs still left at the start are
#' filled backward from the first observation. The trailing `length(lead)`
#' elements are then dropped, so the result has the same length as the input
#' and is shifted by `length(lead)` positions: with the default `lead = NA`
#' each position receives the last non-missing value strictly before it.
#'
#' @param ... Values to fill (one vector, or several values that are
#'   concatenated).
#' @param lead Padding prepended before filling. Use `lead = NULL` to fill in
#'   place without any shift.
#' @return A vector with the same length as the concatenated `...`. It holds
#'   only NAs when the input has no non-missing value.
f <- function(..., lead = NA) {
  # f(NA, 1, NA, 2, NA, NA, lead = NULL)
  x <- c(lead, c(...))
  # na.rm = FALSE on both passes keeps the length fixed. With the default
  # na.rm = TRUE, an all-NA input collapsed to a zero-length vector and broke
  # per-group assignment.
  filled <- zoo::na.locf(
    zoo::na.locf(x, na.rm = FALSE),
    fromLast = TRUE,
    na.rm = FALSE
  )
  head(filled, if (is.null(lead)) length(x) else -length(lead))
}
# Outer-join our price changes with the competitor crawls on product and
# date. Dates present in only one table become rows with NA in the other
# table's columns. merge() sorts the result on the key columns by default,
# which gives f() each product's rows in date order.
dd <- merge(product_price, competitor_data,
            by.x = c("productId", "date"),
            by.y = c("productId", "crawl_date"), all = TRUE)

# Fill the competitor price within each product. ave() writes each group's
# result back to that group's own rows and always returns a plain vector.
# unlist(sapply(split(...), f)) returned a matrix whenever every product had
# the same number of rows, and the column assignment then failed.
dd$competitor_price <- ave(dd$competitor_price, dd$productId, FUN = f)

# 1 when our price is at or below the filled competitor price, 0 when above.
# Rows that exist only because of a competitor crawl have no `price`, so
# they are NA here.
dd$price_leader <- as.integer(dd$price <= dd$competitor_price)

# Keep only our own price-change rows and drop the competitor name by column
# name rather than by position.
res1 <- dd[!is.na(dd$price_leader), names(dd) != "competitor"]
rownames(res1) <- NULL
res1

#    productId       date price competitor_price price_leader
# 1     banana 2014-02-22  2.09             2.50            1
# 2     banana 2014-05-03  2.04             2.35            1
# 3     banana 2014-05-05  2.12             2.35            1
# 4     banana 2014-06-22  2.31             2.22            0
# 5     banana 2014-07-05  2.29             2.52            1
# 6     banana 2014-08-31  2.01             2.52            1
# 7        fig 2014-03-09  5.21             5.32            1
# 8        fig 2014-05-21  5.22             5.32            1
# 9        fig 2014-06-19  5.36             5.56            1
# 10       fig 2014-06-22  5.91             5.56            0
# 11       fig 2014-07-03  5.36             5.86            1
# 12       fig 2014-09-08  5.56             5.96            1

# NOTE(review): `all_competitive_data` is not defined in this excerpt;
# presumably it is the result of the loop-based approach being replaced.
# Confirm before relying on this comparison.
# Sort it by product and date and reset the row names before comparing it
# with res1.
res0 <- all_competitive_data[
  with(all_competitive_data, order(productId, date)), ]
rownames(res0) <- NULL

all.equal(res0, res1)
# [1] TRUE
library('dplyr')
# Outer-join our price changes with the competitor crawls on product and
# date, then order the rows by product and date before f() is applied per
# product.
dd <- product_price %>%
  full_join(
    competitor_data,
    by = c("productId" = "productId", "date" = "crawl_date")
  ) %>%
  arrange(productId, date)

# Per product: fill the competitor price with f(), flag the rows where our
# price is at or below it, keep only the rows that have a flag, and drop the
# competitor name.
dd %>%
  group_by(productId) %>%
  mutate(
    competitor_price = f(competitor_price),
    price_leader = as.integer(price <= competitor_price)
  ) %>%
  filter(!is.na(price_leader)) %>%
  select(-competitor)

# Source: local data frame [12 x 5]
# Groups: productId [2]
# 
#      productId       date price competitor_price price_leader
#          <chr>     <date> <dbl>            <dbl>        <int>
#   1     banana 2014-02-22  2.09             2.50            1
#   2     banana 2014-05-03  2.04             2.35            1
#   3     banana 2014-05-05  2.12             2.35            1
#   4     banana 2014-06-22  2.31             2.22            0
#   5     banana 2014-07-05  2.29             2.52            1
#   6     banana 2014-08-31  2.01             2.52            1
#   7        fig 2014-03-09  5.21             5.32            1
#   8        fig 2014-05-21  5.22             5.32            1
#   9        fig 2014-06-19  5.36             5.56            1
#   10       fig 2014-06-22  5.91             5.56            0
#   11       fig 2014-07-03  5.36             5.86            1
#   12       fig 2014-09-08  5.56             5.96            1