R 对所有行执行操作,并将结果添加回主数据框
我有一个相当大的数据集(15000行),由于数据结构的原因,我需要对每一行进行计算。我的数据集中有一列需要进一步拆分。以下是一个例子:R 对所有行执行操作,并将结果添加回主数据框,r,loops,for-loop,R,Loops,For Loop,我有一个相当大的数据集(15000行),由于数据结构的原因,我需要对每一行进行计算。我的数据集中有一列需要进一步拆分。以下是一个例子: date <- c("2015-07-10", "2013-05-06", "2017-08-10") Number <- c(345, 231, 10) Route <- c("GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:5
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c("GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67", "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67", "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67")
Dep <- c("FGC","HAM","ICAO")
Plan <- data.frame(date, Number, Route, Dep)
x1 <- mean(test$Route_2)
y1 <- max(test$Route_5)
z1 <- min(test$Route_8)
x1以下是我如何使用tidyverse
软件包:
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
# This function takes a single item from Plan$Route, splits it into its
# relevant columns and then finds the mean of columns 2, 5 and 8.
route_extract <- function(route) {
cols <- str_split(route, fixed(":"), simplify = TRUE)[, c(2, 5, 8), drop = FALSE]
# Converts the matrix to numeric without losing dimensions
storage.mode(cols) <- "numeric"
# Calculate the column means and then return the result as a `tibble`
cm <- colMeans(cols)
tibble(x = cm[1], y = cm[2], z = cm[3])
}
route_calc <- function(routes) {
str_split(routes, fixed(";")) %>%
map_df(route_extract)
}
Plan <- bind_cols(Plan, route_calc(Plan$Route))
库(dplyr)
图书馆(tidyr)
图书馆(stringr)
图书馆(purrr)
#此函数从Plan$Route中提取单个项目,将其拆分为
#然后找到第2、5和8列的平均值。
route_extract以下是我如何使用tidyverse
软件包完成的:
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
# This function takes a single item from Plan$Route, splits it into its
# relevant columns and then finds the mean of columns 2, 5 and 8.
route_extract <- function(route) {
cols <- str_split(route, fixed(":"), simplify = TRUE)[, c(2, 5, 8), drop = FALSE]
# Converts the matrix to numeric without losing dimensions
storage.mode(cols) <- "numeric"
# Calculate the column means and then return the result as a `tibble`
cm <- colMeans(cols)
tibble(x = cm[1], y = cm[2], z = cm[3])
}
route_calc <- function(routes) {
str_split(routes, fixed(";")) %>%
map_df(route_extract)
}
Plan <- bind_cols(Plan, route_calc(Plan$Route))
库(dplyr)
图书馆(tidyr)
图书馆(stringr)
图书馆(purrr)
#此函数从Plan$Route中提取单个项目,将其拆分为
#然后找到第2、5和8列的平均值。
route_extract创建第二个名为route_tmp
的临时route列,并从中为每个组件生成一个单独的行,用分号分隔,然后用冒号将生成的route_tmp
变量分隔为单独的列。现在按原始变量分组,我们取所需列的平均值。(请注意,如果我们在输出中不需要Route
,那么我们可以省略顶部的mutate
,并使用Route
代替Route\u tmp
)
库(dplyr)
图书馆(tidyr)
超出%
变异(路线\ U tmp=路线)%>%
分隔行(路由tmp,sep=“;”)%>%
分离(路由_tmp,如.character(1:8),convert=TRUE)%>%
分组依据(日期、编号、路线、部门)%>%
总结(x=平均值(`2`),y=平均值(`5`),z=平均值(`8`))%>%
解组
给出以下内容(为了便于阅读,我们不显示Route列):
>输出[-3]
#一个tibble:3×6
日期编号Dep x y z
1 2013-05-06 231火腿8224.333 17 33.66667
2 2015-07-10345 FGC 8224.333 17 33.66667
3 2017-08-10 10国际民航组织8224.333 17 33.66667
注意:由于问题中的计划被覆盖,我不清楚输入的是哪个版本的计划,但我假设:
Plan <- data.frame(date = c("2015-07-10", "2013-05-06", "2017-08-10"),
Number = c(345, 231, 10),
Route = c("GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67", "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67", "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"),
Dep = c("FGC","HAM","ICAO"))
Plan创建第二个名为Route\u tmp
的临时路由列,并从中为每个组件生成一个单独的行,用分号分隔,然后用冒号将生成的Route\u tmp
变量分隔为单独的列。现在按原始变量分组,我们取所需列的平均值。(请注意,如果我们在输出中不需要Route
,那么我们可以省略顶部的mutate
,并使用Route
代替Route\u tmp
)
库(dplyr)
图书馆(tidyr)
超出%
变异(路线\ U tmp=路线)%>%
分隔行(路由tmp,sep=“;”)%>%
分离(路由_tmp,如.character(1:8),convert=TRUE)%>%
分组依据(日期、编号、路线、部门)%>%
总结(x=平均值(`2`),y=平均值(`5`),z=平均值(`8`))%>%
解组
给出以下内容(为了便于阅读,我们不显示Route列):
>输出[-3]
#一个tibble:3×6
日期编号Dep x y z
1 2013-05-06 231火腿8224.333 17 33.66667
2 2015-07-10345 FGC 8224.333 17 33.66667
3 2017-08-10 10国际民航组织8224.333 17 33.66667
注意:由于问题中的计划被覆盖,我不清楚输入的是哪个版本的计划,但我假设:
Plan <- data.frame(date = c("2015-07-10", "2013-05-06", "2017-08-10"),
Number = c(345, 231, 10),
Route = c("GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67", "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67", "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"),
Dep = c("FGC","HAM","ICAO"))
计划请包括您所提供的示例data.frame的所需输出。我怀疑您想要strsplit
,但不完全确定最终的data.frame。在tidyr
包中使用separate
功能可能会有所帮助here@G.Grothendieck我希望示例代码能有所帮助。请为您提供的示例data.frame提供所需的输出。我怀疑您想要strsplit
,但不完全确定最终的data.frame。在tidyr
包中使用separate
功能可能会有所帮助here@G.Grothendieck我希望示例代码对您有所帮助。
library(dplyr)
library(tidyr)
out <- Plan %>%
mutate(Route_tmp = Route) %>%
separate_rows(Route_tmp, sep = ";") %>%
separate(Route_tmp, as.character(1:8), convert = TRUE) %>%
group_by(date, Number, Route, Dep) %>%
summarize(x = mean(`2`), y = mean(`5`), z = mean(`8`)) %>%
ungroup
> out[-3]
# A tibble: 3 × 6
date Number Dep x y z
<fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
1 2013-05-06 231 HAM 8224.333 17 33.66667
2 2015-07-10 345 FGC 8224.333 17 33.66667
3 2017-08-10 10 ICAO 8224.333 17 33.66667
Plan <- data.frame(date = c("2015-07-10", "2013-05-06", "2017-08-10"),
Number = c(345, 231, 10),
Route = c("GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67", "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67", "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"),
Dep = c("FGC","HAM","ICAO"))