如何在 R 中根据多个现有列的部分名称,由这些列生成新列?
我想根据现有多个列的名称,由这些列生成新列。下面是一些与我的表格近似的数据:
# Example data: nine subjects (s001-s009) in three groups, with lh_/rh_
# prefixed thickness and volume columns for apple, banana and orange.
id <- sprintf("s%03d", 1:9)
group <- c(0, 1, 2, 1, 2, 0, 0, 1, 2)

# Thickness columns.
lh_apple_thickness <- as.numeric(1:9)
lh_banana_thickness <- seq(1, 17, by = 2)
lh_orange_thickness <- seq(2, 18, by = 2)
rh_apple_thickness <- c(3, 7, 2, 1, 5, 4, 2, 6, 11)
rh_banana_thickness <- c(2, 4, 5, 4, 2, 4, 3, 1, 5)
rh_orange_thickness <- c(3, 6, 2, 4, 5, 6, 2, 5, 4)

# Volume columns hold the same values as the matching thickness columns.
lh_apple_volume <- lh_apple_thickness
lh_banana_volume <- lh_banana_thickness
lh_orange_volume <- lh_orange_thickness
rh_apple_volume <- rh_apple_thickness
rh_banana_volume <- rh_banana_thickness
rh_orange_volume <- rh_orange_thickness

# Column names are taken from the variable names passed in.
df <- data.frame(
  id, group,
  lh_apple_thickness, lh_banana_thickness, lh_orange_thickness,
  rh_apple_thickness, rh_banana_thickness, rh_orange_thickness,
  lh_apple_volume, lh_banana_volume, lh_orange_volume,
  rh_apple_volume, rh_banana_volume, rh_orange_volume
)

# Store group as a factor rather than a number.
df$group <- factor(df$group)
我认为最简单的方法是先“整理”(tidy)数据,例如让每个水果组合占一行。
我只能分两步来做,一步处理 lh,一步处理 rh;我相信还有更聪明的方法:
# Reshape the columns whose names start with `side` into long format: the
# original column name goes into a column called `side`, and the value into
# `<side>_values`. id and group are kept as the first two columns.
side_to_long <- function(data, side) {
  wide <- dplyr::select(data, id, group, grep(paste0("^", side), names(data)))
  tidyr::pivot_longer(
    wide,
    cols = 3:ncol(wide),
    names_to = side,
    values_to = paste0(side, "_values")
  )
}

df_lh <- side_to_long(df, "lh")
df_rh <- side_to_long(df, "rh")

# Joining on id and group pairs every lh row with every rh row of the same
# subject; the index is then computed for each such pair.
df_result <- dplyr::mutate(
  dplyr::left_join(df_lh, df_rh, by = c("id", "group")),
  result = (lh_values - rh_values) / (lh_values + rh_values)
)
编辑
将数据放回宽格式
我假设您只想将水果与水果进行比较,将属性(体积、厚度等)与属性进行比较
# Keep only the lh/rh pairs that refer to the same fruit (the name part
# between the two underscores) and the same measure (the part after the last
# underscore), then spread the index back out to one row per id and group.
df_result_wide <- df_result %>%
  dplyr::filter(
    stringr::str_extract(lh, "(?<=_)[^_]+(?=_)") ==
      stringr::str_extract(rh, "(?<=_)[^_]+(?=_)"),
    stringr::str_extract(lh, "(?<=_)[^_]+$") ==
      stringr::str_extract(rh, "(?<=_)[^_]+$")
  ) %>%
  tidyr::pivot_wider(
    id_cols = c("id", "group"),
    names_from = c("lh", "rh"),
    values_from = c("result")
  )
最后将结果与原始数据放在一起
# Attach the wide-format index columns to the original data, matching rows
# on id and group.
df_final <- df %>%
  dplyr::left_join(df_result_wide, by = c("id", "group"))
这对我来说很有效,但前提是 lh 列始终位于同一类别的相应 rh 列之前(否则您必须在代码中进行一些更改):
# Category names: strip the lh_/rh_ prefix, keep only names that still
# contain an underscore (this drops id and group), and de-duplicate.
categories <- names(df) %>%
  gsub("^[lr]h_", "", .) %>%
  {.[grepl("_", .)]} %>%
  unique()

# For each category, take its two columns (the lh column must come before the
# rh column in df), compute (lh - rh) / (lh + rh) row by row, and bind the
# resulting index columns onto the original data.
df2 <- lapply(categories, function(x) df[, grepl(x, names(df), fixed = TRUE)]) %>%
  sapply(function(x) rbind(apply(x, 1, function(y) (y[1] - y[2]) / (y[1] + y[2])))) %>%
  `colnames<-`(paste0(categories, "_index")) %>%
  cbind(df, .)
这里是 base R 选项,使用 lapply
从数据中提取 unique_fruits 和 unique_measure,因此它可以应用于任意数量的水果以及任意数量的度量:
# Fruit names: the middle part of every column name except the first two
# columns (id and group).
unique_fruits <- unique(sub(".*_(.*)_.*", "\\1", names(df)[-c(1, 2)]))
unique_fruits
#[1] "apple" "banana" "orange"
# Measure names: everything after the last underscore.
unique_measure <- unique(sub(".*_", "", names(df)[-c(1, 2)]))
unique_measure
#[1] "thickness" "volume"

# For every fruit, build one index column per measure. The fruit is part of
# the column name so that columns from different fruits do not end up with
# identical names once they are bound together.
index_cols <- lapply(unique_fruits, function(x) {
  per_measure <- lapply(unique_measure, function(y) {
    lhs <- sprintf("lh_%s_%s", x, y)
    rhs <- sprintf("rh_%s_%s", x, y)
    (df[[lhs]] - df[[rhs]]) / (df[[lhs]] + df[[rhs]])
  })
  setNames(
    do.call(cbind.data.frame, per_measure),
    paste0(x, "_", unique_measure, "_index")
  )
})

# Original columns first, followed by all index columns.
result <- cbind(df, do.call(cbind, index_cols))
一个简单的解决方案是首先生成所有水果和度量的组合,然后为每个可能的组合创建所需的索引。考虑这个函数:
#' Add (lh - rh) / (lh + rh) index columns to a data frame
#'
#' Every combination of the name parts supplied in `...` is joined with "_"
#' to form a stem. For each stem, the columns `lh_<stem>` and `rh_<stem>` are
#' read from `df` and a new column `<stem>_index` is added.
#'
#' @param df A data frame containing the `lh_<stem>` and `rh_<stem>` columns.
#' @param ... Vectors of name parts, in the order they appear in the column
#'   names, e.g. `c("apple", "banana"), "thickness"`. The first vector varies
#'   fastest when combinations are generated.
#' @return `df` with one `<stem>_index` column appended per combination.
#'   With no name parts supplied, `df` is returned unchanged.
gen_index <- function(df, ...) {
  parts <- list(...)
  if (length(parts) == 0L) {
    return(df)
  }

  # Base R replacement for the deprecated purrr::cross(); the combinations
  # come out in the same order (first vector varies fastest).
  combos <- expand.grid(parts, stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE)
  # unname() so that named arguments in `...` cannot be mistaken for
  # arguments of paste().
  nms <- do.call(paste, c(unname(as.list(combos)), sep = "_"))

  lh <- paste0("lh_", nms)
  rh <- paste0("rh_", nms)
  res <- paste0(nms, "_index")

  # Fail early with a readable message instead of an obscure assignment error.
  absent <- setdiff(c(lh, rh), names(df))
  if (length(absent) > 0L) {
    stop(
      "Columns not found in `df`: ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  for (i in seq_along(nms)) {
    left <- df[[lh[i]]]
    right <- df[[rh[i]]]
    df[[res[i]]] <- (left - right) / (left + right)
  }
  df
}
调用 gen_index(df, c("apple", "banana", "orange"), c("thickness", "volume")) 会得到:
id group lh_apple_thickness lh_banana_thickness lh_orange_thickness rh_apple_thickness rh_banana_thickness rh_orange_thickness lh_apple_volume lh_banana_volume
1 s001 0 1 1 2 3 2 3 1 1
2 s002 1 2 3 4 7 4 6 2 3
3 s003 2 3 5 6 2 5 2 3 5
4 s004 1 4 7 8 1 4 4 4 7
5 s005 2 5 9 10 5 2 5 5 9
6 s006 0 6 11 12 4 4 6 6 11
7 s007 0 7 13 14 2 3 2 7 13
8 s008 1 8 15 16 6 1 5 8 15
9 s009 2 9 17 18 11 5 4 9 17
lh_orange_volume rh_apple_volume rh_banana_volume rh_orange_volume apple_thickness_index banana_thickness_index orange_thickness_index apple_volume_index
1 2 3 2 3 -0.5000000 -0.3333333 -0.2000000 -0.5000000
2 4 7 4 6 -0.5555556 -0.1428571 -0.2000000 -0.5555556
3 6 2 5 2 0.2000000 0.0000000 0.5000000 0.2000000
4 8 1 4 4 0.6000000 0.2727273 0.3333333 0.6000000
5 10 5 2 5 0.0000000 0.6363636 0.3333333 0.0000000
6 12 4 4 6 0.2000000 0.4666667 0.3333333 0.2000000
7 14 2 3 2 0.5555556 0.6250000 0.7500000 0.5555556
8 16 6 1 5 0.1428571 0.8750000 0.5238095 0.1428571
9 18 11 5 4 -0.1000000 0.5454545 0.6363636 -0.1000000
banana_volume_index orange_volume_index
1 -0.3333333 -0.2000000
2 -0.1428571 -0.2000000
3 0.0000000 0.5000000
4 0.2727273 0.3333333
5 0.6363636 0.3333333
6 0.4666667 0.3333333
7 0.6250000 0.7500000
8 0.8750000 0.5238095
9 0.5454545 0.6363636
如果只想计算数据帧子集的索引:
# Only the apple and banana thickness indices are added; the result is
# printed because it is not assigned.
gen_index(df, c("apple", "banana"), "thickness")
输出
id group lh_apple_thickness lh_banana_thickness lh_orange_thickness rh_apple_thickness rh_banana_thickness rh_orange_thickness lh_apple_volume lh_banana_volume
1 s001 0 1 1 2 3 2 3 1 1
2 s002 1 2 3 4 7 4 6 2 3
3 s003 2 3 5 6 2 5 2 3 5
4 s004 1 4 7 8 1 4 4 4 7
5 s005 2 5 9 10 5 2 5 5 9
6 s006 0 6 11 12 4 4 6 6 11
7 s007 0 7 13 14 2 3 2 7 13
8 s008 1 8 15 16 6 1 5 8 15
9 s009 2 9 17 18 11 5 4 9 17
lh_orange_volume rh_apple_volume rh_banana_volume rh_orange_volume apple_thickness_index banana_thickness_index
1 2 3 2 3 -0.5000000 -0.3333333
2 4 7 4 6 -0.5555556 -0.1428571
3 6 2 5 2 0.2000000 0.0000000
4 8 1 4 4 0.6000000 0.2727273
5 10 5 2 5 0.0000000 0.6363636
6 12 4 4 6 0.2000000 0.4666667
7 14 2 3 2 0.5555556 0.6250000
8 16 6 1 5 0.1428571 0.8750000
9 18 11 5 4 -0.1000000 0.5454545
非常感谢。这个例子就是这样。事实上,我的真实数据中还有其他一些名称里带有“_”的列,所以我在应用到真实数据时出错了。可能需要先删除这些带有“_”的其他列。非常感谢,它确实有效。但是,我需要在原始数据框中添加这些新列。我会对问题进行编辑以使其更清楚。嗨,Ella_may,我添加了一些代码,将数据重新转换为宽格式,并与原始数据连接。让我知道它是否有效。另一件事:在我看来,以长格式工作更容易。从长远来看,如果你把数据转换为长格式,可能会省去不少麻烦。祝你好运:)嗨,贾格,非常感谢你。我还有一些其他列的名称中带有“_”,
这些列正是导致错误的主要原因。谢谢你帮我:)
# Index columns for every fruit/measure combination; the result is printed
# because it is not assigned.
gen_index(df, c("apple", "banana", "orange"), c("thickness", "volume"))
id group lh_apple_thickness lh_banana_thickness lh_orange_thickness rh_apple_thickness rh_banana_thickness rh_orange_thickness lh_apple_volume lh_banana_volume
1 s001 0 1 1 2 3 2 3 1 1
2 s002 1 2 3 4 7 4 6 2 3
3 s003 2 3 5 6 2 5 2 3 5
4 s004 1 4 7 8 1 4 4 4 7
5 s005 2 5 9 10 5 2 5 5 9
6 s006 0 6 11 12 4 4 6 6 11
7 s007 0 7 13 14 2 3 2 7 13
8 s008 1 8 15 16 6 1 5 8 15
9 s009 2 9 17 18 11 5 4 9 17
lh_orange_volume rh_apple_volume rh_banana_volume rh_orange_volume apple_thickness_index banana_thickness_index orange_thickness_index apple_volume_index
1 2 3 2 3 -0.5000000 -0.3333333 -0.2000000 -0.5000000
2 4 7 4 6 -0.5555556 -0.1428571 -0.2000000 -0.5555556
3 6 2 5 2 0.2000000 0.0000000 0.5000000 0.2000000
4 8 1 4 4 0.6000000 0.2727273 0.3333333 0.6000000
5 10 5 2 5 0.0000000 0.6363636 0.3333333 0.0000000
6 12 4 4 6 0.2000000 0.4666667 0.3333333 0.2000000
7 14 2 3 2 0.5555556 0.6250000 0.7500000 0.5555556
8 16 6 1 5 0.1428571 0.8750000 0.5238095 0.1428571
9 18 11 5 4 -0.1000000 0.5454545 0.6363636 -0.1000000
banana_volume_index orange_volume_index
1 -0.3333333 -0.2000000
2 -0.1428571 -0.2000000
3 0.0000000 0.5000000
4 0.2727273 0.3333333
5 0.6363636 0.3333333
6 0.4666667 0.3333333
7 0.6250000 0.7500000
8 0.8750000 0.5238095
9 0.5454545 0.6363636
# Only the apple and banana thickness indices are added; the result is
# printed because it is not assigned.
gen_index(df, c("apple", "banana"), "thickness")
id group lh_apple_thickness lh_banana_thickness lh_orange_thickness rh_apple_thickness rh_banana_thickness rh_orange_thickness lh_apple_volume lh_banana_volume
1 s001 0 1 1 2 3 2 3 1 1
2 s002 1 2 3 4 7 4 6 2 3
3 s003 2 3 5 6 2 5 2 3 5
4 s004 1 4 7 8 1 4 4 4 7
5 s005 2 5 9 10 5 2 5 5 9
6 s006 0 6 11 12 4 4 6 6 11
7 s007 0 7 13 14 2 3 2 7 13
8 s008 1 8 15 16 6 1 5 8 15
9 s009 2 9 17 18 11 5 4 9 17
lh_orange_volume rh_apple_volume rh_banana_volume rh_orange_volume apple_thickness_index banana_thickness_index
1 2 3 2 3 -0.5000000 -0.3333333
2 4 7 4 6 -0.5555556 -0.1428571
3 6 2 5 2 0.2000000 0.0000000
4 8 1 4 4 0.6000000 0.2727273
5 10 5 2 5 0.0000000 0.6363636
6 12 4 4 6 0.2000000 0.4666667
7 14 2 3 2 0.5555556 0.6250000
8 16 6 1 5 0.1428571 0.8750000
9 18 11 5 4 -0.1000000 0.5454545