R 整理数据：重命名列，获取非NA列名，然后收集_R_Dplyr_Tidyr_Stringr_Mutate

R 整理数据：重命名列，获取非NA列名，然后收集

R 整理数据：重命名列，获取非NA列名，然后收集,r,dplyr,tidyr,stringr,mutate,R,Dplyr,Tidyr,Stringr,Mutate,我有一些非常难看的数据需要整理，需要帮助！我的数据现在是什么样子： countries <- c("Austria", "Belgium", "Croatia") df <- tibble("age" = c(28,42,19, 67), "1_recreate_1"=c(NA,15,NA,NA), "1_recreate_2"=c(NA,10,NA,NA), "1_recreate_3"=c(NA,8,NA,NA),

我有一些非常难看的数据需要整理，需要帮助！我的数据现在是什么样子：

# Country names; the numeric prefix of each survey column (1, 2, 3) is the
# position of the matching country in this vector.
countries <- c("Austria", "Belgium", "Croatia")

# Wide example data: one row per respondent. Each respondent only has values
# in the block of columns belonging to a single country prefix; the other
# prefixed columns are NA for that row.
df <- tibble(
  age = c(28, 42, 19, 67),
  # Prefix 1
  `1_recreate_1` = c(NA, 15, NA, NA),
  `1_recreate_2` = c(NA, 10, NA, NA),
  `1_recreate_3` = c(NA, 8, NA, NA),
  `1_recreate_4` = c(NA, 4, NA, NA),
  `1_fairness` = c(NA, 7, NA, NA),
  `1_confidence` = c(NA, 5, NA, NA),
  # Prefix 2
  `2_recreate_1` = c(29, NA, NA, 30),
  `2_recreate_2` = c(20, NA, NA, 24),
  `2_recreate_3` = c(15, NA, NA, 15),
  `2_recreate_4` = c(11, NA, NA, 9),
  `2_fairness` = c(4, NA, NA, 1),
  `2_confidence` = c(5, NA, NA, 4),
  # Prefix 3
  `3_recreate_1` = c(NA, NA, 50, NA),
  `3_recreate_2` = c(NA, NA, 40, NA),
  `3_recreate_3` = c(NA, NA, 30, NA),
  `3_recreate_4` = c(NA, NA, 20, NA),
  `3_fairness` = c(NA, NA, 2, NA),
  `3_confidence` = c(NA, NA, 2, NA),
  overall = c(3, 3, 2, 5)
)

2。通过获取每行非NA列的名称，创建一个名为“country”的新变量。

我用

which.max

和

names

做了一系列实验，但都没能让它完全正常工作

3。创建新变量（
recreate_1
…
recreate_4
），获取
[country_name]_recreate_1
…
[country_name]_recreate_4
在每行中的值，即取该人非NA的那个国家所对应的值。

也许

rowSums

是实现这一点的方法

4。使数据变长而不是变宽 我认为这需要

gather

，但我不确定如何仅从变量

country

和

recreate_1

…

recreate_4

进行 gather

很抱歉，这太复杂了。Tidyverse解决方案是首选，但非常感谢任何帮助

library(dplyr)
library(tidyr)
# Reshape the wide survey data into one row per respondent and recreate item,
# with the respondent's country, fairness and confidence attached.
#
# Fix: the second gather() must exclude the lower-case `country` column that
# is created in the mutate() above it; the previous `Country` does not exist.
df %>%
  # Row id keeps each respondent identifiable across gather()/spread().
  mutate(rid = row_number()) %>%
  # Stack only the prefixed fairness/confidence columns; the recreate columns
  # are kept wide for now.
  gather(key, val, -c(age, overall, rid, matches("recreate"))) %>%
  # The leading digit of the column name is the index into `countries`.
  mutate(
    country = sub("(^\\d)_.*", "\\1", key),
    country = countries[as.numeric(country)]
  ) %>%
  # Keep only the country block the respondent actually answered.
  filter(!is.na(val)) %>%
  # Drop the "<digit>_" prefix so the keys become "fairness" / "confidence".
  mutate(key = sub("(^\\d\\_)(.*)", "\\2", key)) %>%
  spread(key, val) %>%
  # Now stack the remaining (recreate) columns into key/value pairs.
  gather(
    key = recreate, value = allocation,
    -c(rid, age, overall, country, confidence, fairness)
  ) %>%
  filter(!is.na(allocation)) %>%
  # Keep only the trailing item number of e.g. "2_recreate_3".
  mutate(recreate = sub(".*_(\\d$)", "\\1", recreate))

这里的

(^\\d)_.*

表示获取开头的第一个数字，而

.*_(\\d$)

表示获取末尾的最后一个数字

这里的

(^\\d)_.*

表示获取开头的第一个数字，而

.*_(\\d$)

表示获取末尾的最后一个数字

一种不同的

tidyverse

可能性可能是：

# Wide-to-long reshape that labels every non-missing value with its country.
# NOTE(review): the printed result only shows recreate items, so this appears
# to assume `df` holds only "<id>_recreate_<n>" columns - confirm.
df %>%
  # Stack all columns into (variable, allocation) pairs, dropping NA cells.
  gather(variable, allocation, na.rm = TRUE) %>%
  # Split each column name into its three parts.
  separate(variable, c("ID", "variable", "recreate"), convert = TRUE) %>%
  # Look up the country name by its position in `countries`.
  left_join(
    data.frame(countries) %>%
      transmute(country = countries, ID = seq_along(countries)),
    by = "ID"
  ) %>%
  select(-variable, -ID)

   recreate allocation country
      <int>      <dbl> <fct>  
 1        1         15 Austria
 2        2         10 Austria
 3        3          8 Austria
 4        4          4 Austria
 5        1         29 Belgium
 6        1         30 Belgium
 7        2         20 Belgium
 8        2         24 Belgium
 9        3         15 Belgium
10        3         15 Belgium
11        4         11 Belgium
12        4          9 Belgium
13        1         50 Croatia
14        2         40 Croatia
15        3         30 Croatia
16        4         20 Croatia

df%>%
聚集（变量，分配，na.rm=TRUE）%>%
单独（变量，c（“ID”、“变量”、“重新创建”），转换为TRUE）%>%
左联合（数据帧（国家）%>%
变异（国家=国家，
ID=沿着（国家）的顺序%>%
选择（-countries），按=c（“ID”=“ID”））%>%
选择（-variable，-ID）
重新分配国家
1 115奥地利
2 2 10奥地利
3 3 8奥地利
4奥地利
5 1 29比利时
6 1 30比利时
7 2 20比利时
8 2 24比利时
9 3 15比利时
10 3 15比利时
11 4 11比利时
12 4 9比利时
13 150克罗地亚
14 2 40克罗地亚
15 3 30克罗地亚
16 4 20克罗地亚

在这里，它首先将数据从宽格式转换为长格式，删除带有NA的行。其次，它将变量名分为三列。第三，它将国家向量转换为数据框，并为每个国家分配一个唯一的ID。最后，它将这两个数据框连接起来并删除冗余变量

已编辑问题的解决方案：

# Long-format recreate allocations joined back to each respondent's country
# and to the remaining per-respondent variables.
df %>%
  # Part 1: recreate columns only, tagged with a row id.
  select(matches("(recreate)")) %>%
  rowid_to_column() %>%
  gather(var, allocation, -rowid, na.rm = TRUE) %>%
  separate(var, c("ID", "var", "recreate"), convert = TRUE) %>%
  select(-var) %>%
  # Look up the country name by its position in `countries`.
  left_join(
    data.frame(countries) %>%
      transmute(country = countries, ID = seq_along(countries)),
    by = "ID"
  ) %>%
  # Part 2: every non-recreate column, with prefixes stripped from the names,
  # reshaped back to one row per row id and joined on that id.
  left_join(
    df %>%
      select(-matches("(recreate)")) %>%
      rowid_to_column() %>%
      gather(var, val, -rowid, na.rm = TRUE) %>%
      # Remove everything that is not a letter from the column names.
      mutate(var = gsub("[^[:alpha:]]", "", var)) %>%
      spread(var, val),
    by = "rowid"
  ) %>%
  select(-rowid, -ID)

   recreate allocation country   age confidence fairness overall
      <int>      <dbl> <fct>   <dbl>      <dbl>    <dbl>   <dbl>
 1        1         15 Austria    42          5        7       3
 2        2         10 Austria    42          5        7       3
 3        3          8 Austria    42          5        7       3
 4        4          4 Austria    42          5        7       3
 5        1         29 Belgium    28          5        4       3
 6        1         30 Belgium    67          4        1       5
 7        2         20 Belgium    28          5        4       3
 8        2         24 Belgium    67          4        1       5
 9        3         15 Belgium    28          5        4       3
10        3         15 Belgium    67          4        1       5
11        4         11 Belgium    28          5        4       3
12        4          9 Belgium    67          4        1       5
13        1         50 Croatia    19          2        2       2
14        2         40 Croatia    19          2        2       2
15        3         30 Croatia    19          2        2       2
16        4         20 Croatia    19          2        2       2

df%>%
选择（匹配项（“（重新创建）”）%>%
rowid_到_列（）%>%
聚集（变量，分配，-rowid，na.rm=TRUE）%>%
单独（变量，c（“ID”，“变量”，“重新创建”），转换为真）%>%
选择（-var）%>%
左联合（数据帧（国家）%>%
变异（国家=国家，
ID=沿着（国家）的顺序%>%
选择（-countries），按=c（“ID”=“ID”））%>%
左联合（df%>%
选择（-matches（“（重新创建）”）%>%
rowid_到_列（）%>%
聚集（var，val，-rowid，na.rm=TRUE）%>%
突变（var=gsub（“[^[：alpha:][]”，“”，var））%>%
价差（var，val），按=c（“rowid”=“rowid”））%>%
选择（-rowid，-ID）
重新分配国家年龄信心总体公平性
1115奥地利42573
2 2 10奥地利42 5 7 3
奥地利42573
4 4奥地利42 5 7 3
5129比利时28543
6130比利时67415
7220比利时28543
8224比利时67415
9 3 15比利时28 5 4 3
10315比利时67415
11 4 11比利时28 5 4 3
12 4 9比利时67 4 1 5
13150克罗地亚1922
14 2 40克罗地亚19 2 2 2
1530克罗地亚1922
1620克罗地亚1922

在这里，它首先选择包含

recreate

的列，并添加一个行ID列。其次，它遵循原始解决方案中的步骤。第三，它选择不包含

recreate

的列，执行从宽到长的数据转换，从列名中删除非字母字符（数字和下划线），并将数据转换回宽格式。最后，它按行ID将两者连接起来并删除冗余变量。
一种不同的tidyverse
可能性可能是：
# Wide-to-long reshape that labels every non-missing value with its country.
# NOTE(review): the printed result only shows recreate items, so this appears
# to assume `df` holds only "<id>_recreate_<n>" columns - confirm.
df %>%
  # Stack all columns into (variable, allocation) pairs, dropping NA cells.
  gather(variable, allocation, na.rm = TRUE) %>%
  # Split each column name into its three parts.
  separate(variable, c("ID", "variable", "recreate"), convert = TRUE) %>%
  # Look up the country name by its position in `countries`.
  left_join(
    data.frame(countries) %>%
      transmute(country = countries, ID = seq_along(countries)),
    by = "ID"
  ) %>%
  select(-variable, -ID)

   recreate allocation country
      <int>      <dbl> <fct>  
 1        1         15 Austria
 2        2         10 Austria
 3        3          8 Austria
 4        4          4 Austria
 5        1         29 Belgium
 6        1         30 Belgium
 7        2         20 Belgium
 8        2         24 Belgium
 9        3         15 Belgium
10        3         15 Belgium
11        4         11 Belgium
12        4          9 Belgium
13        1         50 Croatia
14        2         40 Croatia
15        3         30 Croatia
16        4         20 Croatia

df%>%
聚集（变量，分配，na.rm=TRUE）%>%
单独（变量，c（“ID”、“变量”、“重新创建”），转换为TRUE）%>%
左联合（数据帧（国家）%>%
变异（国家=国家，
ID=沿着（国家）的顺序%>%
选择（-countries），按=c（“ID”=“ID”））%>%
选择（-variable，-ID）
重新分配国家
1 115奥地利
2 2 10奥地利
3 3 8奥地利
4奥地利
5 1 29比利时
6 1 30比利时
7 2 20比利时
8 2 24比利时
9 3 15比利时
10 3 15比利时
11 4 11比利时
12 4 9比利时
13 150克罗地亚
14 2 40克罗地亚
15 3 30克罗地亚
16 4 20克罗地亚

在这里，它首先将数据从宽格式转换为长格式，删除带有NA的行。其次，它将变量名分为三列。第三，它将国家向量转换为数据框，并为每个国家分配一个唯一的ID。最后，它将这两个数据框连接起来并删除冗余变量。
# Long-format recreate allocations joined back to each respondent's country
# and to the remaining per-respondent variables.
df %>%
  # Part 1: recreate columns only, tagged with a row id.
  select(matches("(recreate)")) %>%
  rowid_to_column() %>%
  gather(var, allocation, -rowid, na.rm = TRUE) %>%
  separate(var, c("ID", "var", "recreate"), convert = TRUE) %>%
  select(-var) %>%
  # Look up the country name by its position in `countries`.
  left_join(
    data.frame(countries) %>%
      transmute(country = countries, ID = seq_along(countries)),
    by = "ID"
  ) %>%
  # Part 2: every non-recreate column, with prefixes stripped from the names,
  # reshaped back to one row per row id and joined on that id.
  left_join(
    df %>%
      select(-matches("(recreate)")) %>%
      rowid_to_column() %>%
      gather(var, val, -rowid, na.rm = TRUE) %>%
      # Remove everything that is not a letter from the column names.
      mutate(var = gsub("[^[:alpha:]]", "", var)) %>%
      spread(var, val),
    by = "rowid"
  ) %>%
  select(-rowid, -ID)

   recreate allocation country   age confidence fairness overall
      <int>      <dbl> <fct>   <dbl>      <dbl>    <dbl>   <dbl>
 1        1         15 Austria    42          5        7       3
 2        2         10 Austria    42          5        7       3
 3        3          8 Austria    42          5        7       3
 4        4          4 Austria    42          5        7       3
 5        1         29 Belgium    28          5        4       3
 6        1         30 Belgium    67          4        1       5
 7        2         20 Belgium    28          5        4       3
 8        2         24 Belgium    67          4        1       5
 9        3         15 Belgium    28          5        4       3
10        3         15 Belgium    67          4        1       5
11        4         11 Belgium    28          5        4       3
12        4          9 Belgium    67          4        1       5
13        1         50 Croatia    19          2        2       2
14        2         40 Croatia    19          2        2       2
15        3         30 Croatia    19          2        2       2
16        4         20 Croatia    19          2        2       2