将一列重新排列为两列数据帧r
我有一个很大的文件,它的组织方式非常不方便:所有值都在一列中,除最后两行外,每个单元格有七个值,如下所示(标签:r、dplyr、tidyverse):
# Sample input: every element packs several space-separated "(key=value)"
# pairs into one string -- seven pairs per element, except the last two
# elements, which hold three pairs each.
df <- c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
)
经典的做法是使用 gsub 和 strsplit:
# Base-R approach: strip the parentheses from every string, split on "=" or
# whitespace, convert the pieces to numbers, and fill a two-column matrix row
# by row so that each "(key=value)" pair ends up as one row.
# NOTE(review): `v` is not defined in this snippet; it is presumably the
# character vector of "(key=value)" strings from the question -- confirm.
df <- data.frame(
  matrix(
    as.double(unlist(strsplit(gsub("[\\(\\)]", "", v), "=|\\s"))),
    ncol = 2,
    byrow = TRUE # spelled out: `b=T` relied on partial matching and on `T`
  )
)
# V1 V2
# 1 98440 9
# 2 98450 9
# 3 98500 9
# 4 98520 9
# 5 98530 9
# 6 98540 9
# 7 98550 9
# 8 98555 9
# 9 98560 9
# 10 98570 9
# 11 98590 9
# 12 98600 9
# 13 98620 9
# 14 98630 9
# 15 98690 9
# 16 98920 3
# 17 98930 5
# 18 98940 5
# 19 98950 9
# 20 98990 11
# 21 99900 -1
# 22 99910 11
# 23 99920 -1
# 24 99930 11
# 25 -1 -1
# 26 -2 -1
# 27 99999 -1
两种解决方案:
这里有一个两步解决方案,使用 stringr 包中的 str_extract。
第一步——将整串值拆分为单个值:
# Step 1: cut every string at the spaces and flatten the per-string pieces
# into a single character vector of "(key=value)" tokens.
df1 <- unlist(
  strsplit(x = df, split = " ")
)
或者,这里有一个使用 str_extract_all 的一步解决方案:
# One-step variant: pull the keys and the values out of `df` with two regular
# expressions and bind them side by side as columns `col1` and `col2`.
df1 <- local({
  # optionally signed digits immediately followed by "=" (lookahead): the key
  keys <- str_extract_all(df, "(-)?\\d+(?==)")
  # optionally signed digits immediately preceded by "=" (lookbehind): the value
  values <- str_extract_all(df, "(?<==)(-)?\\d+")
  # unlist() flattens the per-string matches into one vector per column
  data.frame(col1 = unlist(keys), col2 = unlist(values))
})
结果:
df2
col1 col2
1 98440 9
2 98450 9
3 98500 9
4 98520 9
5 98530 9
6 98540 9
7 98550 9
8 98555 9
9 98560 9
10 98570 9
11 98590 9
12 98600 9
13 98620 9
14 98630 9
15 98690 9
16 98920 3
17 98930 5
18 98940 5
19 98950 9
20 98990 11
21 99900 -1
22 99910 11
23 99920 -1
24 99930 11
25 -1 -1
26 -2 -1
27 99999 -1
tidyverse 的设计目的正是为了方便地整理凌乱的数据:
library(tidyverse)
# Reshape the "(key=value)" strings into a two-column tibble with one row per
# pair; the columns are named "Number 1" and "Number 2".
c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
) %>%
  # split each string into its individual components; str_split() is already
  # vectorised over its input, so no purrr::map() wrapper is needed
  stringr::str_split(" ") %>%
  unlist() %>%
  # one-column tibble: no name column, values stored in "Numbers"
  tibble::enframe(name = NULL, value = "Numbers") %>%
  # remove the brackets as they are unnecessary
  dplyr::mutate(Numbers = stringr::str_replace_all(Numbers, "[()]", "")) %>%
  # separate using "=" as the separator
  tidyr::separate(Numbers, c("Number 1", "Number 2"), sep = "=")
如果您在第一步中将数据帧而不是字符向量传递给它,同样的代码也会起作用。对不起,伙计们,我忘了提到数据是组织在数据帧中的。如果 df 是字符向量,使用下面的第一种写法;如果 df 是数据帧(列名为 Var1),使用第二种写法:
# Variant for `df` passed directly to the extractor: pull the keys and the
# values out with two regular expressions and bind them as `col1` and `col2`.
df1 <- local({
  # optionally signed digits immediately followed by "=" (lookahead): the key
  keys <- str_extract_all(df, "(-)?\\d+(?==)")
  # optionally signed digits immediately preceded by "=" (lookbehind): the value
  values <- str_extract_all(df, "(?<==)(-)?\\d+")
  # unlist() flattens the per-string matches into one vector per column
  data.frame(col1 = unlist(keys), col2 = unlist(values))
})
# Variant for a data frame input: the strings are read from column `Var1`,
# then keys and values are extracted and bound as `col1` and `col2`.
df1 <- local({
  raw_strings <- df$Var1
  # optionally signed digits immediately followed by "=" (lookahead): the key
  keys <- str_extract_all(raw_strings, "(-)?\\d+(?==)")
  # optionally signed digits immediately preceded by "=" (lookbehind): the value
  values <- str_extract_all(raw_strings, "(?<==)(-)?\\d+")
  # unlist() flattens the per-string matches into one vector per column
  data.frame(col1 = unlist(keys), col2 = unlist(values))
})
df2
col1 col2
1 98440 9
2 98450 9
3 98500 9
4 98520 9
5 98530 9
6 98540 9
7 98550 9
8 98555 9
9 98560 9
10 98570 9
11 98590 9
12 98600 9
13 98620 9
14 98630 9
15 98690 9
16 98920 3
17 98930 5
18 98940 5
19 98950 9
20 98990 11
21 99900 -1
22 99910 11
23 99920 -1
24 99930 11
25 -1 -1
26 -2 -1
27 99999 -1
library(tidyverse)
# Reshape the "(key=value)" strings into a two-column tibble with one row per
# pair; the columns are named "Number 1" and "Number 2".
c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
) %>%
  # split each string into its individual components; str_split() is already
  # vectorised over its input, so no purrr::map() wrapper is needed
  stringr::str_split(" ") %>%
  unlist() %>%
  # one-column tibble: no name column, values stored in "Numbers"
  tibble::enframe(name = NULL, value = "Numbers") %>%
  # remove the brackets as they are unnecessary
  dplyr::mutate(Numbers = stringr::str_replace_all(Numbers, "[()]", "")) %>%
  # separate using "=" as the separator
  tidyr::separate(Numbers, c("Number 1", "Number 2"), sep = "=")