在 R 中将一列重新排列为两列的数据帧

将一列重新排列为两列数据帧r,r,dplyr,tidyverse,R,Dplyr,Tidyverse,我有一个很大的文件,它的组织方式非常不方便,所有值都在一列中,每个单元格有七个值,最后两行除外,如下所示: df <- c('(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)', '(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)', '(98690=9) (98920=3) (98930=5) (98940=5

我有一个很大的文件,它的组织方式非常不方便,所有值都在一列中,每个单元格有七个值,最后两行除外,如下所示:

# Sample input: every element holds several "(key=value)" pairs separated by
# single spaces; the last two elements hold only three pairs each.
df <- c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
)

经典的做法是使用 `gsub` 和 `strsplit`:

# Base-R approach: strip the parentheses, split every element on "=" or a
# whitespace character, and fill the resulting flat numeric vector into a
# two-column matrix row by row, so each "(key=value)" pair becomes one row.
#
# Reads the character vector `df` and overwrites it with the parsed result.
# as.data.frame() is used so the columns are named V1 and V2.
df <- as.data.frame(
  matrix(
    as.double(unlist(strsplit(gsub("[\\(\\)]", "", df), "=|\\s"))),
    ncol = 2,
    byrow = TRUE  # consecutive tokens (key, value) belong to the same row
  )
)

#       V1 V2
# 1  98440  9
# 2  98450  9
# 3  98500  9
# 4  98520  9
# 5  98530  9
# 6  98540  9
# 7  98550  9
# 8  98555  9
# 9  98560  9
# 10 98570  9
# 11 98590  9
# 12 98600  9
# 13 98620  9
# 14 98630  9
# 15 98690  9
# 16 98920  3
# 17 98930  5
# 18 98940  5
# 19 98950  9
# 20 98990 11
# 21 99900 -1
# 22 99910 11
# 23 99920 -1
# 24 99930 11
# 25    -1 -1
# 26    -2 -1
# 27 99999 -1
两种解决方案:

这里有一个两步解决方案,使用 `stringr` 包中的 `str_extract` 进行提取。

第一步——将每行中的一串值拆分为单个值:

# Step 1: break every element apart at the spaces so that each "(key=value)"
# token becomes its own entry of one flat character vector.
df1 <- unlist(
  strsplit(x = df, split = " ")
)
或者,这里有一个使用 `str_extract_all` 的一步解决方案:

# One-step extraction with stringr: run one regex per output column over all
# of `df` and flatten the per-element matches into a single vector.
#   col1 - optionally signed digits immediately followed by "="
#   col2 - optionally signed digits immediately preceded by "="
# Both columns hold the matched text as extracted (no numeric conversion).
df1 <- as.data.frame(
  lapply(
    c(col1 = "(-)?\\d+(?==)", col2 = "(?<==)(-)?\\d+"),
    function(pattern) unlist(str_extract_all(df, pattern))
  )
)
结果:

df2
    col1 col2
1  98440    9
2  98450    9
3  98500    9
4  98520    9
5  98530    9
6  98540    9
7  98550    9
8  98555    9
9  98560    9
10 98570    9
11 98590    9
12 98600    9
13 98620    9
14 98630    9
15 98690    9
16 98920    3
17 98930    5
18 98940    5
19 98950    9
20 98990   11
21 99900   -1
22 99910   11
23 99920   -1
24 99930   11
25    -1   -1
26    -2   -1
27 99999   -1

tidyverse 的设计目的正是为了方便地整理凌乱的数据。

library(tidyverse)

# Tidyverse approach: split each string into "(key=value)" tokens, flatten
# them into a one-column tibble, drop the parentheses, then split every token
# at "=" into the two result columns. The result is printed, not assigned.
c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
) %>%
  # one list entry per input string, each holding that string's tokens
  purrr::map(~ str_split(.x, pattern = " ")) %>%
  unlist() %>%
  # name = NULL drops the index column, leaving the single column "Numbers"
  tibble::enframe(name = NULL, value = "Numbers") %>%
  dplyr::mutate(
    Numbers = str_replace_all(Numbers, pattern = "[()]", replacement = "")
  ) %>%
  tidyr::separate(
    col = Numbers,
    into = c("Number 1", "Number 2"),
    sep = "="
  )

如果您在第一步中将数据帧而不是字符向量传递给它,同样的代码也会起作用。

抱歉各位,我忘了说明数据是存放在数据帧中的。如果 df 是字符向量,请使用下面第一段代码;如果 df 是数据帧且数据位于 Var1 列,请使用第二段代码(df$Var1):
# Variant for `df` given directly as a character vector: collect every key
# (signed digits before "=") and every value (signed digits after "=") across
# all elements and place them side by side.
df1 <- local({
  # flatten all matches of one pattern over the whole input
  grab <- function(pattern) unlist(str_extract_all(df, pattern))
  data.frame(
    col1 = grab("(-)?\\d+(?==)"),
    col2 = grab("(?<==)(-)?\\d+")
  )
})
# Variant for `df` given as a data frame: the same key/value extraction,
# applied to the strings stored in its `Var1` column.
df1 <- local({
  cells <- df$Var1
  data.frame(
    col1 = unlist(str_extract_all(cells, "(-)?\\d+(?==)")),  # text before "="
    col2 = unlist(str_extract_all(cells, "(?<==)(-)?\\d+"))  # text after "="
  )
})
df2
    col1 col2
1  98440    9
2  98450    9
3  98500    9
4  98520    9
5  98530    9
6  98540    9
7  98550    9
8  98555    9
9  98560    9
10 98570    9
11 98590    9
12 98600    9
13 98620    9
14 98630    9
15 98690    9
16 98920    3
17 98930    5
18 98940    5
19 98950    9
20 98990   11
21 99900   -1
22 99910   11
23 99920   -1
24 99930   11
25    -1   -1
26    -2   -1
27 99999   -1
library(tidyverse)

# Tidyverse approach: tokenise the strings, strip the parentheses, and split
# each "key=value" token into two columns. The result is printed, not assigned.
c(
  "(98440=9) (98450=9) (98500=9) (98520=9) (98530=9) (98540=9) (98550=9)",
  "(98555=9) (98560=9) (98570=9) (98590=9) (98600=9) (98620=9) (98630=9)",
  "(98690=9) (98920=3) (98930=5) (98940=5) (98950=9) (98990=11) (99900=-1)",
  "(99910=11) (99920=-1) (99930=11)",
  "(-1=-1) (-2=-1) (99999=-1)"
) %>%
  # str_split() is vectorised over its input, so it yields one token vector
  # per string; unlist() then flattens them into a single character vector
  stringr::str_split(" ") %>%
  unlist() %>%
  # remove the brackets before building the tibble
  stringr::str_replace_all("[()]", "") %>%
  tibble::enframe(name = NULL, value = "Numbers") %>%
  # split at "=" into the key column and the value column
  tidyr::separate(Numbers, into = c("Number 1", "Number 2"), sep = "=")