Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/73.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 基于由(,)连接并由空格分隔的字符向量对创建数据帧_R_Dataframe_Data.table - Fatal编程技术网

R 基于由(,)连接并由空格分隔的字符向量对创建数据帧

R 基于由(,)连接并由空格分隔的字符向量对创建数据帧,r,dataframe,data.table,R,Dataframe,Data.table,我有以下data.frame: b<-structure(list(b = c("47.83006,11.71699 47.83004,11.71691 47.83002,11.7168 47.83001,11.71662", "47.83001,11.71662 47.82993,11.71628 47.82991,11.7162 47.82988,11.71614 47.82983,11.71609 47.8295,11.71588 47.82919,1

我有以下data.frame:

b<-structure(list(b = c("47.83006,11.71699 47.83004,11.71691 47.83002,11.7168 47.83001,11.71662", 
"47.83001,11.71662 47.82993,11.71628 47.82991,11.7162 47.82988,11.71614 47.82983,11.71609 47.8295,11.71588 47.82919,11.71566 47.82898,11.71549 47.82845,11.71504 47.82832,11.715 47.82821,11.715 47.82712,11.71531 47.82639,11.71549 47.82606,11.71561 47.8257,11.71567 47.82548,11.71574 47.82433,11.71613", 
"47.82433,11.71613 47.82436,11.7165 47.8244,11.71715 47.82442,11.71742 47.82453,11.71823 47.82459,11.71856 47.82492,11.7199", 
"47.82492,11.7199 47.82495,11.72005 47.82503,11.72034 47.82515,11.72066 47.82526,11.72093 47.82556,11.72172 47.82559,11.72182 47.82561,11.72191 47.82562,11.72201", 
"47.85051,12.11965 47.85092,12.11997", "48.10034,11.75948 48.10021,11.75938"
)), row.names = c(NA, 6L), class = "data.frame")
更新 感谢您提供的解决方案。 我会选择@Gki方案,因为它更快:

Unit: milliseconds
                                                                                                                         expr
 c <- b %>% separate_rows(b, sep = " ") %>% separate(b, into = c("Lat",      "Lon"), sep = ",", convert = T) %>% data.frame()
                                     d <- read.csv(text = unlist(strsplit(b$b, " ", TRUE)), col.names = c("Lat",      "Lon"))
       min        lq      mean    median        uq       max neval
 12.363628 13.031700 14.027860 13.408883 13.703157 28.922909   100
  1.020622  1.050315  1.119533  1.117269  1.170826  1.348833   100
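单位:毫秒
expr
c <- b %>% separate_rows(b, sep = " ") %>% separate(b, into = c("Lat", "Lon"), sep = ",", convert = T) %>% data.frame()

d <- read.csv(text = unlist(strsplit(b$b, " ", TRUE)), col.names = c("Lat", "Lon"))

你可以使用
tidyr
来完成: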

  • 使用带有
    sep = ' '
    参数的
    separate_rows
    将额外的成对值拆分到新行中
  • 使用带有
    sep = ','
    参数以及可同时将值转换为数值的
    convert = T
    参数的
    separate
    将lat和lon值分离到不同的列中

b %>% separate_rows(b, sep = ' ') %>%
  separate(b, into = c('Lat', 'Lon'), sep = ',', convert = T) %>%
  data.frame()
#>         Lat      Lon
#> 1  47.83006 11.71699
#> 2  47.83004 11.71691
#> 3  47.83002 11.71680
#> 4  47.83001 11.71662
#> 5  47.83001 11.71662
#> 6  47.82993 11.71628
#> 7  47.82991 11.71620
#> 8  47.82988 11.71614
#> 9  47.82983 11.71609
#> 10 47.82950 11.71588
#> 11 47.82919 11.71566
#> 12 47.82898 11.71549
#> 13 47.82845 11.71504
#> 14 47.82832 11.71500
#> 15 47.82821 11.71500
#> 16 47.82712 11.71531
#> 17 47.82639 11.71549
#> 18 47.82606 11.71561
#> 19 47.82570 11.71567
#> 20 47.82548 11.71574
#> 21 47.82433 11.71613
#> 22 47.82433 11.71613
#> 23 47.82436 11.71650
#> 24 47.82440 11.71715
#> 25 47.82442 11.71742
#> 26 47.82453 11.71823
#> 27 47.82459 11.71856
#> 28 47.82492 11.71990
#> 29 47.82492 11.71990
#> 30 47.82495 11.72005
#> 31 47.82503 11.72034
#> 32 47.82515 11.72066
#> 33 47.82526 11.72093
#> 34 47.82556 11.72172
#> 35 47.82559 11.72182
#> 36 47.82561 11.72191
#> 37 47.82562 11.72201
#> 38 47.85051 12.11965
#> 39 47.85092 12.11997
#> 40 48.10034 11.75948
#> 41 48.10021 11.75938

由 reprex 包 (v2.0.0) 于2021年5月18日创建。

您可以使用
strsplit
按值之间的空格进行分割,然后使用
read.csv
得到一个
data.frame

read.csv(text=unlist(strsplit(b$b, " ", TRUE)), col.names = c("Lat", "Lon"))
#        Lat      Lon
#1  47.83004 11.71691
#2  47.83002 11.71680
#3  47.83001 11.71662
#4  47.83001 11.71662
#5  47.82993 11.71628
#6  47.82991 11.71620
#7  47.82988 11.71614
#...
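或者从R 4.1.0开始,在base中使用前向管道操作符
|>
和匿名函数简写
\()
:

strsplit(b$b, " ", TRUE) |> unlist() |> (\(x) read.csv(text = x, col.names = c("Lat", "Lon")))()
或使用bizarro管道
->.; 而不是定义函数: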

strsplit(b$b, " ", TRUE) |> unlist() ->.; read.csv(text=., col.names = c("Lat", "Lon"))
如果跳过设置列名和转换为数值这两步,并以矩阵作为结果,一种快速的方法是:

do.call(rbind, strsplit(unlist(strsplit(b$b, " ", TRUE)), ",", TRUE))
或将其转换为数字:

matrix(as.numeric(unlist(strsplit(unlist(strsplit(b$b, " ", TRUE)), ",", TRUE))), ncol=2, byrow=TRUE)
与 @mt1022 的
data.table
解决方案进行比较:

library(data.table)
microbenchmark::microbenchmark(
  base = do.call(rbind, strsplit(unlist(strsplit(b$b, " ", TRUE)), ",", TRUE))
, baseNum = matrix(as.numeric(unlist(strsplit(unlist(strsplit(b$b, " ", TRUE)), ",", TRUE))), ncol=2, byrow=TRUE)
, data.table = as.data.table(tstrsplit(unlist(strsplit(b$b, ' ', T)), ',', T))
)
#Unit: microseconds
#       expr     min       lq      mean   median       uq     max neval cld
#       base  28.829  30.2965  33.08313  31.5705  33.0475  85.880   100  a 
#    baseNum  29.832  31.3030  33.51445  32.3635  34.5395  56.851   100  a 
# data.table 143.745 147.9900 155.41194 150.9960 157.2420 278.190   100   b

这个解决方案远没有GKi和AnilGoyal提供的那么好,但它是有效的。下面是结合使用
separate
和
pivot_longer
的另一种方法:

library(dplyr)
library(tidyr)
b %>%  
  separate(b, c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l","m", "n", "o", "p", "q"), sep=" ",  extra = "drop", fill = "right") %>% 
  pivot_longer(
    cols=everything()
  ) %>% 
  drop_na() %>% 
  separate(value, c("lat", "long"), sep=",") %>% 
  select(-name)

输出:

   lat      long    
   <chr>    <chr>   
 1 47.83006 11.71699
 2 47.83004 11.71691
 3 47.83002 11.7168 
 4 47.83001 11.71662
 5 47.83001 11.71662
 6 47.82993 11.71628
 7 47.82991 11.7162 
 8 47.82988 11.71614
 9 47.82983 11.71609
10 47.8295  11.71588
lat-long
1 47.83006 11.71699
2 47.83004 11.71691
3 47.83002 11.7168 
4 47.83001 11.71662
5 47.83001 11.71662
6 47.82993 11.71628
7 47.82991 11.7162 
8 47.82988 11.71614
9 47.82983 11.71609
10 47.8295  11.71588

这里有一个稍快的
data.table
解决方案:

library(microbenchmark)
library(data.table)
microbenchmark(
    base = read.csv(text=unlist(strsplit(b$b, " ", TRUE)), col.names = c("Lat", "Lon")),
    data.table = as.data.table(tstrsplit(unlist(strsplit(b$b, ' ', T)), ',', T))
)
 # Unit: microseconds
 #       expr     min       lq     mean   median       uq     max neval
 #       base 354.102 360.5485 377.0983 371.2665 380.6985 527.916   100
 # data.table 151.252 161.8555 177.9840 178.1130 184.3945 348.759   100

其中的base R解决方案要归功于GKi。

出色的base R方法。向上投票+1谢谢!遗憾的是,我无法使用前向管道操作符方法,因为我的R版本是4.0.5,可能稍旧了一些,但csv方法非常快。很好的解决方案(从未听说过base R的管道操作符),但在微秒级别上做基准测试几乎毫无意义(IMO)。@DavidArenburg 从今天(4.1.0)起,base中的管道是新加入的。是的,这个基准测试没有什么意义,之所以给出它,只是因为它曾被用来选择答案,而且另一个回答显示data.table更快。谢谢你。也学到了很多!向上投票!它并不比另一个base解决方案快:
matrix(as.numeric(unlist(strsplit(unlist(strsplit(b$b, " ", TRUE)), ",", TRUE))), ncol=2, byrow=TRUE)
。我经常使用data.table,并错误地认为它在各个方面都很高效 :(
   lat      long    
   <chr>    <chr>   
 1 47.83006 11.71699
 2 47.83004 11.71691
 3 47.83002 11.7168 
 4 47.83001 11.71662
 5 47.83001 11.71662
 6 47.82993 11.71628
 7 47.82991 11.7162 
 8 47.82988 11.71614
 9 47.82983 11.71609
10 47.8295  11.71588
library(microbenchmark)
library(data.table)
microbenchmark(
    base = read.csv(text=unlist(strsplit(b$b, " ", TRUE)), col.names = c("Lat", "Lon")),
    data.table = as.data.table(tstrsplit(unlist(strsplit(b$b, ' ', T)), ',', T))
)
 # Unit: microseconds
 #       expr     min       lq     mean   median       uq     max neval
 #       base 354.102 360.5485 377.0983 371.2665 380.6985 527.916   100
 # data.table 151.252 161.8555 177.9840 178.1130 184.3945 348.759   100