R 在多个列中拆分完整地址列
标签:r, split, tidyr, stringr
我有一个具有以下列结构的数据框(总共超过 1000 行):该列包含位置、街道、门牌号、邮政编码、城市和国家。我想用 R 将列 addressfull 拆分为多个列,例如:
street house number zip city country
molengraaf 20 1689 GL Utrecht Netherlands
winkellaan 67 5788 BG Amsterdam Netherlands
vermeerstraat 18 0932 DC Rotterdam Netherlands
na na na na na
Zandhorstlaan 122 0823 GT Ochtrup Germany
我已经阅读了 tidyr 和 stringr 的文档。我能看出其中的模式(可以按“)”、从某个位置起的“||”以及“,”进行拆分),但我找不出将该列拆分为多个列的正确代码。
有人能帮我吗?
对于基础 R 方法,您可以使用 `sub` 以“蛮力”方式(每个字段各用一个正则表达式)来提取:
# Base R approach: pull each address component out of `df$adressfull` with a
# regular expression that has exactly one capture group.
#
# `sub()` returns its input unchanged when the pattern does not match, which
# would copy a placeholder such as "--" into every output column. This helper
# returns NA for such rows instead.
#
# pattern: regular expression containing one capture group.
# x:       character (or factor) vector of address strings.
# returns: character vector, same length as `x`; NA where `pattern` fails.
extract_group <- function(pattern, x) {
  x <- as.character(x)
  captured <- sub(pattern, "\\1", x)
  captured[!grepl(pattern, x)] <- NA_character_
  captured
}

# NOTE: the column is spelled `steet` (sic) to stay consistent with the
# printed result that follows this snippet.
# First whitespace-free token = street name.
df$steet <- extract_group("^(\\S+)\\s+.*$", df$adressfull)
# Digits right after the street name = house number.
df$`house number` <- extract_group("^\\S+\\s+(\\d+).*$", df$adressfull)
# After the first comma: digits, whitespace, capital letters = zip code.
df$zip <- extract_group(
  "^\\S+\\s+\\d+,\\s*(\\d+\\s+[A-Z]+).*$",
  df$adressfull
)
# Last token before the final comma = city.
df$city <- extract_group("^.*?(\\S+),\\s*\\S+$", df$adressfull)
# Token after the final comma = country.
df$country <- extract_group("^.*,\\s*(\\S+)$", df$adressfull)
df
adressfull steet house number zip
1 molengraaf 20, 1689 GL Utrecht, Netherlands molengraaf 20 1689 GL
city country
1 Utrecht Netherlands
这将是解决问题的一种简单方法:
library(tidyverse)

# Reproducible sample: a single factor column whose values are "||"-delimited
# records; the third field is the postal address ("--" when there is none).
df <- structure(list(addressfull = structure(c(3L, 5L, 4L, 2L, 1L), .Label = c(
  "POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
  "POINT(2.915206183 24.315583523)||DEF_32||--||13||map",
  "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
  "POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
  "POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
), class = "factor")), class = "data.frame", row.names = c(NA, -5L))

# Split step by step. Every separator also consumes the whitespace next to
# it, so house_number, zip, city and country carry no leading space.
df %>%
  # Keep the first three "||"-delimited fields; the rest is dropped.
  separate(
    addressfull,
    sep = "\\|\\|",
    into = c("Coords", "DEF", "ADDRESS"),
    extra = "drop"
  ) %>%
  select(ADDRESS) %>%
  # "street nr, zip city, country": split on comma plus following blanks.
  separate(ADDRESS, sep = ",\\s*", into = c("street", "city", "country")) %>%
  # Split on the whitespace that directly precedes the house number.
  separate(street, sep = "\\s+(?=\\d)", into = c("street", "house_number")) %>%
  # Split on the whitespace that follows the two capital letters of the zip.
  separate(city, sep = "(?<=[A-Z][A-Z])\\s+", into = c("zip", "city"))
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> street house_number zip city country
#> 1 molengraaf 20 1689 GL Utrecht Netherlands
#> 2 winkellaan 67 5788 BG Amsterdam Netherlands
#> 3 vermeerstraat 18 0932 DC Rotterdam Netherlands
#> 4 -- <NA> <NA> <NA> <NA>
#> 5 Zandhorstlaan 122 0823 GT Ochtrup Germany
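library(tidyverse)
df %>% separate(addressfull, sep = "\\|\\|", into = c("Coords", "DEF", "ADDRESS"), extra = "drop") %>%
  select(ADDRESS) %>%
  separate(ADDRESS, sep = ",", into = c("street", "city", "country")) %>%
  separate(street, sep = "(?= \\d)", into = c("street", "house_number")) %>%
  separate(city, sep = "(?<=[A-Z][A-Z])", into = c("zip", "city"))
一个 stringr 解决方案如下: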
# stringr approach: one regular expression per output column, each applied to
# the complete "POINT(...)||DEF_nn||<address>||..." record held in
# `addressfull$addressfull`. The named pattern vector supplies the column
# names of the resulting data frame.
addresssplit <- do.call(
  data.frame,
  lapply(
    c(
      # first word directly after the "DEF_nn||" field
      street = "(?<=DEF_\\d{2}\\|\\|)\\w+\\b",
      # first run of digits that is immediately followed by a comma
      number = "\\d{1,}(?=,)",
      # four digits, one whitespace, two capitals, preceded by whitespace
      zip = "(?<=\\s)\\d{4}\\s[A-Z]{2}",
      # word that follows the zip code and one whitespace
      city = "(?<=\\d{4}\\s[A-Z]{2}\\s)\\w+",
      # word after "<lowercase letter>, "
      country = "(?<=[a-z]\\b,\\s)\\w+\\b"
    ),
    function(pattern) str_extract(addressfull$addressfull, pattern)
  )
)
addresssplit
欢迎来到 StackOverflow!请阅读有关如何提出好问题以及如何提供可复现示例的信息,这将使其他人更容易帮助您。另外,请查看 tidyr 的 `separate()` 函数。
# One-row example frame holding an already-isolated address string.
# `stringsAsFactors = FALSE` keeps the column as character.
df <- data.frame(
  adressfull = "molengraaf 20, 1689 GL Utrecht, Netherlands",
  stringsAsFactors = FALSE
)
# Raw record: fields are delimited by a literal "||".
text <- "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map"
# Split on the literal delimiter and keep the third field (the address).
addresfull <- strsplit(text, "||", fixed = TRUE)[[1]][3]
addresfull
[1] "molengraaf 20, 1689 GL Utrecht, Netherlands"
library(tidyverse)

# Reproducible sample: a single factor column whose values are "||"-delimited
# records; the third field is the postal address ("--" when there is none).
df <- structure(list(addressfull = structure(c(3L, 5L, 4L, 2L, 1L), .Label = c(
  "POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
  "POINT(2.915206183 24.315583523)||DEF_32||--||13||map",
  "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
  "POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
  "POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
), class = "factor")), class = "data.frame", row.names = c(NA, -5L))

# Split step by step. Every separator also consumes the whitespace next to
# it, so house_number, zip, city and country carry no leading space.
df %>%
  # Keep the first three "||"-delimited fields; the rest is dropped.
  separate(
    addressfull,
    sep = "\\|\\|",
    into = c("Coords", "DEF", "ADDRESS"),
    extra = "drop"
  ) %>%
  select(ADDRESS) %>%
  # "street nr, zip city, country": split on comma plus following blanks.
  separate(ADDRESS, sep = ",\\s*", into = c("street", "city", "country")) %>%
  # Split on the whitespace that directly precedes the house number.
  separate(street, sep = "\\s+(?=\\d)", into = c("street", "house_number")) %>%
  # Split on the whitespace that follows the two capital letters of the zip.
  separate(city, sep = "(?<=[A-Z][A-Z])\\s+", into = c("zip", "city"))
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> street house_number zip city country
#> 1 molengraaf 20 1689 GL Utrecht Netherlands
#> 2 winkellaan 67 5788 BG Amsterdam Netherlands
#> 3 vermeerstraat 18 0932 DC Rotterdam Netherlands
#> 4 -- <NA> <NA> <NA> <NA>
#> 5 Zandhorstlaan 122 0823 GT Ochtrup Germany
# stringr approach: one regular expression per output column, each applied to
# the complete "POINT(...)||DEF_nn||<address>||..." record held in
# `addressfull$addressfull`. The named pattern vector supplies the column
# names of the resulting data frame.
addresssplit <- do.call(
  data.frame,
  lapply(
    c(
      # first word directly after the "DEF_nn||" field
      street = "(?<=DEF_\\d{2}\\|\\|)\\w+\\b",
      # first run of digits that is immediately followed by a comma
      number = "\\d{1,}(?=,)",
      # four digits, one whitespace, two capitals, preceded by whitespace
      zip = "(?<=\\s)\\d{4}\\s[A-Z]{2}",
      # word that follows the zip code and one whitespace
      city = "(?<=\\d{4}\\s[A-Z]{2}\\s)\\w+",
      # word after "<lowercase letter>, "
      country = "(?<=[a-z]\\b,\\s)\\w+\\b"
    ),
    function(pattern) str_extract(addressfull$addressfull, pattern)
  )
)
# Print the result.
addresssplit
street number zip city country
1 molengraaf 20 1689 GL Utrecht Netherlands
2 winkellaan 67 5788 BG Amsterdam Netherlands
3 vermeerstraat 18 0932 DC Rotterdam Netherlands
4 <NA> <NA> <NA> <NA> <NA>
5 Zandhorstlaan 122 0823 GT Ochtrup Germany
# Sample input: a five-row data frame with a single factor column. Each value
# is a "||"-delimited record whose third field is the postal address ("--"
# when no address is present). The integer codes select the row order; the
# labels are the factor levels.
addressfull <- data.frame(
  addressfull = factor(
    c(3L, 5L, 4L, 2L, 1L),
    levels = 1:5,
    labels = c(
      "POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
      "POINT(2.915206183 24.315583523)||DEF_32||--||13||map",
      "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
      "POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
      "POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
    )
  )
)