R 根据分隔符的计数分隔列中的字符串

R 根据分隔符的计数分隔列中的字符串,r,R,我正试图根据df中的分隔符(“_”)拆分第一列 第一个分隔符前的Ist字符串、第一个分隔符后的第二个字符串和最后一个分隔符前的最后一个字符串是连续的 根据上述声明 第_1行:根据分隔符和分隔符数量拆分的所有字符串为5 第2行:缺少分隔符2,3,4,因此值为空,分隔符的数量为2 第3行:缺少分隔符3,因此值为空,分隔符的数量为4 第4行:缺少分隔符2,3,因此值为空,分隔符的数量为3 我试过下面的代码 df$country<- sapply(strsplit(as.character(df$

我正试图根据df中的分隔符(“_”)拆分第一列

第一个分隔符前的第 1 个字符串、第一个分隔符后的第 2 个字符串以及最后一个分隔符后的最后一个字符串始终存在(不会缺失)
根据上述声明
第1行:所有字符串都可以按分隔符拆分出来,分隔符的数量为5
第2行:缺少分隔符2,3,4,因此值为空,分隔符的数量为2
第3行:缺少分隔符3,因此值为空,分隔符的数量为4
第4行:缺少分隔符2,3,因此值为空,分隔符的数量为3

我试过下面的代码

# Split each row on "_" a single time; every derived column reuses the pieces.
pieces <- strsplit(as.character(df$country_city_gender_age_name_state), "_")

# Purely positional extraction: a row with fewer pieces than the requested
# position gets NA in that column.
df$country <- sapply(pieces, "[", 1)
df$city <- sapply(pieces, "[", 2)
df$gender <- sapply(pieces, "[", 3)
df$age <- sapply(pieces, "[", 4)
df$name <- sapply(pieces, "[", 5)

# The state is taken as the last piece of each row.
df$state <- sapply(pieces, tail, 1)

提前感谢

您可以尝试类似的方法,尽管这可能很难概括。如果国家、城市和州从未丢失,性别始终为“女性”或“男性”,并且字符串中的唯一数字与所需的“年龄”变量相关,则该方法有效。一些行当然可以在必要的情况下进行调整,而且,也许其中一些行可以稍微缩短。不管怎样,这有帮助吗

# Build the data frame from the raw "_"-separated strings, keeping them as
# character (not factor) so they can be split directly.
df <- data.frame(country_city_gender_age_name_state, stringsAsFactors = FALSE)

# Split every row once and reuse the tokens for all derived columns.
tokens <- strsplit(df$country_city_gender_age_name_state, "_")

# Return the i-th token of `p`, or NA when the row has too few tokens.
pick_token <- function(p, i) {
  if (i >= 1 && i <= length(p)) p[i] else NA_character_
}

# Return the first token of `p` for which `is_match` is TRUE, or NA if none.
first_match <- function(p, is_match) {
  hit <- p[is_match]
  if (length(hit) > 0) hit[1] else NA_character_
}

# Country, city and state are assumed to be always present: they are the
# first, second and last token of each row.
df$country <- vapply(tokens, pick_token, character(1), i = 1)
df$city <- vapply(tokens, pick_token, character(1), i = 2)
df$state <- vapply(tokens, function(p) pick_token(p, length(p)), character(1))

# Gender is matched against whole tokens, so a name that merely contains
# "Male" (e.g. "Malek") is not mistaken for a gender.
df$gender <- vapply(
  tokens,
  function(p) first_match(p, p %in% c("Female", "Male")),
  character(1)
)

# Age is the first token made up of digits only; digits embedded in other
# tokens are ignored instead of being concatenated into one number.
age_token <- vapply(
  tokens,
  function(p) first_match(p, grepl("^[0-9]+$", p)),
  character(1)
)
df$age <- as.numeric(age_token)

# The name candidate is the second-to-last token. It is discarded (NA) when
# that token has already been assigned to one of the other fields of the row.
df$name <- vapply(seq_along(tokens), function(i) {
  p <- tokens[[i]]
  candidate <- pick_token(p, length(p) - 1)
  taken <- c(df$country[i], df$city[i], df$state[i], df$gender[i], age_token[i])
  if (is.na(candidate) || candidate %in% taken) NA_character_ else candidate
}, character(1))

> df[,c(1:3,5:7,4)]
  country_city_gender_age_name_state country       city gender age  name state
1         US_Dallas_Male_23_hanes_TX      US     Dallas   Male  23 hanes    TX
2                   US_LosAngeles_CA      US LosAngeles   <NA>  NA  <NA>    CA
3         US_Atlanta_Female_jenny_GA      US    Atlanta Female  NA jenny    GA
4                 US_Orlando_kane_FL      US    Orlando   <NA>  NA  kane    FL
df = data.frame(country_city_gender_age_name_state, stringsAsFactors = F)

我给不出一个优雅的解决方案,但下面这个方案应该可行。州、国家和城市的列表总是可以找到的,因此只需用 ifelse 条件判断某个值是否出现在其中某个列表中。对于州名,可以用匹配 2 个字母的正则表达式,也可以使用列表。既不在任何列表中、也没有被正则表达式匹配到的值,就一定是 name

library(dplyr)
dt <- as.data.table(df)

dt$number_of_entry <- rep(1:nrow(dt))
new_dt <- dt %>% mutate(country_city_gender_age_name_state = 
library(dplyr)

这很难做到。如何告诉计算机LosAngeles不是一个像“Jenny”这样的名字。可能的起始位置:
library(tidyverse); df$country_city_gender_age_name_state %>% map(~ str_split(.x, "_") %>% unlist %>% trimws)
谢谢 Katin,以上数据只是示例数据,我们不能把性别列看作固定取值,它也可能是邮政编码之类的列。
那就使用正则表达式来筛选它。
# Build the data frame from the raw "_"-separated strings, keeping them as
# character (not factor) so they can be split directly.
df <- data.frame(country_city_gender_age_name_state, stringsAsFactors = FALSE)

# Split every row once and reuse the tokens for all derived columns.
tokens <- strsplit(df$country_city_gender_age_name_state, "_")

# Return the i-th token of `p`, or NA when the row has too few tokens.
pick_token <- function(p, i) {
  if (i >= 1 && i <= length(p)) p[i] else NA_character_
}

# Return the first token of `p` for which `is_match` is TRUE, or NA if none.
first_match <- function(p, is_match) {
  hit <- p[is_match]
  if (length(hit) > 0) hit[1] else NA_character_
}

# Country, city and state are assumed to be always present: they are the
# first, second and last token of each row.
df$country <- vapply(tokens, pick_token, character(1), i = 1)
df$city <- vapply(tokens, pick_token, character(1), i = 2)
df$state <- vapply(tokens, function(p) pick_token(p, length(p)), character(1))

# Gender is matched against whole tokens, so a name that merely contains
# "Male" (e.g. "Malek") is not mistaken for a gender.
df$gender <- vapply(
  tokens,
  function(p) first_match(p, p %in% c("Female", "Male")),
  character(1)
)

# Age is the first token made up of digits only; digits embedded in other
# tokens are ignored instead of being concatenated into one number.
age_token <- vapply(
  tokens,
  function(p) first_match(p, grepl("^[0-9]+$", p)),
  character(1)
)
df$age <- as.numeric(age_token)

# The name candidate is the second-to-last token. It is discarded (NA) when
# that token has already been assigned to one of the other fields of the row.
df$name <- vapply(seq_along(tokens), function(i) {
  p <- tokens[[i]]
  candidate <- pick_token(p, length(p) - 1)
  taken <- c(df$country[i], df$city[i], df$state[i], df$gender[i], age_token[i])
  if (is.na(candidate) || candidate %in% taken) NA_character_ else candidate
}, character(1))

> df[,c(1:3,5:7,4)]
  country_city_gender_age_name_state country       city gender age  name state
1         US_Dallas_Male_23_hanes_TX      US     Dallas   Male  23 hanes    TX
2                   US_LosAngeles_CA      US LosAngeles   <NA>  NA  <NA>    CA
3         US_Atlanta_Female_jenny_GA      US    Atlanta Female  NA jenny    GA
4                 US_Orlando_kane_FL      US    Orlando   <NA>  NA  kane    FL
library(dplyr)  # retained from the original snippet; the code below is base R

# Classify every "_"-separated token of each input string as country, gender,
# city, age, state or name, using lookup lists for the ambiguous fields.
#
# Expects these objects to exist already:
#   df                - data frame with a column
#                       country_city_gender_age_name_state
#   list_of_countries - character vector of known countries
#   list_of_cities    - character vector of known cities
#   list_of_states    - character vector of known states
# Produces `output`, with one row per row of `df`; a field for which no token
# was found stays NA.

# A plain data frame is sufficient here; as.data.table() would require the
# data.table package, which this snippet never loads.
dt <- df
dt$number_of_entry <- seq_len(nrow(dt))

# Long format: one row per token, tagged with the row of `dt` it came from.
tokens <- strsplit(as.character(dt$country_city_gender_age_name_state), "_")
new_dt <- data.frame(
  number_of_entry = rep(dt$number_of_entry, lengths(tokens)),
  country_city_gender_age_name_state = as.character(unlist(tokens)),
  stringsAsFactors = FALSE
)

n_entries <- nrow(dt)
output <- data.frame(
  id = seq_len(n_entries),
  country = rep(NA_character_, n_entries),
  city = rep(NA_character_, n_entries),
  gender = rep(NA_character_, n_entries),
  age = rep(NA_character_, n_entries),
  name = rep(NA_character_, n_entries),
  state = rep(NA_character_, n_entries),
  stringsAsFactors = FALSE
)

for (i in seq_len(nrow(new_dt))) {
  token <- new_dt$country_city_gender_age_name_state[i]
  entry <- new_dt$number_of_entry[i]

  # The checks run in a fixed order and the first match wins; a token that
  # matches nothing is treated as the name.
  field <- if (token %in% list_of_countries) {
    "country"
  } else if (token %in% c("Female", "Male")) {
    "gender"
  } else if (token %in% list_of_cities) {
    "city"
  } else if (grepl("^\\d+$", token)) {
    # Anchored pattern: only an all-digit token counts as an age. The
    # unanchored "\\d*" matches every string, including the empty one.
    "age"
  } else if (token %in% list_of_states) {
    "state"
  } else {
    "name"
  }

  output[entry, field] <- token
}