Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/jquery-ui/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Winnow data.frame到包含列的原则文本?_R - Fatal编程技术网

Winnow data.frame到包含列的原则文本?

Winnow data.frame到包含列的原则文本?,r,R,我想找出一种合理的方法,从data.frames中检测并选择包含数据列的“原则”文本 这些列包含开放式调查响应,因此它们具有主要由字母字符组成的异构字符串。 理想情况下,这种方法可以 删除所有因子、数字、日期和逻辑列 删除填充稀疏的文本列 删除具有少数唯一元素的文本列 能够处理非标准字符 以下是我想要实现的原始示例: 原始输入数据 v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 1 Na Gu Rx L

我想找出一种合理的方法,从data.frames中检测并选择包含数据列的“原则”文本

这些列包含开放式调查响应,因此它们具有主要由字母字符组成的异构字符串。 理想情况下,这种方法可以

  • 删除所有因子、数字、日期和逻辑列
  • 删除填充稀疏的文本列
  • 删除具有少数唯一元素的文本列
  • 能够处理非标准字符
  • 以下是我想要实现的原始示例:

    原始输入数据

       v1   v2   v3   v4   v5 v6 v7 v8     v9 v10 v11 v12 v13   v14
    1  Na   Gu   Rx   Ll bird  a  a  1 88,626   1   1   ç   a  TRUE
    2  Ue   Ho   Iy <NA> bird  b  b  2 48,666   2   2   é   b FALSE
    3  Vk   Lv <NA> <NA> bird  a  c  3 12,559   3   1   ë   ç  TRUE
    4  Pd   Hk <NA> <NA> bird  b  d  4  3,794   4   2   õ   d FALSE
    5  Ay   Nd <NA> <NA> <NA>  a  e  5 75,239   5   1   ï   é  TRUE
    6  Xj <NA> <NA> <NA> <NA>  b  a  6 44,559   6   2   í   f FALSE
    7  Zn <NA> <NA> <NA> <NA>  a  b  7 21,100   7   1   ð   g  TRUE
    8  Mw <NA> <NA> <NA> <NA>  b  c  8  7,790   8   2   ø   h FALSE
    9  Yx <NA> <NA> <NA> <NA>  a  d  9 84,470   9   1   ö   i  TRUE
    10 Oj <NA> <NA> <NA> <NA>  b  e 10 45,724  10   2   ò   j FALSE
    
     k1 k2  k3  d1  d2      d3     d4   d5  d6  d7  d8      d9
    Ze  E,w h,Y c   bird   12,36    b   38  38  2   FALSE   18/03/2020
    Gr  Y,y w,J d   NA     88,510   b   54  54  2   FALSE   3/04/2020
    Ze  J,x w,G e   bird   26,932   b   30  30  2   FALSE   10/03/2020
    Nt  V,u a,A d   bird    8,660   a   19  19  1   TRUE    28/02/2020
    Bn  W,l z,O c   bird   19,684   a   3   3   1   TRUE    12/02/2020
    Km  L,c h,d a   bird    8,649   b   16  16  2   FALSE   25/02/2020
    Lx  N,e s,H d   NA     92,838   b   84  84  2   FALSE   3/05/2020
    Vv  R,s m,b e   bird   58,793   b   40  40  2   FALSE   20/03/2020
    El  A,h i,E d   bird   61,589   b   44  44  2   FALSE   24/03/2020
    Az  B,b n,é c   NA     45,11    b   58  58  2   FALSE   7/04/2020
    Tq  J,y w,N b   NA     81,288   b   82  82  2   FALSE   1/05/2020
    Lg  Z,l h,I c   NA     17,418   b   88  88  2   FALSE   7/05/2020
    Oh  F,b e,Q a   NA     28,887   a   71  71  1   TRUE    20/04/2020
    Rj  I,f ç,F b   NA     59,213   a   97  97  1   TRUE    16/05/2020
    Pw  X,u n,Z b   bird   51,622   b   42  42  2   FALSE   22/03/2020
    Dv  A,d l,X c   bird   29,148   b   18  18  2   FALSE   27/02/2020
    Hp  C,o w,Z d   bird   78,737   b   24  24  2   FALSE   4/03/2020
    Br  D,i b,a e   NA     35,231   a   65  65  1   TRUE    14/04/2020
    Re  X,g b,O b   NA     18,244   b   92  92  2   FALSE   11/05/2020
    
    校正的期望输出

    k1  k2  k3  
    Ze  E,w h,Y 
    Gr  Y,y w,J 
    Ze  J,x w,G 
    Nt  V,u a,A 
    Bn  W,l z,O 
    Km  L,c h,d 
    Lx  N,e s,H 
    Vv  R,s m,b 
    El  A,h i,E 
    Az  B,b n,é 
    Tq  J,y w,N 
    Lg  Z,l h,I 
    Oh  F,b e,Q 
    Rj  I,f ç,F 
    Pw  X,u n,Z 
    Dv  A,d l,X 
    Hp  C,o w,Z 
    Br  D,i b,a 
    Re  X,g b,O
    
    以下是补充输入数据的代码:

    # made-up data
    df <- data.frame(stringsAsFactors = F,
      v1 = paste0(sample(LETTERS, 10, replace = T), sample(letters, 10, replace = T)),
      v2 = c(paste0(sample(LETTERS, 5, replace = T), sample(letters, 5, replace = T)), rep(NA, 5)),
      v3 = c(paste0(sample(LETTERS, 2, replace = T), sample(letters, 2, replace = T)), rep(NA, 8)),
      v4 = c(paste0(sample(LETTERS, 1, replace = T), sample(letters, 1, replace = T)), rep(NA, 9)),
      v5 = c(rep("bird", 4), rep(NA, 6)),
      v6 = factor(rep(c("a", "b"), 5)),
      v7 = rep(c("a", "b", "c", "d", "e"),2),
      v8 = 1:10,
      v9 = paste0(sample(1:99, 10, replace =T), ",", sample(1:999, 10, replace =T)),
      v10 = as.character(1:10),
      v11 = factor(rep(c(1, 2), 5)),
      v12 = c('ç','é','ë','õ','ï','í','ð','ø','ö','ò'),
      v13 = c('a','b', 'ç','d',' é',letters[6:10]),
      v14 = as.logical(rep(c("TRUE", "FALSE"), 5)))
    
    set.seed(8)
    
    df <- data.frame(stringsAsFactors = F, 
      k1 = paste0(sample(LETTERS, 100, replace = T), sample(letters, 100, replace = T)),
      k2 = paste0(sample(LETTERS, 100, replace = T), ",", sample(letters, 100, replace = T)),
      k3 = paste0(sample(c('a','b', 'ç','d',' é',letters), 100, replace = T), ",",
        sample(c('a','b', 'ç','d',' é', LETTERS), 100, replace = T)),
      d1 = rep(c("a", "b", "c", "d", "e"),20),
      d2 = c(rep("bird", 51), rep(NA, 49)),
      d3 = paste0(sample(1:99, 100, replace =T), ",", sample(1:999, 100, replace =T)),
      d4 = factor(rep(c("a", "b"), 50)),
      d5 = 1:100,
      d6 = as.character(1:100),
      d7 = factor(rep(c(1, 2), 10)),
      d8 = as.logical(rep(c("TRUE", "FALSE"), 50)),
      d9 = seq(from = as.Date("2020-02-10"), to = as.Date("2020-02-10") + 99, by = 'day'))
    
    df <- df[sample(nrow(df)),]
    
    set.seed(8)
    
    df这可能会在您的真实数据中爆炸,但我认为它可以修补

    library(tidyverse)
    step_one <- df %>% 
      # change the commas for dots (may explode if you actually have commas)
      mutate_if(.predicate = function(x) is.character(x),
                .funs = function(x) stringr::str_replace_all(x, ",", ".")) %>%
      select_if(.predicate =  function(x) is.character(x) && is.na(as.numeric(x)) == TRUE && is.logical(x) == FALSE) %>% 
      mutate_all(.funs = function(x) stringi::stri_trans_general(x, "Latin-ASCII"))
    
    
    step_one %>%
      summarise_each(funs = function(x) sum(is.na(x))) %>%
      reshape2::melt() %>%
      mutate(variable = as.character(variable),
             total_cases = nrow(df),
             frac = value/total_cases,
    # --->>> arbitrary 0.5 threshold <<<<----
             sparse = ifelse(frac>0.5, "remove", "keep")) %>%
      filter(sparse == "keep") %>%
      pull(variable) -> variables_to_keep
    
    df %>% select(variables_to_keep)
    
    这样你就能得到你想要的

    step_two %>% select(remove_high_repetition_variables(.)
    

    请注意,“高度重复(如v5)”也很空……我没有寻找重复变量,这可能会在真实的数据集中失败。不过,您的答案会有所帮助。如果时间允许,我将发布一个更可靠的样本数据,并继续修补。我已经更新了我的问题。如果回答者愿意,我们可以回滚?删除d1和d2的标准是什么?
    c(“鸟”、“鸟”、“鸟”、“鸟”、NA、NA、NA、“猫”、“猫”、“鸟”)
    c是否会计数?它们有一个阈值问题,因此很难给出准确的解释。但我粗略地说,如果一个100个元素的向量只有20个或更少的唯一值,那么这些值更有可能是“标签”,而不是实际的开放式文本。因此,根据这个标准,如果你的例子以相同的比例重复到100个案例,它也会被删除
    set.seed(8)
    
    df <- data.frame(stringsAsFactors = F, 
      k1 = paste0(sample(LETTERS, 100, replace = T), sample(letters, 100, replace = T)),
      k2 = paste0(sample(LETTERS, 100, replace = T), ",", sample(letters, 100, replace = T)),
      k3 = paste0(sample(c('a','b', 'ç','d',' é',letters), 100, replace = T), ",",
        sample(c('a','b', 'ç','d',' é', LETTERS), 100, replace = T)),
      d1 = rep(c("a", "b", "c", "d", "e"),20),
      d2 = c(rep("bird", 51), rep(NA, 49)),
      d3 = paste0(sample(1:99, 100, replace =T), ",", sample(1:999, 100, replace =T)),
      d4 = factor(rep(c("a", "b"), 50)),
      d5 = 1:100,
      d6 = as.character(1:100),
      d7 = factor(rep(c(1, 2), 10)),
      d8 = as.logical(rep(c("TRUE", "FALSE"), 50)),
      d9 = seq(from = as.Date("2020-02-10"), to = as.Date("2020-02-10") + 99, by = 'day'))
    
    df <- df[sample(nrow(df)),]
    
    library(tidyverse)
    step_one <- df %>% 
      # change the commas for dots (may explode if you actually have commas)
      mutate_if(.predicate = function(x) is.character(x),
                .funs = function(x) stringr::str_replace_all(x, ",", ".")) %>%
      select_if(.predicate =  function(x) is.character(x) && is.na(as.numeric(x)) == TRUE && is.logical(x) == FALSE) %>% 
      mutate_all(.funs = function(x) stringi::stri_trans_general(x, "Latin-ASCII"))
    
    
    step_one %>%
      summarise_each(funs = function(x) sum(is.na(x))) %>%
      reshape2::melt() %>%
      mutate(variable = as.character(variable),
             total_cases = nrow(df),
             frac = value/total_cases,
    # --->>> arbitrary 0.5 threshold <<<<----
             sparse = ifelse(frac>0.5, "remove", "keep")) %>%
      filter(sparse == "keep") %>%
      pull(variable) -> variables_to_keep
    
    df %>% select(variables_to_keep)
    
       v1   v2 v7 v12 v13
    1  Bq   Um  a   ç   a
    2  Tb   Aq  b   é   b
    3  Wv   Cf  c   ë   ç
    4  Mf   Sl  d   õ   d
    5  Ou   Ah  e   ï   é
    6  Ag <NA>  a   í   f
    7  Rl <NA>  b   ð   g
    8  Mw <NA>  c   ø   h
    9  Kj <NA>  d   ö   i
    10 Bd <NA>  e   ò   j
    
    step_one %>% select(variables_to_keep)
       v1   v2 v7 v12 v13
    1  Bq   Um  a   c   a
    2  Tb   Aq  b   e   b
    3  Wv   Cf  c   e   c
    4  Mf   Sl  d   o   d
    5  Ou   Ah  e   i   e
    6  Ag <NA>  a   i   f
    7  Rl <NA>  b   d   g
    8  Mw <NA>  c   o   h
    9  Kj <NA>  d   o   i
    10 Bd <NA>  e   o   j
    
    remove_high_repetition_variables <- function(df){
      tallies <- df %>%
        purrr::map(function(tt) as.data.frame(table(tt))) %>%
        purrr::map(function(tt) mutate(tt,
                                      unique_values = length(tt),
                                      total = sum(Freq),
                                      highly_rep = total/unique_values,
                                      representation = Freq/total,
       # Your thresholds here
       # 100-element vector had only 20 unique values or less
                                      flag = ifelse(highly_rep > 100/20, "remove", "keep"))) %>%
      purrr::map(function (tt) any(pull(tt, flag) == "remove"))
    
     if (any(tallies == TRUE)){
       return(names(tallies[tallies == FALSE]))  
     } else {
       return(names(tallies))
     }
    
    }
    
    
    remove_high_repetition_variables(step_two)
    [1] "k1" "k2" "k3"
    remove_high_repetition_variables(mtcars)
    [1] "mpg"  "disp" "hp"   "drat" "wt"   "qsec"
    
    step_two %>% select(remove_high_repetition_variables(.)