Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/362.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
通过部分匹配r中的另一个数据帧来子集一个数据帧(对python/pandas解决方案开放)_Python_R_Pandas_Dataframe_Subset - Fatal编程技术网

通过部分匹配r中的另一个数据帧来子集一个数据帧(对python/pandas解决方案开放)

通过部分匹配r中的另一个数据帧来子集一个数据帧(对python/pandas解决方案开放),python,r,pandas,dataframe,subset,Python,R,Pandas,Dataframe,Subset,基本问题描述: df <- data.frame(A = c("a1", "a1", "a2", NA, "a1", "a1"), B = c(NA,"b1", "b1", "b2", "b1",NA), C = c(NA,NA,NA,NA,"c1","c1"), D = c(NA,NA,NA,NA,"d1","d1"), stringsAsFactors = FALSE) # c

基本问题描述:

# Example data frame: six rows over four character columns; NA marks a cell
# without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(rep(NA, 4), "c1", "c1"),
  D = c(rep(NA, 4), "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template to match against. Column D is not strictly necessary; it
# was added to get a data frame when applying is.na() further down.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)


     A    B    C    D
1   a1 <NA> <NA> <NA>
2   a1   b1 <NA> <NA>
3   a2   b1 <NA> <NA>
4 <NA>   b2 <NA> <NA>
5   a1   b1   c1   d1
6   a1 <NA>   c1   d1

> df_match
   A  B  C  D
1 a1 b1 NA NA
# Example data frame: NA marks a cell without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(NA, NA, NA, NA, "c1", "c1"),
  D = c(NA, NA, NA, NA, "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template. Column D is not strictly necessary; it was added so that
# more than one template column is NA.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)

library(dplyr)

# The template must be a single row; vapply() below also enforces this.
stopifnot(nrow(df_match) == 1L)

# Split the template columns into those that carry a value (matchable) and
# those that are NA (not matchable).
template_is_na <- vapply(df_match, is.na, logical(1))
matchable <- names(df_match)[!template_is_na]
not_matchable <- names(df_match)[template_is_na]

# Condition 2: a row may only be kept when ALL of its not-matchable columns
# are NA (the previous any() kept rows where just one of them was NA).
# Counting non-NA cells per row also works when there are zero such columns.
bol_no_matchable <- unname(rowSums(!is.na(df[not_matchable])) == 0)

# Condition 1: TRUE for rows that contradict the template, i.e. some
# matchable column holds a non-NA value different from the template value.
# NA cells never count as a contradiction. Working column by column keeps
# this vectorised and valid for one matchable column or a zero-row df.
bol_matchable <- Reduce(
  `|`,
  lapply(
    matchable,
    function(col) !is.na(df[[col]]) & df[[col]] != df_match[[col]]
  ),
  rep(FALSE, nrow(df))
)

# Keep rows that do not contradict the template and have no extra values.
df <- df %>%
  filter(!bol_matchable & bol_no_matchable)
df
设为数据帧,将
df_match 设为一行数据帧

我想对
df
进行子集划分,以便只保留非NA值包含在
df_match
的非NA值中的行

一个简单的例子:

# Example data frame: six rows over four character columns; NA marks a cell
# without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(rep(NA, 4), "c1", "c1"),
  D = c(rep(NA, 4), "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template to match against. Column D is not strictly necessary; it
# was added to get a data frame when applying is.na() further down.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)


     A    B    C    D
1   a1 <NA> <NA> <NA>
2   a1   b1 <NA> <NA>
3   a2   b1 <NA> <NA>
4 <NA>   b2 <NA> <NA>
5   a1   b1   c1   d1
6   a1 <NA>   c1   d1

> df_match
   A  B  C  D
1 a1 b1 NA NA
# Example data frame: NA marks a cell without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(NA, NA, NA, NA, "c1", "c1"),
  D = c(NA, NA, NA, NA, "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template. Column D is not strictly necessary; it was added so that
# more than one template column is NA.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)

library(dplyr)

# The template must be a single row; vapply() below also enforces this.
stopifnot(nrow(df_match) == 1L)

# Split the template columns into those that carry a value (matchable) and
# those that are NA (not matchable).
template_is_na <- vapply(df_match, is.na, logical(1))
matchable <- names(df_match)[!template_is_na]
not_matchable <- names(df_match)[template_is_na]

# Condition 2: a row may only be kept when ALL of its not-matchable columns
# are NA (the previous any() kept rows where just one of them was NA).
# Counting non-NA cells per row also works when there are zero such columns.
bol_no_matchable <- unname(rowSums(!is.na(df[not_matchable])) == 0)

# Condition 1: TRUE for rows that contradict the template, i.e. some
# matchable column holds a non-NA value different from the template value.
# NA cells never count as a contradiction. Working column by column keeps
# this vectorised and valid for one matchable column or a zero-row df.
bol_matchable <- Reduce(
  `|`,
  lapply(
    matchable,
    function(col) !is.na(df[[col]]) & df[[col]] != df_match[[col]]
  ),
  rep(FALSE, nrow(df))
)

# Keep rows that do not contradict the template and have no extra values.
df <- df %>%
  filter(!bol_matchable & bol_no_matchable)
第3行和第4行在A列或B列中有一个错误条目

第5行和第6行中包含的值所在的列在
df_match
中不受支持(即在df_match中值为NA的列)

问题:

# Example data frame: six rows over four character columns; NA marks a cell
# without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(rep(NA, 4), "c1", "c1"),
  D = c(rep(NA, 4), "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template to match against. Column D is not strictly necessary; it
# was added to get a data frame when applying is.na() further down.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)


     A    B    C    D
1   a1 <NA> <NA> <NA>
2   a1   b1 <NA> <NA>
3   a2   b1 <NA> <NA>
4 <NA>   b2 <NA> <NA>
5   a1   b1   c1   d1
6   a1 <NA>   c1   d1

> df_match
   A  B  C  D
1 a1 b1 NA NA
# Example data frame: NA marks a cell without a value.
df <- data.frame(
  A = c("a1", "a1", "a2", NA, "a1", "a1"),
  B = c(NA, "b1", "b1", "b2", "b1", NA),
  C = c(NA, NA, NA, NA, "c1", "c1"),
  D = c(NA, NA, NA, NA, "d1", "d1"),
  stringsAsFactors = FALSE
)

# One-row template. Column D is not strictly necessary; it was added so that
# more than one template column is NA.
df_match <- data.frame(
  A = "a1",
  B = "b1",
  C = NA,
  D = NA,
  stringsAsFactors = FALSE
)

library(dplyr)

# The template must be a single row; vapply() below also enforces this.
stopifnot(nrow(df_match) == 1L)

# Split the template columns into those that carry a value (matchable) and
# those that are NA (not matchable).
template_is_na <- vapply(df_match, is.na, logical(1))
matchable <- names(df_match)[!template_is_na]
not_matchable <- names(df_match)[template_is_na]

# Condition 2: a row may only be kept when ALL of its not-matchable columns
# are NA (the previous any() kept rows where just one of them was NA).
# Counting non-NA cells per row also works when there are zero such columns.
bol_no_matchable <- unname(rowSums(!is.na(df[not_matchable])) == 0)

# Condition 1: TRUE for rows that contradict the template, i.e. some
# matchable column holds a non-NA value different from the template value.
# NA cells never count as a contradiction. Working column by column keeps
# this vectorised and valid for one matchable column or a zero-row df.
bol_matchable <- Reduce(
  `|`,
  lapply(
    matchable,
    function(col) !is.na(df[[col]]) & df[[col]] != df_match[[col]]
  ),
  rep(FALSE, nrow(df))
)

# Keep rows that do not contradict the template and have no extra values.
df <- df %>%
  filter(!bol_matchable & bol_no_matchable)
  • 我可以遵循哪些一般原则来提高子集问题的性能
  • 如何提高上述代码的性能
  • 关于我的实际问题,我如何改进下面代码的性能
问题: 在应用程序中,数据框
df
有一列
X
,其中
df
的值不受
df_match
的支持。(见下文)

应用基本最小示例中的逻辑,我当前的解决方案如下:

# Extended example: columns A-D as before plus a column X whose values
# ("C" / "D") coincide with names of other columns in this data frame
# (presumably X names a column per row -- confirm against the real data).
df <- data.frame(A = c("a1", "a1", "a2", NA, "a1", "a1"), 
                 B = c(NA,"b1", "b1", "b2", "b1",NA), 
                 C = c("c2",NA,"c1",NA,"c1","c1"),
                 D = c(NA,"d2","d2","d2","d1","d1"),
                 X = c("C","D","C","D","D","C"),
                 stringsAsFactors = FALSE)

# Row-by-row filter: evaluates the function once per row index of df and
# collects the per-row results in `bol`.
# Relies on `matchable` and `df_match`, which are not defined in this snippet
# and must already exist in the session, and on dplyr being attached
# (pull, select, one_of, %>%).
bol <- sapply(1:nrow(df), function(x)
{
  # value of column X for the current row
  X <- pull(df[x,], "X")
  # NOTE(review): this removes the value of X from `matchable`; if
  # `matchable` never contains that value the call is a no-op -- confirm
  # that this is the intended column set.
  not_matchable <- setdiff(matchable, X)
  # condition 1: TRUE when every `not_matchable` column of this row is NA
  bol_no_matchable <- df[x,] %>%
    select(one_of(not_matchable)) %>%
    is.na() %>%
    all()

  # condition 2: TRUE when any `not_matchable` column of this row differs
  # from df_match; comparisons that yield NA are treated as "no difference"
  bol_matchable <- {df[x,not_matchable] != df_match[,not_matchable]} 
  bol_matchable[is.na(bol_matchable)] <- FALSE
  bol_matchable <- any(bol_matchable)

  # combine both conditions; the value of this assignment is what the
  # anonymous function returns for the row
  bol <- !bol_matchable & bol_no_matchable
})

df您可以
df
df_match
的列上映射
,如果
df
的对应元素为
NA
或等于
df_match
的元素,则为每个列对返回一个元素为
TRUE
的向量。然后选择
TRUE
s(由
rowSums
生成)的行数等于列数的行(即,所有列要么匹配,要么不匹配)

注:如果
df_match
值为
NA
,而
df
值为非
NA
,则
Map
输出的相应向量元素将为
NA
,这由
rowSums
的 na.rm = TRUE 处理

# For each column pair (df column, df_match column, paired by position),
# flag the rows of df that are compatible with the template: the df cell is
# NA, or it equals the template value. `%in%` never returns NA, so a non-NA
# df cell against an NA template value is simply FALSE.
# SIMPLIFY = FALSE keeps one logical vector per column, so the matrix below
# has the right shape even when df has a single row (the default
# simplification would collapse that case to a plain vector and break
# rowSums()).
col_ok <- mapply(
  function(x, y) is.na(x) | x %in% y,
  df, df_match,
  SIMPLIFY = FALSE
)

# Number of compatible columns per row of df.
row_matches <- rowSums(
  matrix(
    as.logical(unlist(col_ok, use.names = FALSE)),
    nrow = nrow(df),
    ncol = length(col_ok)
  )
)

# Keep the rows in which every column is compatible with the template.
df[row_matches == ncol(df), ]
#    A    B    C    D
# 1 a1 <NA> <NA> <NA>
# 2 a1   b1 <NA> <NA>

row_matches据我所知,这是一个
R
代码。为什么要标记
python
pandas
?如果您对python/pandas解决方案持开放态度,您应该在问题中这样说。谢谢您的评论!我改了标题。python/pandas是这个论坛的一个建议,由于缺乏经验,我毫无疑问地补充了这个建议。