R 如何重新排列两个数据帧之间的匹配顺序_R_List_Dataframe

R 如何重新排列两个数据帧之间的匹配顺序

r list dataframe

R 如何重新排列两个数据帧之间的匹配顺序,r,list,dataframe,R,List,Dataframe,从昨晚开始，我就一直忙于这个问题，我不知道该怎么做我要做的是将df1字符串与df2字符串匹配，并将相似的字符串取出我是这样做的 # a function to arrange the data to have IDs for each string normalize <- function(x, delim) { x <- gsub(")", "", x, fixed=TRUE) x <- gsub("(", "", x, fixed=T

从昨晚开始，我就一直忙于这个问题，我不知道该怎么做

我要做的是将df1字符串与df2字符串匹配，并将相似的字符串取出

我是这样做的

# a function to arrange the data to have IDs for each string 
    normalize <- function(x, delim) {
      x <- gsub(")", "", x, fixed=TRUE)
      x <- gsub("(", "", x, fixed=TRUE)
      idx <- rep(seq_len(length(x)), times=nchar(gsub(sprintf("[^%s]",delim), "", as.character(x)))+1)
      names <- unlist(strsplit(as.character(x), delim))
      return(setNames(idx, names))
    }

# a function to arrange the second df  
lookup <- normalize(df2[,1], ",")

# a function to match them and give the IDs
process <- function(s) {
  lookup_try <- lookup[names(s)]
  found <- which(!is.na(lookup_try))
  pos <- lookup_try[names(s)[found]]
  return(paste(s[found], pos, sep="-"))
  #change the last line to "return(as.character(pos))" to get only the result as in the comment
}

res <- lapply(colnames(df1), function(x) process(normalize(df1[,x], ";")))

> res
$s1
[1] "3-4" "4-1" "5-4"

$s2
[1] "2-4"  "3-15" "7-16"

第一列ID是与df1中的字符串匹配的df2的行号第二列No是匹配的次数第三列ID-col-n是df1中与该字符串+其列名匹配的字符串的行号第四个是df1第一列中与该字符串匹配的字符串第五列是与该字符串匹配的第二列的字符串

依此类推

在本例中，我发现在将数据合并到查找表之前，将数据切换为宽格式更容易

你可以试试：

library(tidyr)
library(dplyr)
df1_tmp <- df1
df2_tmp <- df2
#add numerical id to df1_tmp to keep row information
df1_tmp$id <- seq_along(df1_tmp[,1])

#switch to wide and unnest rows with several strings
df1_tmp <- gather(df1_tmp,key="s_val",value="query_string",-id)
df1_tmp <- df1_tmp %>% 
        mutate(query_string = strsplit(as.character(query_string), ";")) %>% 
        unnest(query_string)


df2_tmp$IDs. <- gsub("[()]", "", df2_tmp$IDs.)

#add numerical id to df1_tmp to keep row information
df2_tmp$id <- seq_along(df2_tmp$IDs.)

#unnest rows with several strings
df2_tmp <- df2_tmp %>% 
        mutate(IDs. = strsplit(as.character(IDs.), ",")) %>% 
        unnest(IDs.)

res <- merge(df1_tmp,df2_tmp,by.x="query_string",by.y="IDs.")

res$ID_col_n <- paste(paste0(res$id.x,res$s_val))
res$total_id <- 1:nrow(res)
res <- spread(res,s_val,value=query_string,fill=NA)
res
#summarize to get required output 

res <- res %>% group_by(id.y) %>%
        mutate(No=n())  %>% group_by(id.y,No) %>%
        summarise_each(funs(paste(.[!is.na(.)],collapse=","))) %>% 
        select(-id.x,-total_id)

colnames(res)[colnames(res)=="id.y"]<-"IDs"

res$df1_colMatch_counts <- rowSums(res[,-(1:3)]!="")
df2_counts <- df2_tmp %>% group_by(id) %>% summarize(df2_string_counts=n())
res <- merge(res,df2_counts,by.x="IDs",by.y="id")
res


res

  IDs No    ID_col_n            s1     s2 df1_colMatch_counts df2_string_counts
1   1  1         4s1        P41182                          1                 2
2   2  1         4s1        P41182                          1                 2
3   3  1         4s1        P41182                          1                 2
4   4  3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831                   2                 4
5  15  1         3s2               P54612                   1                 5
6  16  1         7s2               O15143                   1                 7

library（tidyr）
图书馆（dplyr）
df1_tmp对于这个解决方案，我使用了一个通用的helper函数，旨在绕过rbind（）
的限制，即它不能处理不一致的列名。您可能可以使用中的任何函数，但我也编写了自己的函数：
rbind.cn <- function(...,filler=NA) {
    ## note: must explicitly set proper S3 class; otherwise, filler would corrupt the column type in cases where a column is missing from the first df
    ## for example, logical NA (the default for filler) would nix factors, resulting in character after type promotion
    ## to do this, will use single-row temp df as first argument to rbind()
    ## note: tried zero-row to prevent need to excise afterward, but zero-row rbind() arguments are ignored for typing purposes
    l <- list(...);
    schema <- do.call(cbind,unname(lapply(l,function(df) df[1L,,drop=F]))); ## unname() is necessary, otherwise cbind() tries to be a good citizen and concats first df cell value found in lapply() names onto schema column names
    schema <- schema[unique(names(schema))];
    res <- do.call(rbind,c(list(schema),lapply(l,function(df) {
        cns.add <- names(schema)[!names(schema)%in%names(df)];
        do.call(cbind,c(
            list(df),
            setNames(rep(filler,length(cns.add)),cns.add),
            stringsAsFactors=F
        ));
    })))[-1L,,drop=F];
    ## fix up row names
    rns <- do.call(c,lapply(l,rownames));
    rownames(res) <- ifelse(grepl('^[0-9]+$',rns),seq_along(rns),rns);
    res;
};

以下是完整的解决方案：
## normalize both data.frames
df1.norm <- normalize(df1,';');
df2.norm <- normalize(df2,',');

## join them on matching ids
df.match <- merge(df1.norm,df2.norm,'id',suffixes=c('.1','.2'));
df.match <- df.match[with(df.match,order(ri.2,cn.1,ri.1)),]; ## order by df2 row index, df1 column name, and finally df1 row index, as per required output

## aggregate and format as required
res <- do.call(rbind.cn,c(by(df.match,df.match$ri.2,function(x) {
    strCols <- aggregate(id~cn.1,x[c('id','cn.1')],paste,collapse=','); ## conveniently, automatically orders by the grouping column cn.1
    do.call(cbind,c(
        list(data.frame(IDs=x$ri.2[1L],No=nrow(x),`ID-col-n`=paste0(x$ri.1,x$cn.1,collapse=','),stringsAsFactors=F,check.names=F)),
        setNames(strCols$id,paste0('string-df1-',strCols$cn.1)),
        stringsAsFactors=F
    ));
}),filler='-'));

## order string-df1 columns
res <- res[c(1:3,order(as.integer(sub('.*?([0-9]+)$','\\1',names(res)[-1:-3])))+3L)];




我想你最近的问题与此有关。这不能解决问题吗？看起来需要一些时间。我现在正忙于一个项目要交付。我真的很想帮助你。但是，我正忙于一个项目。因此，我只回答花费较少时间的问题。在normalize
中，您可以将两个正则表达式函数简化为gsub（“[（）]”），“，”，x）
在您的示例中，P41182
位于df2
的前三行，在df1
的第四行，s1列，那么，你的结果中不应该有一行2 1 4s1 P41182-
和2 1 4s1 P41182-
吗？是的，只要用-id
替换collect
中的s1:s2
就行了，现在工作。注意：这会更改df1
和df2
，因此您可能需要更改变量名称。它似乎会自行删除一些列。能否将第一列添加为每行df2的字符串数？因此，我可以看到它是否正确是的，我添加了No列，列没有被删除，只是dplyr的print函数在有太多列时不显示最后的列。您可以使用View（res）或转换回dataframe作为.data.frame（res）只要看一下ncol（res）
，它应该是730。我得到了这个错误df2.norm，我怀疑在您的实际数据中，df2
中有一个空字符串，这将导致data.frame（id=l[[ri]]，ri=ri，stringsAsFactors=F）
调用使l[[ri]]
为零行，而ri为一行，这将导致引用的错误。我已经添加了一个我相信会解决问题的防护装置。请尝试我的编辑，并让我知道它是如何进行的。错误已经解决，我在一个小数据上尝试了它，但列被覆盖了，因此我无法真正检查算法是否工作良好，例如string-df1-s10是第一个，然后是string-df1-s50，然后是string-df1-s29等等。我在解决方案代码示例的末尾添加了一个额外的订购语句。我很乐意帮助您满足这个额外的要求，但我恐怕现在没有时间继续解决这个问题。此外，这是您所请求的功能的一个复杂而显著的扩展，因此，提出一个新问题并提供涵盖该情况的新样本数据可能会更好。你写新问题时可以在新问题中引用这个问题。
## normalize both data.frames
df1.norm <- normalize(df1,';');
df2.norm <- normalize(df2,',');

## join them on matching ids
df.match <- merge(df1.norm,df2.norm,'id',suffixes=c('.1','.2'));
df.match <- df.match[with(df.match,order(ri.2,cn.1,ri.1)),]; ## order by df2 row index, df1 column name, and finally df1 row index, as per required output

## aggregate and format as required
res <- do.call(rbind.cn,c(by(df.match,df.match$ri.2,function(x) {
    strCols <- aggregate(id~cn.1,x[c('id','cn.1')],paste,collapse=','); ## conveniently, automatically orders by the grouping column cn.1
    do.call(cbind,c(
        list(data.frame(IDs=x$ri.2[1L],No=nrow(x),`ID-col-n`=paste0(x$ri.1,x$cn.1,collapse=','),stringsAsFactors=F,check.names=F)),
        setNames(strCols$id,paste0('string-df1-',strCols$cn.1)),
        stringsAsFactors=F
    ));
}),filler='-'));

## order string-df1 columns
res <- res[c(1:3,order(as.integer(sub('.*?([0-9]+)$','\\1',names(res)[-1:-3])))+3L)];

df1.norm;
##        id ri ci cn
## 1  Q9Y6W5  1  1 s1
## 2  Q9Y6U3  2  1 s1
## 3  Q9Y6Q9  3  1 s1
## 4  P41182  4  1 s1
## 5  Q9HCP0  4  1 s1
## 6  Q09472  5  1 s1
## 7  Q9Y6I3  6  1 s1
## 8  Q9Y6H1  7  1 s1
## 9  Q5T1J5  7  1 s1
## 10 Q16835  1  2 s2
## 11 P61809  2  2 s2
## 12 Q92831  2  2 s2
## 13 P41356  3  2 s2
## 14 P54612  3  2 s2
## 15 A41PH2  3  2 s2
## 16 P3R117  4  2 s2
## 17 P31908  5  2 s2
## 18 P54112  6  2 s2
## 19 O15143  7  2 s2

df2.norm;
##        id ri ci   cn
## 1  P41182  1  1 IDs.
## 2  P56524  1  1 IDs.
## 3  P41182  2  1 IDs.
## 4  Q9UQL6  2  1 IDs.
## 5  P41182  3  1 IDs.
## 6  Q8WUI4  3  1 IDs.
## 7  Q92793  4  1 IDs.
## 8  Q09472  4  1 IDs.
## 9  Q9Y6Q9  4  1 IDs.
## 10 Q92831  4  1 IDs.
## 11 P30561  5  1 IDs.
## 12 P53762  5  1 IDs.
## 13 Q15021  6  1 IDs.
## 14 Q9BPX3  6  1 IDs.
## 15 Q15003  6  1 IDs.
## 16 O95347  6  1 IDs.
## 17 Q9NTJ3  6  1 IDs.
## 18 Q92902  7  1 IDs.
## 19 Q9NQG7  7  1 IDs.
## 20 Q969F9  8  1 IDs.
## 21 Q9UPZ3  8  1 IDs.
## 22 Q86YV9  8  1 IDs.
## 23 Q92903  9  1 IDs.
## 24 Q96NY9  9  1 IDs.
## 25 Q91VB4 10  1 IDs.
## 26 P59438 10  1 IDs.
## 27 Q8BLY7 10  1 IDs.
## 28 Q92828 11  1 IDs.
## 29 Q13227 11  1 IDs.
## 30 O15379 11  1 IDs.
## 31 O75376 11  1 IDs.
## 32 O60907 11  1 IDs.
## 33 Q9BZK7 11  1 IDs.
## 34 P78537 12  1 IDs.
## 35 Q6QNY1 12  1 IDs.
## 36 Q6QNY0 12  1 IDs.
## 37 Q9NUP1 12  1 IDs.
## 38 Q96EV8 12  1 IDs.
## 39 Q8TDH9 12  1 IDs.
## 40 Q9UL45 12  1 IDs.
## 41 O95295 12  1 IDs.
## 42 O55102 13  1 IDs.
## 43 Q9CWG9 13  1 IDs.
## 44 Q5U5M8 13  1 IDs.
## 45 Q8VED2 13  1 IDs.
## 46 Q91WZ8 13  1 IDs.
## 47 Q8R015 13  1 IDs.
## 48 Q9R0C0 13  1 IDs.
## 49 Q9Z266 13  1 IDs.
## 50 P30561 14  1 IDs.
## 51 O08915 14  1 IDs.
## 52 P07901 14  1 IDs.
## 53 P11499 14  1 IDs.
## 54 Q8WMR7 15  1 IDs.
## 55 P67776 15  1 IDs.
## 56 P11493 15  1 IDs.
## 57 P54612 15  1 IDs.
## 58 P54613 15  1 IDs.
## 59 P61160 16  1 IDs.
## 60 P61158 16  1 IDs.
## 61 O15143 16  1 IDs.
## 62 O15144 16  1 IDs.
## 63 O15145 16  1 IDs.
## 64 P59998 16  1 IDs.
## 65 O15511 16  1 IDs.

df.match;
##       id ri.1 ci.1 cn.1 ri.2 ci.2 cn.2
## 2 P41182    4    1   s1    1    1 IDs.
## 4 P41182    4    1   s1    2    1 IDs.
## 3 P41182    4    1   s1    3    1 IDs.
## 8 Q9Y6Q9    3    1   s1    4    1 IDs.
## 6 Q09472    5    1   s1    4    1 IDs.
## 7 Q92831    2    2   s2    4    1 IDs.
## 5 P54612    3    2   s2   15    1 IDs.
## 1 O15143    7    2   s2   16    1 IDs.

res;
##   IDs No    ID-col-n string-df1-s1 string-df1-s2
## 1   1  1         4s1        P41182             -
## 2   2  1         4s1        P41182             -
## 3   3  1         4s1        P41182             -
## 4   4  3 3s1,5s1,2s2 Q9Y6Q9,Q09472        Q92831
## 5  15  1         3s2             -        P54612
## 6  16  1         7s2             -        O15143