Regex 在字符串中查找模式并在R中连接唯一的部分

Regex 在字符串中查找模式并在R中连接唯一的部分,regex,r,string,Regex,R,String,具有两个字符串的简明程序s1和s2。如何才能仅连接这些字符串中唯一的部分,而保留非唯一部分。让/作为连接部件之间的分隔符。预期结果如下所示: s1 <- "very big house" s2 <- "very small house" some_function(s1,s2) "very big/small house" #// desired result. s1这里有一个解决方案: pasteMergePrefixAndSuffix <- function(vs1,v

有两个字符串 `s1` 和 `s2`。如何才能只把这两个字符串中各自独有的部分连接起来,而让共同(非唯一)的部分只保留一份?用 `/` 作为被连接部分之间的分隔符。预期结果如下所示:

## Example inputs: both strings start with "very " and end with " house".
s1 <- "very big house"
s2 <- "very small house"

## some_function is a placeholder name for the function being asked for;
## it is not defined in this snippet.
some_function(s1,s2)
"very big/small house" ## desired result: only the differing parts are joined with "/"
这里有一个解决方案:

## Merge pairs of strings so that their shared prefix and suffix appear once.
##
## For each pair (vs1[i], vs2[i]) the longest common leading run of characters
## and the longest common trailing run are kept once, and the two differing
## middles are joined with `sep`; e.g. "very big house" and "very small house"
## with sep = "/" give "very big/small house".
##
## Arguments:
##   vs1, vs2  character vectors; the shorter one is recycled to the length
##             of the longer one.
##   sep       single string placed between the two differing middles.
##
## Returns a character vector with one merged string per pair. Identical pairs
## are returned unchanged (no separator), a pair containing NA yields
## NA_character_, and a zero-length input yields character(0).
pasteMergePrefixAndSuffix <- function(vs1, vs2, sep = ' ') {
    ## zero-length input gives zero-length output; rep() would otherwise
    ## recycle an empty vector into NA
    if (length(vs1) == 0L || length(vs2) == 0L) return(character(0L))
    ## cycle string vectors to same length
    vsl <- max(length(vs1), length(vs2))
    vs1 <- rep(vs1, length.out = vsl)
    vs2 <- rep(vs2, length.out = vsl)
    ## precompute character splits
    ss1 <- strsplit(vs1, '')
    ss2 <- strsplit(vs2, '')
    ## number of leading positions, among the first n, where a and b agree;
    ## n == 0 gives 0 because match() then falls through to nomatch
    leadingMatches <- function(a, b, n) {
        eq <- a[seq_len(n)] == b[seq_len(n)]
        match(FALSE, eq, nomatch = n + 1L) - 1L
    }
    ## iterate over each pair of strings; vapply keeps the result type stable
    vapply(seq_len(vsl), function(si) {
        s1 <- vs1[[si]]
        s2 <- vs2[[si]]
        ## a missing string cannot be compared or merged
        if (is.na(s1) || is.na(s2)) return(NA_character_)
        ## handle trivial case of exact equality
        if (s1 == s2) return(s1)
        c1 <- ss1[[si]]
        c2 <- ss2[[si]]
        s1l <- length(c1)
        s2l <- length(c2)
        slmin <- min(s1l, s2l)
        ## prefix length
        pl <- leadingMatches(c1, c2, slmin)
        ## suffix length: only look at the characters of the shorter string
        ## that the prefix has not already claimed, so prefix and suffix can
        ## never overlap (e.g. 'aa' vs 'aaa' must give 'aa/a', not 'aa/aa')
        sl <- leadingMatches(rev(c1), rev(c2), slmin - pl)
        ## paste together prefix, sep-pasted middles, and suffix
        m1 <- substr(s1, pl + 1L, s1l - sl)
        m2 <- substr(s2, pl + 1L, s2l - sl)
        paste0(
            substr(s1, 1L, pl),
            paste(m1, m2, sep = sep),
            substr(s1, s1l - sl + 1L, s1l)
        )
    }, character(1L))
} ## end pasteMergePrefixAndSuffix()

可以尝试使用 `paste(s1, s2, sep="/")`。
如果存在重复项,请使用 `strsplit` 拆分为多个部分,把两者连接起来,取 `unique`,然后再 `paste`。从各个示例中看不出你是把"单词(word)"还是把"字符(character)"当作唯一性的单位。字符串还有其他规则吗?它们在数字和字母方面都有相同的格式吗?也就是说,如果字符串 a 看起来像
“\uuuuuu”
,那么另一个字符串会遵循相同的格式吗?@akrun 唯一的部分指的是字符序列。@konkked 字符串的结构事先并不知道:这正是我面临的主要困难。有没有办法确定两个字符串中有多少个前导字符和多少个结尾字符是相匹配的?
## Merge pairs of strings so that their shared prefix and suffix appear once.
##
## For each pair (vs1[i], vs2[i]) the longest common leading run of characters
## and the longest common trailing run are kept once, and the two differing
## middles are joined with `sep`; e.g. "very big house" and "very small house"
## with sep = "/" give "very big/small house".
##
## Arguments:
##   vs1, vs2  character vectors; the shorter one is recycled to the length
##             of the longer one.
##   sep       single string placed between the two differing middles.
##
## Returns a character vector with one merged string per pair. Identical pairs
## are returned unchanged (no separator), a pair containing NA yields
## NA_character_, and a zero-length input yields character(0).
pasteMergePrefixAndSuffix <- function(vs1, vs2, sep = ' ') {
    ## zero-length input gives zero-length output; rep() would otherwise
    ## recycle an empty vector into NA
    if (length(vs1) == 0L || length(vs2) == 0L) return(character(0L))
    ## cycle string vectors to same length
    vsl <- max(length(vs1), length(vs2))
    vs1 <- rep(vs1, length.out = vsl)
    vs2 <- rep(vs2, length.out = vsl)
    ## precompute character splits
    ss1 <- strsplit(vs1, '')
    ss2 <- strsplit(vs2, '')
    ## number of leading positions, among the first n, where a and b agree;
    ## n == 0 gives 0 because match() then falls through to nomatch
    leadingMatches <- function(a, b, n) {
        eq <- a[seq_len(n)] == b[seq_len(n)]
        match(FALSE, eq, nomatch = n + 1L) - 1L
    }
    ## iterate over each pair of strings; vapply keeps the result type stable
    vapply(seq_len(vsl), function(si) {
        s1 <- vs1[[si]]
        s2 <- vs2[[si]]
        ## a missing string cannot be compared or merged
        if (is.na(s1) || is.na(s2)) return(NA_character_)
        ## handle trivial case of exact equality
        if (s1 == s2) return(s1)
        c1 <- ss1[[si]]
        c2 <- ss2[[si]]
        s1l <- length(c1)
        s2l <- length(c2)
        slmin <- min(s1l, s2l)
        ## prefix length
        pl <- leadingMatches(c1, c2, slmin)
        ## suffix length: only look at the characters of the shorter string
        ## that the prefix has not already claimed, so prefix and suffix can
        ## never overlap (e.g. 'aa' vs 'aaa' must give 'aa/a', not 'aa/aa')
        sl <- leadingMatches(rev(c1), rev(c2), slmin - pl)
        ## paste together prefix, sep-pasted middles, and suffix
        m1 <- substr(s1, pl + 1L, s1l - sl)
        m2 <- substr(s2, pl + 1L, s2l - sl)
        paste0(
            substr(s1, 1L, pl),
            paste(m1, m2, sep = sep),
            substr(s1, s1l - sl + 1L, s1l)
        )
    }, character(1L))
} ## end pasteMergePrefixAndSuffix()
## Usage examples: the value printed by each call is shown in the comment
## directly below it.

## default separator is a single space
pasteMergePrefixAndSuffix('a', 'b')
## [1] "a b"

## explicit separator
pasteMergePrefixAndSuffix('a', 'b', '/')
## [1] "a/b"

## shared prefix and shared suffix
s1 <- 'very big house'
s2 <- 'very small house'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "very big/small house"

## shared suffix only
s1 <- '1b'
s2 <- '2b'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "1/2b"

s1 <- 'a_1_b'
s2 <- 'a_2_b'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "a_1/2_b"

## identical strings are returned as-is
s1 <- 'ab'
s2 <- 'ab'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "ab"

## one string is a suffix or a prefix of the other
s1 <- 'xab'
s2 <- 'ab'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "x/ab"

s1 <- 'ab'
s2 <- 'abx'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "ab/x"

s1 <- 'abx'
s2 <- 'ab'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "abx/"

s1 <- 'ab'
s2 <- 'xab'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "/xab"

## empty strings
s1 <- ''
s2 <- 'x'
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "/x"

s1 <- 'x'
s2 <- ''
pasteMergePrefixAndSuffix(s1, s2, '/')
## [1] "x/"