Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/67.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/regex/20.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
扩展gsub和grepl以忽略给定分隔符之间的子字符串_R_Regex_Gsub_Grepl - Fatal编程技术网

扩展gsub和grepl以忽略给定分隔符之间的子字符串

扩展gsub和grepl以忽略给定分隔符之间的子字符串,r,regex,gsub,grepl,R,Regex,Gsub,Grepl,我希望能够仅在给定的分隔符集之外使用grepl()和gsub(),例如,我希望能够忽略引号之间的文本 这是我想要的输出: grepl2("banana", "'banana' banana \"banana\"", escaped =c('""', "''")) #> [1] TRUE grepl2("banana", "'banana' apple \"banana\"", escaped =c('""', "''")) #> [1] FALSE grepl2("banana",

我希望能够仅在给定的分隔符集之外使用
grepl()
gsub()
,例如,我希望能够忽略引号之间的文本

这是我想要的输出:

grepl2("banana", "'banana' banana \"banana\"", escaped =c('""', "''"))
#> [1] TRUE
grepl2("banana", "'banana' apple \"banana\"", escaped =c('""', "''"))
#> [1] FALSE
grepl2("banana", "{banana} banana {banana}", escaped = "{}")
#> [1] TRUE
grepl2("banana", "{banana} apple {banana}", escaped = "{}")
#> [1] FALSE

gsub2("banana", "potatoe", "'banana' banana \"banana\"")
#> [1] "'banana' potatoe \"banana\""
gsub2("banana", "potatoe", "'banana' apple \"banana\"")
#> [1] "'banana' apple \"banana\""
gsub2("banana", "potatoe", "{banana} banana {banana}", escaped = "{}")
#> [1] "{banana} potatoe {banana}"
gsub2("banana", "potatoe", "{banana} apple {banana}", escaped = "{}")
#> [1] "{banana} apple {banana}"
实际案例可能引用了不同数量和顺序的子字符串

我已经编写了以下适用于这些情况的函数,但是它们很笨重,
gsub2()
一点也不健壮,因为它临时用占位符替换分隔的内容,并且这些占位符可能会受到后续操作的影响

# Backslash-escape regular-expression metacharacters in `string`.
#
# Arguments:
#   string  Character vector whose metacharacters should be escaped.
#   n       Number of escaping passes. Each pass re-escapes the output of the
#           previous one; `n = 0` returns `string` unchanged.
#
# Returns `string` after `n` escaping passes.
regex_escape <- function(string, n = 1) {
  escape_once <- function(text) {
    gsub("([][{}().+*^$|\\?])", "\\\\\\1", text)
  }
  # Fold the single-pass escaper over 1..n, starting from the input.
  Reduce(function(acc, pass) escape_once(acc), seq_len(n), string)
}

# grepl() variant that ignores text enclosed by delimiter pairs.
#
# Every delimited span (shortest match between an opening and a closing
# delimiter) is removed from `x` first, one delimiter pair at a time, and
# `pattern` is then searched in what is left.
#
# Arguments:
#   pattern, x, ignore.case, perl, fixed, useBytes
#            Forwarded to grepl().
#   escaped  Character vector; each element holds the opening delimiter as
#            its first character and the closing one as its second, e.g.
#            "{}". An empty vector makes this behave like grepl().
#
# Returns a logical vector, as grepl() does.
grepl2 <- function(pattern, x, ignore.case = FALSE, perl = FALSE,
                   fixed = FALSE, useBytes = FALSE,
                   escaped = c('""', "''")) {
  delims <- strsplit(escaped, "")
  if (any(lengths(delims) < 2L)) {
    stop("each element of `escaped` must contain an opening and a closing ",
         "delimiter, e.g. \"{}\"", call. = FALSE)
  }
  # TODO check that "escaped" delimiters are balanced and don't cross each other
  # Iterating over the list (not 1:length()) keeps an empty `escaped` safe.
  for (pair in delims) {
    delimited <- sprintf("%s.*?%s",
                         regex_escape(pair[[1L]]),
                         regex_escape(pair[[2L]]))
    x <- gsub(delimited, "", x)
  }
  grepl(pattern, x, ignore.case = ignore.case, perl = perl, fixed = fixed,
        useBytes = useBytes)
}

# gsub() variant that leaves text enclosed by delimiter pairs untouched.
#
# Each element of `x` is cut into alternating unprotected / protected pieces
# (a protected piece is the shortest span between an opening delimiter and
# its closing delimiter). gsub() is applied to the unprotected pieces only
# and the pieces are glued back together. No placeholder text is ever
# inserted, so the user's pattern cannot corrupt the protected spans and the
# protected text is restored verbatim (backslashes included).
#
# Arguments:
#   pattern, replacement, ignore.case, perl, fixed, useBytes
#            Forwarded to gsub() for every unprotected piece.
#   x        Character vector to process (all elements are handled).
#   escaped  Character vector; each element holds the opening delimiter as
#            its first character and the closing one as its second, e.g.
#            "{}". An empty vector makes this behave like gsub().
#
# Returns a character vector the same length as `x`.
#
# Note: because gsub() runs on each unprotected piece separately, anchors
# such as ^ and $ apply per piece rather than to the whole string.
gsub2 <- function(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
                  fixed = FALSE, useBytes = FALSE, escaped = c('""', "''")) {
  substitute_in <- function(text) {
    gsub(pattern, replacement, text, ignore.case = ignore.case, perl = perl,
         fixed = fixed, useBytes = useBytes)
  }

  delims <- strsplit(escaped, "")
  if (any(lengths(delims) < 2L)) {
    stop("each element of `escaped` must contain an opening and a closing ",
         "delimiter, e.g. \"{}\"", call. = FALSE)
  }
  # TODO check that "escaped" delimiters are balanced and don't cross each other
  if (length(delims) == 0L) {
    return(substitute_in(x))
  }

  opening <- vapply(delims, function(d) regex_escape(d[[1L]]), character(1))
  closing <- vapply(delims, function(d) regex_escape(d[[2L]]), character(1))
  protected <- paste0(opening, ".*?", closing, collapse = "|")

  # invert = NA returns unmatched and matched substrings interleaved, always
  # starting and ending with an unmatched one, so odd positions are the
  # unprotected pieces.
  pieces <- regmatches(x, gregexpr(protected, x), invert = NA)
  vapply(pieces, function(p) {
    if (anyNA(p)) {
      return(NA_character_)
    }
    free <- seq(from = 1L, to = length(p), by = 2L)
    p[free] <- substitute_in(p[free])
    paste0(p, collapse = "")
  }, character(1))
}

这里有一个简单的正则表达式解决方案,在字符类(character class)中使用否定运算符。它只能满足你的简单情况;我无法让它满足成对的多分隔符需求:

# Test whether `patt` occurs in `arg` without an escape character directly
# before or after it.
#
# Arguments:
#   patt    Regular expression to look for.
#   escape  Characters that must not be adjacent to a match. Several
#           characters may be given in one string or as a vector; they are
#           placed inside a negated character class as-is, so class-special
#           characters (`]`, `^`, `\`) are not supported.
#   arg     Character vector to search.
#
# Returns a logical vector, as grepl() does.
#
# Limitation: only the characters immediately around a match are checked, so
# a match deeper inside a delimited span (e.g. "'a banana b'") still counts.
grepl2 <- function(patt, escape = "'", arg = NULL) {
  excluded <- paste0(escape, collapse = "")
  guarded <- if (nzchar(excluded)) {
    # Allow the match to touch the start or end of the string; requiring a
    # non-escape character on both sides would miss those matches.
    paste0("(^|[^", excluded, "])(", patt, ")([^", excluded, "]|$)")
  } else {
    patt
  }
  grepl(pattern = guarded, arg)
}

grepl2("banana", "'banana' apple \"banana\"", escape =c( "'"))
#[1] TRUE

grepl2("banana", "'banana' apple ", escape =c( "'"))
#[1] FALSE

我的意见是,您可能需要把开始分隔符和结束分隔符分开指定,才能使代码正常工作。
这里我使用了正则表达式的环视(lookaround)特性。这在R之外可能不是普遍适用的(尤其是后行断言 `(?<!...)`)。

# Test whether `pattern` occurs in `x` without an opening delimiter directly
# before it or a closing delimiter directly after it.
#
# Arguments:
#   pattern  Regular expression (PCRE syntax) to look for.
#   x        Character vector to search.
#   escapes  Two strings: the first lists the opening delimiter characters,
#            the second the closing ones. Both are placed inside a character
#            class as-is.
#
# Returns a logical vector, as grepl() does.
grepl2 <- function(pattern, x, escapes = c(open = "\"'{", close = "\"'}")) {
  not_after_open <- paste0("(?<![", escapes[[1L]], "])")
  not_before_close <- paste0("(?![", escapes[[2L]], "])")
  # Lookaround assertions need the PCRE engine; spell out TRUE because `T`
  # is an ordinary variable that can be reassigned.
  grepl(paste0(not_after_open, pattern, not_before_close), x, perl = TRUE)
}
grepl2("banana", "'banana' banana \"banana\"")
#> [1] TRUE
grepl2("banana", "'banana' apple \"banana\"")
#> [1] FALSE
grepl2("banana", "{banana} banana {banana}")
#> [1] TRUE
grepl2("banana", "{banana} apple {banana}")
#> [1] FALSE
grepl2=function(模式,x,转义=c(open=“\”{,close=“\”}”){

grepl(paste0)((?我尝试了
grepl2
,但还没有找到
gsub2
的破解方法(或明确的解决方案)。无论如何,这只是删除了所有字符(不包括新行)在所提供的
转义的
字符的最短对之间。它也应该具有相当好的伸缩性。如果使用此解决方案,您可能需要进行检查,以确保存在不带空格的
转义的
字符对(或者适合使用
substr()
。希望这有帮助

# grepl() variant that ignores text enclosed by delimiter pairs.
#
# All delimited spans (shortest match between an opening and a closing
# delimiter, for any of the pairs) are removed from `x` in a single pass and
# `pattern` is then searched in what is left.
#
# Arguments:
#   pattern, x, ignore.case, perl, fixed, useBytes
#            Forwarded to grepl().
#   escaped  Character vector; each element holds the opening delimiter as
#            its first character and the closing one as its second, e.g.
#            "{}". An empty vector makes this behave like grepl().
#
# Returns a logical vector, as grepl() does.
grepl3 <- function(pattern, x, ignore.case = FALSE, perl = FALSE,
                   fixed = FALSE, useBytes = FALSE,
                   escaped = c('""', "''")) {
  escape_meta <- function(chars) {
    gsub("([][{}().+*^$|\\?])", "\\\\\\1", chars)
  }

  if (length(escaped) > 0L) {
    opening <- escape_meta(substr(escaped, 1L, 1L))
    closing <- escape_meta(substr(escaped, 2L, 2L))
    protected <- paste0(opening, ".*?", closing, collapse = "|")
    # Strip from `x` (the function's own argument), not an undefined `arg`.
    x <- gsub(protected, "", x)
  }

  grepl(pattern, x, ignore.case = ignore.case, perl = perl, fixed = fixed,
        useBytes = useBytes)
}

grepl3(pattern = "banana", x = "'banana' apple \"banana\" {banana}", escaped =c("''", '""', "{}"))
[1] FALSE

您可以使用
`start_escape`/`end_escape`
参数来分别提供成对分隔符的左侧(LHS)和右侧(RHS),例如
{
}
,而无需在错误的位置进行匹配(
}
作为LHS分隔符)

perl=TRUE
允许环视断言。这些断言评估其中语句的有效性,而不在模式中捕获它们。非常好地涵盖了它们

perl=FALSE
中会出现错误,因为R的默认正则表达式引擎不支持它们

  # gsub() variant that skips matches directly preceded by an opening
  # delimiter or directly followed by a closing delimiter.
  #
  # Arguments:
  #   pattern       Regular expression (PCRE syntax); it is wrapped in a
  #                 capturing group, so "\\1" in `replacement` is the match.
  #   replacement   Replacement text, as for gsub().
  #   x             Character vector to process.
  #   escape        Characters that act as both opening and closing
  #                 delimiters (e.g. quotes).
  #   start_escape  Characters that act only as opening delimiters.
  #   end_escape    Characters that act only as closing delimiters.
  #
  # Delimiter characters are placed inside a character class as-is. When no
  # delimiter is supplied for a side, no restriction applies on that side.
  #
  # Returns a character vector, as gsub() does.
  gsub3 <- function(pattern, replacement, x, escape = NULL,
                    start_escape = NULL, end_escape = NULL) {
    left_chars <- paste0(c(escape, start_escape), collapse = "")
    right_chars <- paste0(c(escape, end_escape), collapse = "")

    # Always define both pieces; an empty class would be an invalid regex, so
    # fall back to "" when there is nothing to exclude on that side.
    left_escape <- if (nzchar(left_chars)) {
      paste0("(?<![", left_chars, "])")
    } else {
      ""
    }
    right_escape <- if (nzchar(right_chars)) {
      paste0("(?![", right_chars, "])")
    } else {
      ""
    }

    patt <- paste0(left_escape, "(", pattern, ")", right_escape)
    gsub(patt, replacement, x, perl = TRUE)
  }
    gsub3("banana", "potatoe", "'banana' banana \"banana\"", escape = "'\"")
    #> [1] "'banana' potatoe \"banana\""
    gsub3("banana", "potatoe", "'banana' apple \"banana\"", escape = '"\'')
    #> [1] "'banana' apple \"banana\""
    gsub3("banana", "potatoe", "{banana} banana {banana}", escape = "{}")
    #> [1] "{banana} potatoe {banana}"
    gsub3("banana", "potatoe", "{banana} apple {banana}", escape = "{}")
    #> [1] "{banana} apple {banana}"
编辑:

这应该可以解决gsub,而不存在Andrew提到的问题,只要您对一组成对运算符没有问题。我认为您可以修改它以允许使用多个分隔符。感谢这个有趣的问题,在
regmatches
中发现了一个新的gem

# gsub() variant that leaves text between `left_escape` and `right_escape`
# untouched, including nested delimiter pairs.
#
# Arguments:
#   pattern, replacement  Passed to gsub() for every unescaped piece.
#   x                     Character vector to process.
#   left_escape           Opening delimiter.
#   right_escape          Closing delimiter.
#
# Returns a character vector the same length as `x` (character(0) for
# zero-length input; NA elements stay NA).
#
# Note: gsub() runs on each unescaped piece separately, so anchors and
# back-references are evaluated per piece, not on the whole string.
gsub4 <- function(pattern,
                  replacement,
                  x,
                  left_escape = "{",
                  right_escape = "}") {
  escaped_span <- paste0(
    "\\Q", # \Q ... \E quotes the delimiter so it is matched literally.
    left_escape,
    "\\E(?>[^",
    left_escape,
    right_escape,
    "]|(?R))*", # (?R) recurses into the whole pattern: nested pairs.
    "\\Q",
    right_escape,
    "\\E",
    collapse = ""
  )

  # invert = NA returns unmatched and matched substrings interleaved, always
  # starting and ending with an unmatched one (padded with "" as needed), so
  # odd positions are the pieces to run the replacement on.
  string_pieces <- regmatches(
    x,
    gregexpr(escaped_span, x, perl = TRUE),
    invert = NA
  )

  # vapply() keeps the result a character vector even for empty input,
  # where sapply() would return list().
  vapply(string_pieces, function(pieces) {
    if (anyNA(pieces)) {
      return(NA_character_)
    }
    to_replace <- seq(from = 1L, to = length(pieces), by = 2L)
    pieces[to_replace] <- gsub(pattern, replacement, pieces[to_replace])
    paste0(pieces, collapse = "")
  }, character(1))
}
gsub4('banana', 'apples', "{banana's} potatoes {banana} banana", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes {banana} apples"
gsub4('banana', 'apples', "{banana's} potatoes {banana} banana", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes {banana} apples"
gsub4('banana', 'apples',  "banana's potatoes", left_escape = "{", right_escape = "}")
#> [1] "apples's potatoes"
gsub4('banana', 'apples', "{banana's} potatoes", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes"
gsub4[1]“{banana's}土豆”

您是否愿意使用
perl=TRUE
?是的,绝对地,在这种情况下,
gsub2(“香蕉”、“香蕉”、“{banana}banana{banana}”,转义=“{}”)
将导致
“{banana}potatoe{banana}"
。这就是否定字符集的问题。首先,我不清楚为什么这是错误的答案。而且……你确实有两个独立的请求。你应该将你的问题分为
grepl
版本和
gsub
版本。这会剥离内部香蕉周围的空间,因为你匹配的模式是回答“不是转义字符”。因此空格被匹配并被删除。如果你在字符集中包含空格,你将不会得到任何匹配。好吧。问两个问题。这是我的问题,不是他的:)。我认为这些问题有太多的共同点而无法分开。grepl更容易,但gsub解决方案可能也会解决grepl问题,而且grepl解决方案可以为gsub提供线索。因为我们使用的是正则表达式功能,只要将grepl改为gsub,它就会工作得很好。看起来很棒!我会在有时间检查它的时候马上打勾ver!只是提醒一下,对于
grepl3
,如果在开始/结束转义之间包含转义字符,则此解决方案可能会遇到问题。例如,如果文本中有人去了
香蕉的
(和
是转义字符之一。需要记住或更新。gsub解决方案的一个限制是,我们不能使用
“\\1”
作为替换。您可以使用它,但它会在每个未转义的字符串段上重置。您需要此选项的实际示例是什么?
# gsub() variant that leaves text between `left_escape` and `right_escape`
# untouched, including nested delimiter pairs.
#
# Arguments:
#   pattern, replacement  Passed to gsub() for every unescaped piece.
#   x                     Character vector to process.
#   left_escape           Opening delimiter.
#   right_escape          Closing delimiter.
#
# Returns a character vector the same length as `x` (character(0) for
# zero-length input; NA elements stay NA).
#
# Note: gsub() runs on each unescaped piece separately, so anchors and
# back-references are evaluated per piece, not on the whole string.
gsub4 <- function(pattern,
                  replacement,
                  x,
                  left_escape = "{",
                  right_escape = "}") {
  escaped_span <- paste0(
    "\\Q", # \Q ... \E quotes the delimiter so it is matched literally.
    left_escape,
    "\\E(?>[^",
    left_escape,
    right_escape,
    "]|(?R))*", # (?R) recurses into the whole pattern: nested pairs.
    "\\Q",
    right_escape,
    "\\E",
    collapse = ""
  )

  # invert = NA returns unmatched and matched substrings interleaved, always
  # starting and ending with an unmatched one (padded with "" as needed), so
  # odd positions are the pieces to run the replacement on.
  string_pieces <- regmatches(
    x,
    gregexpr(escaped_span, x, perl = TRUE),
    invert = NA
  )

  # vapply() keeps the result a character vector even for empty input,
  # where sapply() would return list().
  vapply(string_pieces, function(pieces) {
    if (anyNA(pieces)) {
      return(NA_character_)
    }
    to_replace <- seq(from = 1L, to = length(pieces), by = 2L)
    pieces[to_replace] <- gsub(pattern, replacement, pieces[to_replace])
    paste0(pieces, collapse = "")
  }, character(1))
}
gsub4('banana', 'apples', "{banana's} potatoes {banana} banana", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes {banana} apples"
gsub4('banana', 'apples', "{banana's} potatoes {banana} banana", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes {banana} apples"
gsub4('banana', 'apples',  "banana's potatoes", left_escape = "{", right_escape = "}")
#> [1] "apples's potatoes"
gsub4('banana', 'apples', "{banana's} potatoes", left_escape = "{", right_escape = "}")
#> [1] "{banana's} potatoes"