在R中使用正则表达式从字符串中获取数字_R_Regex

在R中使用正则表达式从字符串中获取数字

r regex

在R中使用正则表达式从字符串中获取数字,r,regex,R,Regex,所以正则表达式是我一直在努力学习的东西/从来没有花适当的时间学习。在本例中，我有一个R向量，其中包含以下格式的棒球数据： hit_vector = c("", "Batted ball speed 104 mph; distance of 381 feet; launch angle of 38 degrees.", "Ball was hit at 67 mp

所以正则表达式是我一直在努力学习的东西/从来没有花适当的时间学习。在本例中，我有一个R向量，其中包含以下格式的棒球数据：

# Sample batted-ball descriptions. Each measurement is wrapped in <b>...</b>
# and some entries are empty strings. Every element is written on a single
# line so that no line break ends up inside a string.
hit_vector <- c(
  "",
  "Batted ball speed <b>104 mph</b>; distance of <b>381 feet</b>; launch angle of <b>38 degrees</b>.",
  "Ball was hit at <b>67 mph</b>.",
  "",
  "Ball left the bat at <b>107 mph</b> and traveled a distance of <b>412 feet</b>.",
  "Batted ball speed <b>71 mph</b>.",
  "Ball left the bat at <b>94 mph</b> and traveled a distance of <b>287 feet</b>.",
  "",
  "",
  "Batted ball speed <b>64 mph</b>."
)

> hit_vector
 [1] ""                                                                                                       
 [2] "Batted ball speed <b>104 mph</b>; distance of <b>381 feet</b>; launch angle of <b>38 degrees</b>."
 [3] "Ball was hit at <b>67 mph</b>."                                                                         
 [4] ""                                                                                                       
 [5] "Ball left the bat at <b>107 mph</b> and traveled a distance of <b>412 feet</b>."                        
 [6] "Batted ball speed <b>71 mph</b>."                                                                       
 [7] "Ball left the bat at <b>94 mph</b> and traveled a distance of <b>287 feet</b>."                         
 [8] ""                                                                                                       
 [9] ""                                                                                                       
[10] "Batted ball speed <b>64 mph</b>."

整个 hit_vector 要长得多，但其中的元素似乎都遵循这种格式约定

编辑：下面的内容似乎有助于识别某些信息，但这些行的工作并不完美（第三行返回all FALSE，这是不正确的）：

编辑2：我不确定每个统计数据会有多少位数。例如，英里/小时可以超过100（3位），也可以小于10（1位）。

stringr 软件包中的 str_extract 函数在这里应该很有用：

# Build one column per measurement. Each regex grabs a run of digits that is
# immediately followed (lookahead, not part of the match) by whitespace and
# the unit word; str_extract() yields NA where a description has no such unit.
data.frame(
  lapply(
    list(
      speed = "(\\d+)(?=\\s+mph)",
      distance = "(\\d+)(?=\\s+feet)",
      degrees = "(\\d+)(?=\\s+degrees)"
    ),
    function(unit_regex) str_extract(hit_vector, unit_regex)
  )
)

#    speed distance degrees
# 1   <NA>     <NA>    <NA>
# 2    104      381      38
# 3     67     <NA>    <NA>
# 4   <NA>     <NA>    <NA>
# 5    107      412    <NA>
# 6     71     <NA>    <NA>
# 7     94      287    <NA>
# 8   <NA>     <NA>    <NA>
# 9   <NA>     <NA>    <NA>
# 10    64     <NA>    <NA>

data.frame(
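    speed=str_extract(hit_vector, "(\\d+)(?=\\s+mph)"),
    distance=str_extract(hit_vector, "(\\d+)(?=\\s+feet)"),
    degrees=str_extract(hit_vector, "(\\d+)(?=\\s+degrees)")
)
#    speed distance degrees
# 1   <NA>     <NA>    <NA>
# 2    104      381      38
# 3     67     <NA>    <NA>
# 4   <NA>     <NA>    <NA>
# 5    107      412    <NA>
# 6     71     <NA>    <NA>
# 7     94      287    <NA>
# 8   <NA>     <NA>    <NA>
# 9   <NA>     <NA>    <NA>
# 10    64     <NA>    <NA>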

\\d 是数字的字符类，因此 \\d+ 匹配一个或多个数字。(?=...) 是零宽度的先行断言（lookahead），因此在本例中，它匹配后面跟着一个或多个空白字符（\\s+）以及 mph、feet 或 degrees 的数字，而不会把这些字符串包含在匹配结果中。
使用 base R：
# Base R: collapse every run of non-digit characters into one space, then let
# read.table() parse what is left into columns (assigned purely by position).
# fill = TRUE pads rows that contain fewer numbers, and
# blank.lines.skip = FALSE keeps rows with no numbers instead of dropping
# them. TRUE/FALSE are spelled out because T and F can be reassigned.
read.table(
  text = gsub("\\D+", " ", hit_vector),
  fill = TRUE,
  blank.lines.skip = FALSE
)

    V1  V2 V3
1   NA  NA NA
2  104 381 38
3   67  NA NA
4   NA  NA NA
5  107 412 NA
6   71  NA NA
7   94 287 NA
8   NA  NA NA
9   NA  NA NA
10  64  NA NA

在这里，只需把所有非数字的内容（即 \\D+）替换成空格，然后用 read.table 读取数据，读取时填充缺失的列（fill），并且不跳过空行（blank.lines.skip）。
考虑到您在下面的评论，我们需要重新整理数据：
# Unit-aware extraction in base R: every number is kept together with its unit
# so it can be routed to the right column even when earlier measurements are
# missing. The extra element appended here contains only a distance.
hit_vector1=c(hit_vector,"traveled a distance of <b>412 feet</b>.")

# Replace each lazily matched "...<digits>...<unit>" run with " <digits> <unit>".
# The alternation lists "degree" (no "s"), so for degrees the trailing "s" is
# simply left in place as unmatched text and the result still reads "degrees".
a=gsub(".*?(\\d+).*?(mph|feet|degree).*?"," \\1 \\2",hit_vector1)

# Drop the first remaining "</b>" together with the one character after it.
b=sub("<[/]b>.","",a)

# For a unit name x, append " NA <x>" to every element of b that does not
# mention x. The function updates the global b through <<-, so the three calls
# below accumulate their placeholders in the same vector.
# NOTE(review): if grep() finds no match at all, -integer(0) selects nothing
# and no placeholder is added for that unit -- confirm this cannot happen.
fun=function(x){y=-grep(x,b);b<<-replace(b,y,paste(b[y],NA,x))}
invisible(sapply(c("mph","feet","degrees"),fun))

# Turn the whitespace that follows a lowercase letter (i.e. the end of a unit
# word) into a newline, giving one "value unit" pair per line; read.table()
# parses those pairs and unstack() spreads the values into one column per unit.
e=gsub("([a-z])\\s","\\1\n",b)
unstack(read.table(text=e))
      degrees feet mph
1       NA   NA  NA
2       38  381 104
3       NA   NA  67
4       NA   NA  NA
5       NA  412 107
6       NA   NA  71
7       NA  287  94
8       NA   NA  NA
9       NA   NA  NA
10      NA   NA  64
11      NA  412  NA

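hit_vector1=c(hit_vector,"traveled a distance of <b>412 feet</b>.")
#将这些数字与其各自的测量单位一起提取。
a=gsub(".*?(\\d+).*?(mph|feet|degree).*?"," \\1 \\2",hit_vector1)
#移除 </b>
b=sub("<[/]b>.","",a)
##任何不包含某个测量值的元素都会补上 NA
fun=function(x){y=-grep(x,b);b<<-replace(b,y,paste(b[y],NA,x))}
invisible(sapply(c("mph","feet","degrees"),fun))
##在每个测量值之后换行，并按表格格式读取
e=gsub("([a-z])\\s","\\1\n",b)
unstack(read.table(text=e))

如果你不介意使用 tidyverse：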
library(tidyverse)

# Split each description at the closing </b> tags, one piece per measurement,
# then let parse_number() pull the number out of every piece.
# NOTE: columns are filled by position, so a description that lacks a leading
# measurement (e.g. only a distance) has its values shifted to the left.
tibble(x = hit_vector) %>%
  separate(
    x,
    into = c("speed", "distance", "degrees"),
    sep = "</b>",
    extra = "drop",  # pieces beyond the third are discarded without a warning
    fill = "right"   # missing measurements become NA without a warning
  ) %>%
  # across() replaces the superseded mutate_all().
  mutate(across(everything(), parse_number))

# # A tibble: 10 x 3
#    speed distance degrees
#    <dbl>    <dbl>   <dbl>
#  1    NA       NA      NA
#  2   104      381      38
#  3    67       NA      NA
#  4    NA       NA      NA
#  5   107      412      NA
#  6    71       NA      NA
#  7    94      287      NA
#  8    NA       NA      NA
#  9    NA       NA      NA
# 10    64       NA      NA

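library(tidyverse)
tibble(x=hit_vector) %>%
  separate(x,c("speed","distance","degrees"),"</b>") %>%
  mutate_all(parse_number)
# # A tibble: 10 x 3
#    speed distance degrees
#    <dbl>    <dbl>   <dbl>
#  1    NA       NA      NA
#  2   104      381      38
#  3    67       NA      NA
#  4    NA       NA      NA
#  5   107      412      NA
#  6    71       NA      NA
#  7    94      287      NA
#  8    NA       NA      NA
#  9    NA       NA      NA
# 10    64       NA      NA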
在 base R 中还有另一种方法（使用 regmatches 和 regexpr）：
我对你的评价太高了。谢谢你的资源，真的很感激！
你需要小心，第二行有 104、381 和 38。
@Onyambu 谢谢，这是个好消息，我已经更新了答案。
@Jan 谢谢，你第二次告诉我：）
为了简洁起见接受了这个答案，但这个答案和下面只使用 base R 的答案都很好。谢谢。
这很好，但是如果 hit_vector 中有一行只有距离（英尺）数据，比如 "traveled a distance of 412 feet"（行驶了 412 英尺的距离），这种方法会将 412 放在第一列（英里/小时），而不是第二列。
parse_number 会发出警告，删除所有非数字文本，并返回准确的数字。绝妙提示！
# Unit-aware extraction in base R: every number is kept together with its unit
# so it can be routed to the right column even when earlier measurements are
# missing. The extra element appended here contains only a distance.
hit_vector1=c(hit_vector,"traveled a distance of <b>412 feet</b>.")

# Replace each lazily matched "...<digits>...<unit>" run with " <digits> <unit>".
# The alternation lists "degree" (no "s"), so for degrees the trailing "s" is
# simply left in place as unmatched text and the result still reads "degrees".
a=gsub(".*?(\\d+).*?(mph|feet|degree).*?"," \\1 \\2",hit_vector1)

# Drop the first remaining "</b>" together with the one character after it.
b=sub("<[/]b>.","",a)

# For a unit name x, append " NA <x>" to every element of b that does not
# mention x. The function updates the global b through <<-, so the three calls
# below accumulate their placeholders in the same vector.
# NOTE(review): if grep() finds no match at all, -integer(0) selects nothing
# and no placeholder is added for that unit -- confirm this cannot happen.
fun=function(x){y=-grep(x,b);b<<-replace(b,y,paste(b[y],NA,x))}
invisible(sapply(c("mph","feet","degrees"),fun))

# Turn the whitespace that follows a lowercase letter (i.e. the end of a unit
# word) into a newline, giving one "value unit" pair per line; read.table()
# parses those pairs and unstack() spreads the values into one column per unit.
e=gsub("([a-z])\\s","\\1\n",b)
unstack(read.table(text=e))
      degrees feet mph
1       NA   NA  NA
2       38  381 104
3       NA   NA  67
4       NA   NA  NA
5       NA  412 107
6       NA   NA  71
7       NA  287  94
8       NA   NA  NA
9       NA   NA  NA
10      NA   NA  64
11      NA  412  NA

library(tidyverse)

# Split each description at the closing </b> tags, one piece per measurement,
# then let parse_number() pull the number out of every piece.
# NOTE: columns are filled by position, so a description that lacks a leading
# measurement (e.g. only a distance) has its values shifted to the left.
tibble(x = hit_vector) %>%
  separate(
    x,
    into = c("speed", "distance", "degrees"),
    sep = "</b>",
    extra = "drop",  # pieces beyond the third are discarded without a warning
    fill = "right"   # missing measurements become NA without a warning
  ) %>%
  # across() replaces the superseded mutate_all().
  mutate(across(everything(), parse_number))

# # A tibble: 10 x 3
#    speed distance degrees
#    <dbl>    <dbl>   <dbl>
#  1    NA       NA      NA
#  2   104      381      38
#  3    67       NA      NA
#  4    NA       NA      NA
#  5   107      412      NA
#  6    71       NA      NA
#  7    94      287      NA
#  8    NA       NA      NA
#  9    NA       NA      NA
# 10    64       NA      NA

# One regex per measurement: a run of digits that is directly followed
# (lookahead) by optional whitespace and the unit word.
patterns <- c(
  "(\\d+)(?=\\s*mph)",
  "(\\d+)(?=\\s*feet)",
  "(\\d+)(?=\\s*degrees)"
)

# For every pattern, extract the first match from each element of hit_vector.
# regexpr() is vectorised over its input, so no per-element loop is needed;
# positions without a match stay NA_real_, which keeps each column numeric
# even when nothing matches at all.
results <- lapply(patterns, function(pattern) {
  hits <- regexpr(pattern, hit_vector, perl = TRUE)
  found <- !is.na(hits) & hits > -1L
  values <- rep(NA_real_, length(hit_vector))
  # regmatches() returns only the matched substrings, in input order.
  values[found] <- as.numeric(regmatches(hit_vector, hits))
  values
})

# Build the data frame column-wise from the list of per-pattern vectors.
df <- as.data.frame(do.call(cbind, results))
colnames(df) <- c("speed", "distance", "degrees")

# Row-wise variant: for every description, try each pattern in turn and
# collect the three values (NA where the unit is absent) as one numeric vector.
result <- lapply(hit_vector, function(string) {
  vapply(patterns, function(pattern) {
    hit <- regmatches(string, regexpr(pattern, string, perl = TRUE))
    if (length(hit) == 0L) NA_real_ else as.numeric(hit)
  }, numeric(1), USE.NAMES = FALSE)
})

# Stack the per-description vectors as rows. The list is `result` (the
# original referenced an undefined `result2`), and there is exactly one
# column per pattern, so three names are assigned.
df <- as.data.frame(do.call(rbind, result))
colnames(df) <- c("speed", "distance", "degrees")

   speed distance degrees
1     NA       NA      NA
2    104      381      38
3     67       NA      NA
4     NA       NA      NA
5    107      412      NA
6     71       NA      NA
7     94      287      NA
8     NA       NA      NA
9     NA       NA      NA
10    64       NA      NA