使用正则表达式从HTML字符串中提取所需的属性值_Html_Regex_Extract_Word

使用正则表达式从HTML字符串中提取所需的属性值

html regex

使用正则表达式从HTML字符串中提取所需的属性值,html,regex,extract,word,Html,Regex,Extract,Word,我用一些元素（p、span、div等）从Distance API检索HTML字符串，其中一些元素具有数据时间、数据时区、数据电子邮件预览等属性。我需要属性数据电子邮件预览上的值，这些值是格式为此处输入代码的时间戳。这些值始终位于HTML字符串中的前两个span元素之间。HTML字符串示例： 2019-05-10T17:00:00Z→ 2019-05-10T20:00:00Z HackSpace_by_Sugar_Ray_Banister.jpg1596×771 993 KB 也许这能满足您的需

我用一些元素（

p、span、div

等）从Discourse API检索HTML字符串，其中一些元素具有

data-time、data-timezone、data-email-preview

等属性。我需要属性

data-email-preview

上的值，这些值是格式为

YYYY-MM-DDTHH:MM:SSZ

的时间戳。这些值始终位于HTML字符串中的前两个span元素之间。HTML字符串示例：

2019-05-10T17:00:00Z→ 2019-05-10T20:00:00Z

HackSpace_by_Sugar_Ray_Banister.jpg1596×771 993 KB

也许这能满足您的需要

（略作编辑以满足您的需要）

(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)（原表达式前后各有一个环视断言 (?<=…) 和 (?=…)，其内容在页面提取时丢失）

将返回您所需的元素

您可以使用GitHub上的PHP Simple HTML DOM库来实现这一点；我自己是通过这个SourceForge链接下载的

使用它如下

// Create DOM from URL or file.
// file_get_html() is provided by the PHP Simple HTML DOM library; it returns
// false when the page cannot be loaded, so check before calling find() on it.
$html = file_get_html('http://www.google.com/');

if ($html === false) {
    trigger_error('file_get_html() could not load the page', E_USER_WARNING);
} else {
    // Find all images and print each src attribute on its own line.
    foreach ($html->find('img') as $element) {
        echo $element->src . '<br>';
    }

    // Find all links and print each href attribute on its own line.
    foreach ($html->find('a') as $element) {
        echo $element->href . '<br>';
    }
}

如果您想使用preg_replace，这很容易，但会让人困惑，因为有很多值，所以输出将是许多日期，然后您必须对这些输出进行数组，这样才能在单行中查看每个日期，这样您就可以在VBA中导入数据库

' Extract2
' Reads an HTML file from disk and shows the first two dates it contains,
' obtained in two different ways:
'   1. via the DOM, from elements with the class "discourse-local-date";
'   2. via a regular expression, using simpleRegex.
' Each pair is displayed in a message box.
' Requires a reference to the Microsoft HTML Object Library (MSHTML).
Sub Extract2()

    Dim hDoc As MSHTML.HTMLDocument
    Dim dateBody As Object
    Dim sFile As String, lFile As Long
    Dim strHtml As String
    Dim pat1 As String
    Dim sHtml As String
    Dim Date1 As Variant, Date2 As Variant

    strHtml = "c:\1.html"

    'read in the file
    lFile = FreeFile
    ' NOTE(review): strDir is not declared in this procedure; presumably it is
    ' a module-level variable (it is empty otherwise) - confirm.
    sFile = strDir & strHtml
    Open sFile For Input As lFile
    sHtml = Input$(LOF(lFile), lFile)
    ' Release the file handle as soon as the content has been read.
    Close lFile

    'put into an htmldocument object
    Set hDoc = New MSHTML.HTMLDocument
    hDoc.body.innerHTML = sHtml

    Set dateBody = hDoc.getElementsByClassName("discourse-local-date")
    ' Only index the collection when both expected elements are present.
    If dateBody.Length >= 2 Then
        Date1 = dateBody(0).innerText
        Date2 = dateBody(1).innerText
        MsgBox Date1 & " " & Date2
    Else
        MsgBox "Fewer than two discourse-local-date elements found in " & sFile
    End If

    'regex
    pat1 = ".*span.*>(.+?)<"
    Date1 = simpleRegex(sHtml, pat1, 0)
    Date2 = simpleRegex(sHtml, pat1, 1)
    MsgBox Date1 & " " & Date2

End Sub

Sub Extract2()
    Dim hDoc As MSHTML.HTMLDocument
    Dim hElem As MSHTML.HTMLGenericElement
    Dim sFile As String, lFile As Long
    Dim pat1 As String
    Dim sHtml As String
    strHtml = "c:\1.html"
    'read in the file
    lFile = FreeFile
    sFile = strDir & strHtml
    Open sFile For Input As lFile
    sHtml = Input$(LOF(lFile), lFile)
    'put into an htmldocument object
    Set hDoc = New MSHTML.HTMLDocument
    hDoc.body.innerHTML = sHtml
    Set dateBody = hDoc.getElementsByClassName("discourse-local-date")
    Date1 = dateBody(0).innerText
    Date2 = dateBody(1).innerText
    MsgBox Date1 & " " & Date2
    'regex
    pat1 = ".*span.*>(.+?)<"

与此相关的两个问题是：它还会返回来自data-email-preview的日期，并且不包含Z。

添加Z字符只需几秒钟。

您可以详细说明应该忽略哪些内容以及捕获哪些内容吗？

您能分享您迄今为止尝试过的代码吗？

se是一个类似jsoup的库。
' Extract2
' Reads an HTML file from disk and shows the first two dates it contains,
' obtained in two different ways:
'   1. via the DOM, from elements with the class "discourse-local-date";
'   2. via a regular expression, using simpleRegex.
' Each pair is displayed in a message box.
' Requires a reference to the Microsoft HTML Object Library (MSHTML).
Sub Extract2()

    Dim hDoc As MSHTML.HTMLDocument
    Dim dateBody As Object
    Dim sFile As String, lFile As Long
    Dim strHtml As String
    Dim pat1 As String
    Dim sHtml As String
    Dim Date1 As Variant, Date2 As Variant

    strHtml = "c:\1.html"

    'read in the file
    lFile = FreeFile
    ' NOTE(review): strDir is not declared in this procedure; presumably it is
    ' a module-level variable (it is empty otherwise) - confirm.
    sFile = strDir & strHtml
    Open sFile For Input As lFile
    sHtml = Input$(LOF(lFile), lFile)
    ' Release the file handle as soon as the content has been read.
    Close lFile

    'put into an htmldocument object
    Set hDoc = New MSHTML.HTMLDocument
    hDoc.body.innerHTML = sHtml

    Set dateBody = hDoc.getElementsByClassName("discourse-local-date")
    ' Only index the collection when both expected elements are present.
    If dateBody.Length >= 2 Then
        Date1 = dateBody(0).innerText
        Date2 = dateBody(1).innerText
        MsgBox Date1 & " " & Date2
    Else
        MsgBox "Fewer than two discourse-local-date elements found in " & sFile
    End If

    'regex
    pat1 = ".*span.*>(.+?)<"
    Date1 = simpleRegex(sHtml, pat1, 0)
    Date2 = simpleRegex(sHtml, pat1, 1)
    MsgBox Date1 & " " & Date2

End Sub

' simpleRegex
' Returns the first capture group of the sNr-th (zero-based) match of
' strPattern in strInput.
'
' Parameters:
'   strInput   - text to search.
'   strPattern - regular expression; it should contain at least one capture group.
'   sNr        - zero-based index of the match whose first group is returned.
'
' Returns:
'   The captured text, or the string "false" when there is no match, when sNr
'   is outside the range of matches, or when the pattern has no capture group.
'   When strPattern is empty the result is left unassigned (Empty).
'
' Requires a reference to Microsoft VBScript Regular Expressions 5.5.
Function simpleRegex(strInput As String, strPattern As String, sNr As Long)
    Dim regEx As New RegExp
    Dim sReg As Object

    If strPattern <> "" Then
        With regEx
            .Global = True
            .MultiLine = True
            .IgnoreCase = True
            .Pattern = strPattern
        End With

        ' Run the expression once and inspect the resulting collection rather
        ' than calling Test repeatedly before Execute.
        Set sReg = regEx.Execute(strInput)

        ' Default to the "no result" marker; overwrite only when the requested
        ' match and its first capture group both exist.
        simpleRegex = "false"
        If sNr >= 0 And sNr < sReg.Count Then
            If sReg(sNr).SubMatches.Count > 0 Then
                simpleRegex = sReg(sNr).SubMatches(0)
            End If
        End If
    End If
End Function