将excel单元格中的HTML解析为多个单元格

将excel单元格中的HTML解析为多个单元格,excel,excel-formula,vba,Excel,Excel Formula,Vba,我的excel单元格中有以下看起来非常凌乱的数据,其中包含大约100行HTML标记: 以下是两个例子: <ul class=""list-unstyled""> <li><span title=""Website"" class=""glyphicon glyphicon-link text-gray""></span> <a href=""https://google.org/"" target=""_blank"">Webs

我的excel单元格中有以下看起来非常凌乱的数据,其中包含大约100行HTML标记:

以下是两个例子:

<ul class=""list-unstyled"">
    <li><span title=""Website"" class=""glyphicon glyphicon-link text-gray""></span> <a href=""https://google.org/"" target=""_blank"">Website</a></li>
    <li><span title=""Website"" class=""glyphicon glyphicon-link text-gray""></span> <a href=""https://www.google.com/"" target=""_blank"">Website 2</a></li>
    <li><span title=""Product"" class=""glyphicon glyphicon-search text-gray""></span> <a href=""http://amazon.com"" target=""_blank"">Product</a></li>
    <li><span title=""Product"" class=""glyphicon glyphicon-search text-gray""></span> <a href=""https://amazon.de/"" target=""_blank"">Product 2</a></li>          
    <li><span title=""Tags"" class=""glyphicon glyphicon glyphicon-tag text-gray""></span>
        <small><span class=""label label-warning"">Available</span></small>
        <small><span class=""label label-warning"">Country</span></small>
    </li>
</ul>
老实说,我不知道如何应对这一挑战

你有什么建议吗?

我有一些想法:

如果在所有VBA中执行此操作(不使用任何库),则可以将html作为字符串搜索并查找
,然后对href&url执行更多子字符串搜索

另一种选择是使用正则表达式。看起来VBA脚本DLL具有正则表达式功能,您可以查看一下

最后一个选项是HTML敏捷包。这是为处理HTML而设计的。我曾经在一个.net项目中使用过它。我现在不记得细节了,但我记得它很容易使用。

Sub-splithtml()
Sub splithtml()
Dim htmlstring As String
Dim rowcount As Integer
Dim website1str As String, website2str As String, website3str As String
Dim product1str As String, product2str As String
Dim spanstr As String

'All the Attribute Nodes to be extracted are hardcoded

 website1str = ">Website</a></li>"
 website2str = ">Website 2</a></li>"
 website3str = ">Website 3</a></li>"
 product1str = ">Product</a></li>"
 product2str = ">Product 2</a></li>"
 spanstr = "</span></small>"

'Create Headers for the xml parsed table
 Cells(1, 2).Value = "Website 1"
 Cells(1, 3).Value = "Website 2"
 Cells(1, 4).Value = "Website 3"
 Cells(1, 5).Value = "Product 1"
 Cells(1, 6).Value = "Product 2"
 Cells(1, 7).Value = "Available"
 Cells(1, 8).Value = "Country"

'Get the number of rows with data in A column
'Assmption:- XML data stored in A column of the sheet

rowcount = Cells(Rows.Count, 1).End(xlUp).row

For i = 2 To rowcount + 1
'Xml is stored in A column and starts from second row, First row is assumed to be header
 htmlstring = Cells(i, 1).Value
'Parses each node and stores in the adjacent column of the column where XML is stored

   htmlstring = GetValue(htmlstring, website1str, i, 2)

   htmlstring = GetValue(htmlstring, website2str, i, 3)

   htmlstring = GetValue(htmlstring, website3str, i, 4)

   htmlstring = GetValue(htmlstring, product1str, i, 5)

   htmlstring = GetValue(htmlstring, product2str, i, 6)

   htmlstring = GetValue(htmlstring, spanstr, i, 7)

   htmlstring = GetValue(htmlstring, spanstr, i, 8)


Next i
End Sub



Function Trimhtml(Mainhtml, Processedhtml)
'Function to  Trim the HTMl nodes that has been parsed
 Dim spanstr As String
 spanstr = "</span></small>"
     Trimhtml = Mainhtml
    If Processedhtml = spanstr Then
      Trimhtml = Mid(Mainhtml, InStr(Mainhtml, Processedhtml) + 15)
    Else
      Trimhtml = Mid(Mainhtml, InStr(Mainhtml, Processedhtml))
    End If
End Function


Function GetValue(Mainhtml, nodevalue, row, column)
'Function to Get Text value from the attribute passed and stored in the row, column passed
 Dim nodestring As String
 Dim FirstPoint As Integer, Secondpoint As Integer
 Dim spanstr As String
 spanstr = "</span></small>"

  If InStr(Mainhtml, nodevalue) > 0 Then
     nodestring = Left$(Mainhtml, InStr(Mainhtml, nodevalue))
     If nodevalue = spanstr Then
       FirstPoint = InStrRev(nodestring, ">")
       Secondpoint = InStrRev(nodestring, "<")
       Returnvalue = Mid(nodestring, FirstPoint + 1, Secondpoint - FirstPoint - 1)
      Else
        FirstPoint = InStr(nodestring, "<a href=")
        Secondpoint = InStr(nodestring, "target=")
        Returnvalue = Mid(nodestring, FirstPoint + 10, Secondpoint - FirstPoint - 13)
       End If
    Cells(row, column).Value = Returnvalue
    GetValue = Trimhtml(Mainhtml, nodevalue)
   Else
    GetValue = Mainhtml
  End If
End Function
作为字符串的Dim htmlstring 将行计数设置为整数 Dim website1str为字符串,website2str为字符串,website3str为字符串 Dim product1str作为字符串,product2str作为字符串 作为字符串的Dim spanstr '所有要提取的属性节点都是硬编码的 网站1str=“>网站” website2str=“>网站2” website3str=“>网站3” product1str=“>产品” product2str=“>产品2” spanstr=“” '为xml解析表创建标题 单元格(1,2)。Value=“网站1” 单元格(1,3)。Value=“网站2” 单元格(1,4)。Value=“网站3” 单元格(1,5)。Value=“产品1” 单元格(1,6)。Value=“产品2” 单元格(1,7)。Value=“可用” 单元格(1,8)。Value=“国家” '获取列中包含数据的行数 '假设:-存储在工作表列中的XML数据 rowcount=单元格(Rows.Count,1).End(xlUp).row 对于i=2到行计数+1 'Xml存储在列中,从第二行开始,第一行假定为标题 htmlstring=单元格(i,1).值 '解析每个节点并存储在存储XML的列的相邻列中 htmlstring=GetValue(htmlstring,网站1STR,i,2) htmlstring=GetValue(htmlstring,网站2STR,i,3) htmlstring=GetValue(htmlstring,网站3STR,i,4) htmlstring=GetValue(htmlstring,product1str,i,5) htmlstring=GetValue(htmlstring,product2str,i,6) htmlstring=GetValue(htmlstring,spanstr,i,7) htmlstring=GetValue(htmlstring,spanstr,i,8) 接下来我 端接头 函数Trimhtml(Mainhtml、Processedhtml) '函数来修剪已解析的HTMl节点 作为字符串的Dim spanstr spanstr=“” Trimhtml=Mainhtml 如果Processedhtml=spanstr,则 Trimhtml=Mid(Mainhtml,InStr(Mainhtml,Processedhtml)+15) 其他的 Trimhtml=Mid(Mainhtml,InStr(Mainhtml,Processedhtml)) 如果结束 端函数 函数GetValue(Mainhtml、nodevalue、行、列) '函数从传递的属性中获取文本值,该属性存储在传递的行和列中 像弦一样的暗淡的结线 将第一个点设置为整数,将第二个点设置为整数 作为字符串的Dim spanstr spanstr=“” 如果InStr(Mainhtml,nodevalue)>0,则 nodestring=Left$(Mainhtml,InStr(Mainhtml,nodevalue)) 如果nodevalue=spanstr,则 FirstPoint=InStrRev(nodestring,“>”)
Secondpoint=instrev(nodestring,“方法是:创建函数,该函数将
HTML
code作为字符串作为参数,并返回与表头键相同的字典。函数体为:

Function ParseHTML(str As String) As Scripting.Dictionary
Set ParseHTML = New Scripting.Dictionary

Dim txt As String
Dim website As Long: website = 0
Dim product As Long: product = 0
Dim i As Long: i = 0

Do While True

    'get all text between <li> and <\li> tags
    'then extract all data from it: title attribute and link
    txt = Mid(str, InStr(1, str, "<li>") + 4, InStr(1, str, "</li>") - InStr(1, str, "<li>") - 4)
    'select which case it is: website, product or tags
    Select Case Mid(txt, InStr(1, txt, "title") + 8, InStr(1, txt, "class") - InStr(1, txt, "title") - 11)
        Case Is = "Website"
            website = website + 1
            'here you extract the link
            ParseHTML.Add "Website " & website, Mid(txt, InStr(1, txt, "<a href") + 10, InStr(1, txt, "target") - InStr(1, txt, "<a href") - 13)
        Case Is = "Product"
            product = product + 1
            'here you extract the link
            ParseHTML.Add "Product " & product, Mid(txt, InStr(1, txt, "<a href") + 10, InStr(1, txt, "target") - InStr(1, txt, "<a href") - 13)
        Case Is = "Tags"
            'if we reached Tags, then all websites are over and need different processing
            Exit Do
    End Select
    'delete processed text
    str = Mid(str, InStr(1, str, "</li>") + 5)

Loop

'since in your table you have 3 places for websites and products, so we need to add them
For i = website + 1 To 3
    ParseHTML.Add "Website " & i, ""
Next i
For i = product + 1 To 3
    ParseHTML.Add "Product " & i, ""
Next i

'now txt is the interior of last <li></li> tag and now we focus on what is
'between <small> and </small> tags
'also we don't need str variable anymore, so we can re-use it
str = Mid(txt, InStr(1, txt, "<small>") + 7, InStr(1, txt, "</small>") - InStr(1, txt, "<small>") - 7)
ParseHTML.Add "Available", Mid(str, InStr(1, str, ">") + 1, Len(str) - InStr(1, str, ">") - 7)
'remove processed part of html
txt = Mid(txt, InStr(1, txt, "</small>") + 8)
'take care of last <small> tag
str = Mid(txt, InStr(1, txt, "<small>") + 7, InStr(1, txt, "</small>") - InStr(1, txt, "<small>") - 7)
ParseHTML.Add "Country", Mid(str, InStr(1, str, ">") + 1, Len(str) - InStr(1, str, ">") - 7)


End Function
它适用于这样安排的表格(已填写):

在列中包含这些标题非常重要

重要

在运行任何操作之前,请在VBA编辑器中转到:
工具->引用,在弹出的窗口中,您需要选择Microsoft脚本运行时。

您可以在Excel中执行此操作,如果这是您需要的

首先,使用文本到列来解析数据

  • 在文本到列中,选择Delimited并点击next
  • 取消选中“分隔符”下的所有框,然后选中“其他”
  • 选中“其他”,然后在文本框中输入双引号
  • 完成
  • 复制以
  • 开头的行(仅复制数据,而不是整行)
  • 在电子表格中的其他位置粘贴特殊内容并检查转置
  • 删除空白行
    希望这就是您所寻找的

    假设您的数据位于单元格A2中,并且您正在单元格B2中为您可以使用以下公式的网站应用公式

        =IF((LEN($A2)-LEN(SUBSTITUTE($A2,"""""Website""""","")))/(LEN("Website")+4)>=COLUMNS($B$1:B1),TRIM(MID(SUBSTITUTE(SUBSTITUTE($A2,"<a href=""""",REPT(" ",LEN($A2)),COLUMNS($B$1:B1)),""""" target",REPT(" ",LEN($A2)),COLUMNS($B$1:B1)),LEN($A2),LEN($A2))),"")
    

    =IF((LEN($A2)-LEN(SUBSTITUTE($A2),)/(LEN(“Website”)+4)>=COLUMNS($B$1:B1),TRIM(MID)(SUBSTITUTE($A2),)

    谢谢您的回复!请您给出一个简短的示例,老实说,我对vba没有太深的了解……请您将代码放入函数中。谢谢!请将代码放入vba函数中。我的excel目前大约有45列。也适用于正在寻找此类解决方案的其他人nction对他们来说更容易使用。谢谢!你必须解析吗?你愿意通过节点/子节点循环吗?另外,我假设你已经通过分配
    Doc
    对象达到了这一点?
    Function ParseHTML(str As String) As Scripting.Dictionary
    Set ParseHTML = New Scripting.Dictionary
    
    Dim txt As String
    Dim website As Long: website = 0
    Dim product As Long: product = 0
    Dim i As Long: i = 0
    
    Do While True
    
        'get all text between <li> and <\li> tags
        'then extract all data from it: title attribute and link
        txt = Mid(str, InStr(1, str, "<li>") + 4, InStr(1, str, "</li>") - InStr(1, str, "<li>") - 4)
        'select which case it is: website, product or tags
        Select Case Mid(txt, InStr(1, txt, "title") + 8, InStr(1, txt, "class") - InStr(1, txt, "title") - 11)
            Case Is = "Website"
                website = website + 1
                'here you extract the link
                ParseHTML.Add "Website " & website, Mid(txt, InStr(1, txt, "<a href") + 10, InStr(1, txt, "target") - InStr(1, txt, "<a href") - 13)
            Case Is = "Product"
                product = product + 1
                'here you extract the link
                ParseHTML.Add "Product " & product, Mid(txt, InStr(1, txt, "<a href") + 10, InStr(1, txt, "target") - InStr(1, txt, "<a href") - 13)
            Case Is = "Tags"
                'if we reached Tags, then all websites are over and need different processing
                Exit Do
        End Select
        'delete processed text
        str = Mid(str, InStr(1, str, "</li>") + 5)
    
    Loop
    
    'since in your table you have 3 places for websites and products, so we need to add them
    For i = website + 1 To 3
        ParseHTML.Add "Website " & i, ""
    Next i
    For i = product + 1 To 3
        ParseHTML.Add "Product " & i, ""
    Next i
    
    'now txt is the interior of last <li></li> tag and now we focus on what is
    'between <small> and </small> tags
    'also we don't need str variable anymore, so we can re-use it
    str = Mid(txt, InStr(1, txt, "<small>") + 7, InStr(1, txt, "</small>") - InStr(1, txt, "<small>") - 7)
    ParseHTML.Add "Available", Mid(str, InStr(1, str, ">") + 1, Len(str) - InStr(1, str, ">") - 7)
    'remove processed part of html
    txt = Mid(txt, InStr(1, txt, "</small>") + 8)
    'take care of last <small> tag
    str = Mid(txt, InStr(1, txt, "<small>") + 7, InStr(1, txt, "</small>") - InStr(1, txt, "<small>") - 7)
    ParseHTML.Add "Country", Mid(str, InStr(1, str, ">") + 1, Len(str) - InStr(1, str, ">") - 7)
    
    
    End Function
    
    Sub ProcessHTML()
    'determine last row in A column
    Dim lastRow As Long: lastRow = Cells(Rows.Count, 1).End(xlUp).Row
    Dim dict As Scripting.Dictionary
    Dim i As Long
    Dim j As Long
    
    For i = 2 To lastRow
        'parse HTML code with our function
        Set dict = ParseHTML(Cells(i, 1).Value)
        For j = 2 To 9
            'write corresponding values from dictionary to cells in table
            Cells(i, j).Value = dict(Cells(1, j).Value)
        Next j
        'get rid of object
        Set dict = Nothing
    Next i
    End Sub
    
        =IF((LEN($A2)-LEN(SUBSTITUTE($A2,"""""Website""""","")))/(LEN("Website")+4)>=COLUMNS($B$1:B1),TRIM(MID(SUBSTITUTE(SUBSTITUTE($A2,"<a href=""""",REPT(" ",LEN($A2)),COLUMNS($B$1:B1)),""""" target",REPT(" ",LEN($A2)),COLUMNS($B$1:B1)),LEN($A2),LEN($A2))),"")
    
        =IF((LEN($A2)-LEN(SUBSTITUTE($A2,"""""Product""""","")))/(LEN("Product")+4)>=COLUMNS($E$1:E1),TRIM(MID(SUBSTITUTE(SUBSTITUTE(MID($A2,FIND("""""Product""""",$A2,1),LEN($A2)),"<a href=""""",REPT(" ",LEN($A2)),COLUMNS($E$1:E1)),""""" target",REPT(" ",LEN($A2)),COLUMNS($E$1:E1)),LEN($A2),LEN($A2))),"")