如何控制/解析位于HTML头上方的数组

如何控制/解析位于HTML头上方的数组,html,excel,vba,web-scraping,Html,Excel,Vba,Web Scraping,很难解析在中找到的HTML 响应标头为: 内容类型:text/html 内容长度:28078 我尝试了以下方法: 获取html表并按行、节点或元素进行解析。这三个都能用。html很笨拙,而且有随机空格,没有ID,只有类名。解析是一种痛苦 我看到html上面有一个表格,我不确定是什么数据格式。如何获取/解析此数据 将我的代码放在html下面,忽略底部部分,我已经编码/分解了几十种不同的尝试,包括将文本/html加载到xml中并使用它 "<!--mod v2--> <!--ALL

我很难解析在该页面(URL见下方代码)中找到的HTML。

响应标头为: 内容类型:text/html 内容长度:28078

我尝试了以下方法:

获取html表并按行、节点或元素进行解析。这三个都能用。html很笨拙,而且有随机空格,没有ID,只有类名。解析是一种痛苦

我注意到在<html>标签上方的注释中有一组类似表格的数据,但不确定它是什么数据格式。如何获取并解析这些数据?

我的代码放在HTML下面。请忽略代码的底部部分——我已经写过又推翻了几十种不同的尝试,包括把text/html加载为XML来处理。

"<!--mod v2-->
<!--ALL INQUIRY TYPE: BXS1PRA3-->
<!--AllInquiryType :: BXS1PRA3
ECBIn :: 
AllBin :: 1054664
AllCount :: 0001
AllBorough :: 
BoilerNumber :: 
OpenObjectionFlag :: 
BiswebReporting :: 
InternalFlag :: 
BoroughKey :: I
StCodeKey :: 
BldgLoKey :: 
JobSubmDate :: 
AllNumbHous :: 
AllStrt :: 
AllBoroughName :: 
AllIsn :: 
PassWorkOrderNumber :: 
PassJobNumber :: 
PassDocNumber :: 
SeqNumber :: 
PPremise60 :: 
PassRecordNumber :: 
RqidPriorToA :: 
RqidPriorToP :: 
RqidPriorToS :: 
RqidItemStatusAll :: 
RqidItemStatusOpen :: 
RqidMultiLineFirst :: 
RqidMultiLineAll :: 
RqidIndex :: 
NotUsed :: 
StartFloorSc52 :: 
JobTypeDesc :: 
PassDeviceNumber :: 
NextIndex :: 
EMPTYPARAMCANBEUSED :: 
AllLicNoCurrent :: 
AllLicTypeCurrent :: 
Empty1 :: 
AllCnNumber :: 
AllCdNumber :: 
ApplNumOcv5 :: 
PageNumber :: 
PfKey :: 
AllEmailAddrCurrent :: 
Empty2 :: 
StartActiveSelect :: 
AllControlNumber :: 
AllStartDate :: 
AllEndDate :: 
AllJobType :: 
AllCommBd :: 
AllViolationType :: 
AllIsn2 :: 
AllTblType :: 
AllBlock :: 
AllLot :: 
AllTblCode :: 
TblBusinessName :: 
AllJAppProfTitle :: 
AllJAppLicNumber :: 
AllMetrixId :: 
InPassword :: 
InUserId :: 
NavFlag :: 
STypeOcv3 :: 
PtTempStatus :: 
PtOtherAuthApproval :: 
PtOtherAuthSig :: 
FillerData :: 
PassTempJobNumber :: 
AllKey1 :: 
AllKey2 :: 
AllFilterLarge :: 
AllFileId :: 
AllMemoType :: 
AllNumOfDataLines :: 
ReadSw :: D
FinFlag :: 
VbLoginId :: 
SustainableFlag :: 
-->
<!--Fin :: 0
ErrorMsg :: 
MoreErrors :: 
MFErrorArray ::  ARRAY[2 * 120]
[1]
    [0:ErrorCode]{ }
    [1:ArrayIndex]{ }
MFErrorArray2 ::  ARRAY[3 * 60]
[1]
    [0:ErrorCode2]{ }
    [1:Substitution]{ }
    [2:ArrayIndex2]{ }
NotUsed :: 
AllControlNumber :: 07/30/1
Datu :: 8
Pgm :: BXS1PRA3
VlNumbHous :: 2421
NmStrt :: 2 AVENUE
NmBoro :: MANHATTAN
VlBin :: 1054664
VlNumbZip :: 10035
VlTaxBlock :: 01789
VlTaxLot :: 00024
VlCensTract :: 242
VlHlthArea :: 1700
HseLo :: 
HseHi :: 
GlJobType :: 
GlPageN :: 0001
GlRecCountN :: 0000000008
FoilIndicator :: 
GlMax :: 
DebugMsg :: 
BoroughName :: 
NumbHous :: 
Strt :: 
TransactionExecuted :: BXS1PRA3
Lines ::  ARRAY[22 * 40]
[1]
    [0:Pra3Isn]{0000564806}
    [1:Fd]{12062006}
    [2:Job]{104619478}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{001}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{12112006}
    [11:Applicant]{DEL MAST}
    [12:Rep]{}
    [13:Jobdes]{INSTALLATION OF A SCAFFOLD 16&#039; LONG X 38&#039; HIGH ON THE EXISTING SIDEWALK SH}
    [14:JAppLicNumberDisp]{OT}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[2]
    [0:Pra3Isn]{0000555722}
    [1:Fd]{09212006}
    [2:Job]{104550629}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{001,002,003,004,005}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{09212006}
    [11:Applicant]{Shapiro}
    [12:Rep]{}
    [13:Jobdes]{Filing herewith to make building structurally stable inconjunction with de}
    [14:JAppLicNumberDisp]{0060597 PE}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[3]
    [0:Pra3Isn]{0000520307}
    [1:Fd]{02092006}
    [2:Job]{104294096}
    [3:Ap]{01}
    [4:JobType]{DM}
    [5:Demo]{}
    [6:FlrInjq]{001}
    [7:Gas]{}
    [8:Js]{X}
    [9:Jobstatus]{SIGNED OFF}
    [10:Dt]{02022007}
    [11:Applicant]{JACOBSON}
    [12:Rep]{}
    [13:Jobdes]{}
    [14:JAppLicNumberDisp]{1788510 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[4]
    [0:Pra3Isn]{0000462054}
    [1:Fd]{07192004}
    [2:Job]{103835735}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{GRD}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{08192005}
    [11:Applicant]{SINGH}
    [12:Rep]{}
    [13:Jobdes]{CONSTRUCTION OF 65 LF HEAVY DUTY SIDEWAL K SHED WITH NOSTORAGE  AS PER P}
    [14:JAppLicNumberDisp]{}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[5]
    [0:Pra3Isn]{0000184027}
    [1:Fd]{06121997}
    [2:Job]{101534190}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{                         001 thru 005}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{07011997}
    [11:Applicant]{KO}
    [12:Rep]{}
    [13:Jobdes]{FIRE ESCAPES TO REPLACE EXISTING FIRE BALCONY.}
    [14:JAppLicNumberDisp]{0011493 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[6]
    [0:Pra3Isn]{0000010982}
    [1:Fd]{10041990}
    [2:Job]{100121823}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{1,  CLR}
    [7:Gas]{}
    [8:Js]{P}
    [9:Jobstatus]{APPROVED}
    [10:Dt]{10121990}
    [11:Applicant]{ESHKAR}
    [12:Rep]{}
    [13:Jobdes]{WORK AT NEW LAUNDROMAT ON FIRST FLOOR. N EW WASHERS, DRYERS, NEW HUNG}
    [14:JAppLicNumberDisp]{0018190 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[7]
    [0:Pra3Isn]{0000010981}
    [1:Fd]{10041990}
    [2:Job]{100121814}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{1}
    [7:Gas]{}
    [8:Js]{J}
    [9:Jobstatus]{P/E DISAPPROVED}
    [10:Dt]{05091991}
    [11:Applicant]{ESHKAR}
    [12:Rep]{}
    [13:Jobdes]{INSTALL ANSUL SYSTEM AT RESTAURANT. INST ALL SPRINKLERS AT DRYERS AT}
    [14:JAppLicNumberDisp]{0018190 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[8]
    [0:Pra3Isn]{0000006469}
    [1:Fd]{07231990}
    [2:Job]{100079852}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{GRD}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{05151991}
    [11:Applicant]{JONES}
    [12:Rep]{}
    [13:Jobdes]{ERECT 27 FEET OF SIDEWALK SHED FOR REPAI R OF FACADE. NO CHANGE IN USE}
    [14:JAppLicNumberDisp]{}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
-->


<html> 
<head>
    <title>Job Overview</title>
    <link rel=""stylesheet"" type=""text/css"" href=""bsqpm.css"" media=""screen"">
    <link rel=""stylesheet"" type""text/css"" href=""print.css"" media=""print"">
    <link rel=""shortcut icon"" href=""/favicon.ico"" type=""image/x-icon"" />
    <script language=""javascript"" src=""bis_lib.js""></script>
    <script language=""javascript"" src=""sorttable.js""></script>
    <script language=""javascript"">
    function $(eln)
    {
        return document.getElementById(eln);
    }
    </script>
        <script language=""javascript"">
    <!--
        function page(loc, ref)
        {
            //Commented out the usagelog creation for Caching on 4-30-15
            //var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
            //var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
            //(new Image(1,1)).src = u;
        }

        //onclick=""page('/path/place', document.location);""

                var _gaq = _gaq || [];
                _gaq.push(['_setAccount', 'UA-16591777-1']);
                _gaq.push(['_trackPageview']);

                (function() {
                        var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
                        ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
                        var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
                })();

    //-->
    </script>

</head>


<body bgcolor=""#ffffff"" leftmargin=""0"" topmargin=""0"" marginheight=""0"" marginwidth=""0"" bgproperties=""fixed"">
<center>
    <table border=""0"" cellpadding=""3"" cellspacing=""0"" width=""100%"">  <!--- Start of Header --->
        <tr>
            <td class=""header"" align=""center"" colspan=""3"" width=""100%"">
            <DIV class=""noprint"">
                <table border=""0"" cellpadding=""0"" cellspacing=""0"" width=""100%"">
                    <tr>
                        <td class=""header"">
                &nbsp;&nbsp;<a href='bsqpm01.jsp'>BIS Menu</a>&nbsp;&nbsp;|&nbsp;&nbsp;Applications
                        </td>
                        <td class=""header2"" align=""right"">&nbsp;&nbsp;<a href=""http://www1.nyc.gov/site/buildings/homeowner/homeowner-faqs.page"">FAQs</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href=""http://www1.nyc.gov/site/buildings/about/acronym-glossary.page"">Glossary</a>
                &nbsp;&nbsp;
                            <script language=""javascript"">
                            <!---
                                print_today();
                            //--->
                            </script>
                                <script language=""javascript"">
    <!--
        function page(loc, ref)
        {
            //Commented out the usagelog creation for Caching on 4-30-15
            //var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
            //var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
            //(new Image(1,1)).src = u;
        }

        //onclick=""page('/path/place', document.location);""

                var _gaq = _gaq || [];
                _gaq.push(['_setAccount', 'UA-16591777-1']);
                _gaq.push(['_trackPageview']);

                (function() {
                        var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
                        ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
                        var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
                })();

    //-->
    </script>

                        </td>
                    </tr>
                </table>
            </DIV>
            </td>
        </tr>
        <tr>
            <td colspan=2><a href=""http://www1.nyc.gov/site/buildings/index.page""><img src=""./images/doblogo_1.jpg"" alt=""DOB Logo - Link to Homepage"" border=""0""></a></td>  <!--- Cell for DOB Logo --->
            <td align=""right""><a href=""http://www1.nyc.gov/""><img src=""./images/nyclogo.gif"" alt=""NYC.gov Logo - Link to Homepage"" border=""0""></a>
                <br>
                <a href=""https://www.nyc.gov/portal/site/nycgov/menuitem.63099911d804683c09416f1076a09da0/"" onclick=""javascript:page('/ext/signupnews', document.location);"">
                    <img border=""0"" src=""images/clckhere.gif"" onmouseover=""this.src='images/clckhere_over.gif';"" onmouseout=""this.src='images/clckhere.gif';"">
                </a>
            </td>
        </tr>
        <tr>
            <td colspan=3 class=""nychdg"" align=""center""><b>NYC Department of Buildings</b></td>
        </tr>
        <tr>
            <td colspan=3 class=""mainhdg"" align=""center"">Job Overview</td>
        </tr>

    </table>        <!--- End of Header --->


<!---Start Message --->

虽然不太理想,但通过直接复制粘贴,这确实绕过了中间表格格式处理上的困难。我理解您可能希望有一种更结构化的方法来使用“ARRAY”信息。


参考文献:

  • HTML对象库
  • Microsoft Forms 2.0对象库

  • 这只是HTML页面中的纯文本注释。我该如何获取它?可以通过父-子节点的方式解析吗?[1]等标记表示什么(如果有含义的话)? — 不清楚这段注释的用途,也不清楚它是什么格式;它肯定不是XML。如果要解析它,需要逐行处理。 — @QHarr 这看起来比我一直使用的逐行打印输出要好。我不熟悉outerHTML属性,还需要对这些对象多加探索。谢谢你! — +1 — +1 — +1,成功了 :-)
    Sub getAndParse()
        ' Downloads the jobs-by-location page for a fixed BIN, writes the raw
        ' response text into cell A1 of the active sheet, and prints the parsed
        ' content (top-level body children, then the rows of one table) to the
        ' Immediate window.
        '
        ' Uses early-bound MSXML2.XMLHTTP60 and MSHTML.HTMLTableRow, so the
        ' project needs references to "Microsoft XML, v6.0" and
        ' "Microsoft HTML Object Library".

        ' Upper bound on re-requests while the response still contains the
        ' "Updating" marker; the original loop never re-requested and could
        ' therefore spin forever.
        Const MAX_ATTEMPTS As Long = 10
        ' Zero-based index of the table to print. Same index that GetInfo in
        ' this file uses -- NOTE(review): confirm against the live page.
        Const TABLE_INDEX As Long = 5

        Dim bin As String
        bin = "1054664"    ' string literal: bin is declared As String

        Dim URLOne As String
        URLOne = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=" & bin

        Dim xmlOne As MSXML2.XMLHTTP60
        Set xmlOne = New MSXML2.XMLHTTP60

        Dim htmlOne As Object
        Set htmlOne = CreateObject("htmlfile")

        Dim pageText As String
        Dim attempt As Long

        For attempt = 1 To MAX_ATTEMPTS
            ' Third argument False = synchronous: .send returns only once the
            ' response is complete, so no readyState polling is needed.
            xmlOne.Open "GET", URLOne, False
            xmlOne.setRequestHeader "Content-Type", "application/xml"
            xmlOne.send

            If xmlOne.Status <> 200 Then
                MsgBox "Connection Unable To Be Made, Try Again"
                Exit Sub
            End If

            pageText = xmlOne.responseText
            ' Case-sensitive search (vbBinaryCompare = 0, as in the original).
            If InStr(1, pageText, "Updating", vbBinaryCompare) = 0 Then Exit For

            ' Response still carries the marker: pause, then request again.
            Application.Wait Now + TimeValue("0:00:01")
        Next attempt

        If InStr(1, pageText, "Updating", vbBinaryCompare) > 0 Then
            ' Gave up after MAX_ATTEMPTS; do not parse a placeholder response.
            MsgBox "Connection Unable To Be Made, Try Again"
            Exit Sub
        End If

        htmlOne.body.innerHTML = pageText

        Debug.Print xmlOne.getAllResponseHeaders
        [A1] = pageText

        Dim nde As Object
        For Each nde In htmlOne.body.Children
            Debug.Print nde.innerText
        Next nde

        ' The original iterated tbl.Rows without ever assigning tbl, which
        ' fails at runtime; locate the table explicitly and bail out if the
        ' page does not contain enough tables.
        Dim tables As Object
        Set tables = htmlOne.getElementsByTagName("table")
        If tables.Length <= TABLE_INDEX Then
            Debug.Print "Table index " & TABLE_INDEX & " not present in response"
            Exit Sub
        End If

        Dim tbl As Object
        Set tbl = tables.Item(TABLE_INDEX)

        Dim tblRow As MSHTML.HTMLTableRow
        For Each tblRow In tbl.Rows
            Debug.Print tblRow.innerText
        Next tblRow
    End Sub
    
    Option Explicit
    Public Sub GetInfo()
        ' Fetches the jobs-by-location page, takes the table at index 5 of the
        ' document's <table> collection, and pastes that table's outerHTML into
        ' cell A1 of the active sheet by way of the clipboard.
        '
        ' Needs references to the HTML Object Library (HTMLDocument, HTMLTable)
        ' and the Microsoft Forms 2.0 Object Library (MSForms.DataObject).
        Const URL As String = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=1054664"

        Dim requester As Object
        Dim pageDoc As HTMLDocument
        Dim jobsTable As HTMLTable
        Dim clipData As Object

        Set requester = CreateObject("MSXML2.XMLHTTP.6.0")

        Application.ScreenUpdating = False

        Set pageDoc = GetHTMLDoc(URL, requester)
        Set jobsTable = pageDoc.getElementsByTagName("table")(5)

        ' Put the table markup on the clipboard, then paste it at A1.
        Set clipData = New MSForms.DataObject
        clipData.SetText jobsTable.outerHTML
        clipData.PutInClipboard
        ActiveSheet.Cells(1, 1).PasteSpecial

        Application.ScreenUpdating = True
    End Sub
    
    Public Function GetHTMLDoc(ByVal URL As String, ByRef xmlHttp As Object) As HTMLDocument
        ' Issues a synchronous GET for URL through the supplied xmlHttp object
        ' and returns a new HTMLDocument whose body.innerHTML is the response
        ' text. The HTTP status is not inspected.
        Dim parsedDoc As HTMLDocument

        ' Third argument False makes the request synchronous.
        xmlHttp.Open "GET", URL, False
        xmlHttp.setRequestHeader "Content-Type", "text/xml"
        xmlHttp.send

        Set parsedDoc = New HTMLDocument
        parsedDoc.body.innerHTML = xmlHttp.responseText

        Set GetHTMLDoc = parsedDoc
    End Function