Vba 从网站抓取数据-在不同的播放器页面上接收错误
我正试图从basketball reference.com上搜集比赛日志数据。这对我选择的两名球员(德马尔·德罗赞和拉马库斯·阿尔德里奇)起到了完美的作用。但后来我开始调查其他球员,而这并不能为许多其他球员(凯文·杜兰特)搜集数据 我不知道为什么它不起作用。例如,我试过斯蒂芬·库里,效果很好,但像德雷蒙德·格林和凯文·杜兰特这样的玩家,代码根本不会刮取数据。由于某种原因,在列日期之后,一切都停止了工作Vba 从网站抓取数据-在不同的播放器页面上接收错误,vba,Vba,我正试图从basketball reference.com上搜集比赛日志数据。这对我选择的两名球员(德马尔·德罗赞和拉马库斯·阿尔德里奇)起到了完美的作用。但后来我开始调查其他球员,而这并不能为许多其他球员(凯文·杜兰特)搜集数据 我不知道为什么它不起作用。例如,我试过斯蒂芬·库里,效果很好,但像德雷蒙德·格林和凯文·杜兰特这样的玩家,代码根本不会刮取数据。由于某种原因,在列日期之后,一切都停止了工作 Sub Data() Dim ieObj As InternetExplorer
Sub Data()
Dim ieObj As InternetExplorer
Dim htmlEle As IHTMLElement
Dim i As Integer
i = 1
Set ieObj = New InternetExplorer
ieObj.Visible = True
ieObj.navigate "https://www.basketball-reference.com/players/d/duranke01/gamelog/2019"
Application.ScreenUpdating = False
Application.DisplayStatusBar = True
Application.Wait Now + TimeValue("00:00:05")
For Each htmlEle In ieObj.document.getElementsByClassName("stats_table")(0).getElementsByTagName("tr")
With ActiveSheet
.Range("A" & i).Value = htmlEle.Children(0).textContent
.Range("B" & i).Value = htmlEle.Children(1).textContent
.Range("C" & i).Value = htmlEle.Children(2).textContent
.Range("D" & i).Value = htmlEle.Children(3).textContent
End With
i = i + 1
On Error Resume Next
Next htmlEle
End Sub
错误总是发生在这一行:
.Range(“D”&i).Value=htmlEle.Children(3).textContent
我尝试将列跳过,但仍然不起作用。我发现在表中使用id没有问题,如
.responseText
和使用over browser中所示
我只测试了3个URL-在表1 A1:A3中
在这个网站上,有时你会遇到评论中的表格,所以我在处理之前去掉了评论标签。这对于我尝试的链接是不必要的
我使用剪贴板复制粘贴,但您可以使用
Set hTable = html.getElementById("pgl_basic")
然后使用getElementsByTagName
根据需要循环tr
和td
参考资料(VBE>工具>参考资料):
选项显式
公共子GetPlayerInfo()
Dim URL(),i为长,html为HTMLDocument,hTable为对象
将ws作为工作表、wsCurrent作为对象、剪贴板作为对象
将最后一行变暗为长,playerIdentifier为字符串,arr()为字符串
设置剪贴板=GetObject(“新建:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}”)
设置ws=ThisWorkbook.Worksheets(“Sheet1”)
lastRow=ws.Cells(ws.rows.Count,“A”).End(xlUp).Row
URL=Application.Transpose(ws.Range(“A1:A”&lastRow.Value)
设置html=新的HTMLDocument
使用CreateObject(“MSXML2.XMLHTTP”)
对于i=LBound(URL)到UBound(URL)
.打开“获取”,URL(i),False
.发送
html.body.innerHTML=Replace$(Replace$(.responseText,“-->”,vbNullString),“奇怪的是,我确实检查了DOM,但没有发现可以防止这种情况发生的问题。尽管我对Selenium/BeautifulScript和Python更为熟悉(顺便说一句,您应该尝试使用它而不是VBA)因此,也许我遗漏了一些东西第一种方法的加载速度比第二种方法快得多。主要区别是什么。第一种方法不启动浏览器。因此,您没有呈现内容所需的开销或时间。缺点是,您不会获得呈现的内容,但在这种情况下,这并不重要。
Option Explicit
Public Sub GetPlayerInfo()
Dim urls(), i As Long, html As HTMLDocument, hTable As Object
Dim ws As Worksheet, wsCurrent As Object, clipboard As Object
Dim lastRow As Long, playerIdentifier As String, arr() As String
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).Row
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", urls(i), False
.send
html.body.innerHTML = Replace$(Replace$(.responseText, "-->", vbNullString), "<!--", vbNullString) 'remove comments
arr = Split(urls(i), "/")
playerIdentifier = arr(5)
If SheetExists(playerIdentifier) Then
With ThisWorkbook.Worksheets(playerIdentifier).Cells
.ClearContents
.ClearFormats
Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
End With
Else
Set wsCurrent = ThisWorkbook.Worksheets.Add
wsCurrent.name = playerIdentifier
End If
Set hTable = html.querySelector("#pgl_basic")
clipboard.SetText hTable.outerHTML
clipboard.PutInClipboard
wsCurrent.Range("A1").PasteSpecial
Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
Application.CutCopyMode = False
Next
End With
End Sub
Public Function SheetExists(ByVal sheetName As String) As Boolean '<== function by @Rory
SheetExists = Evaluate("ISREF('" & sheetName & "'!A1)")
End Function
Option Explicit
Public Sub GetPlayerInfo()
Dim ieObj As InternetExplorer, htmlEle As IHTMLElement
Dim urls(), i As Long, j As Long, hTable As Object
Dim ws As Worksheet, wsCurrent As Object
Dim lastRow As Long, playerIdentifier As String, arr() As String
Application.ScreenUpdating = False
On Error GoTo errHand
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
Set ieObj = New InternetExplorer
With ieObj
.Visible = True
For j = LBound(urls) To UBound(urls)
.navigate urls(j)
While .Busy Or .readyState <> 4: DoEvents: Wend
arr = Split(urls(j), "/")
playerIdentifier = arr(5)
If SheetExists(playerIdentifier) Then
With ThisWorkbook.Worksheets(playerIdentifier).Cells
.ClearContents
.ClearFormats
Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
End With
Else
Set wsCurrent = ThisWorkbook.Worksheets.Add
wsCurrent.Name = playerIdentifier
End If
i = 1
Set hTable = .document.getElementById("pgl_basic")
If Not hTable Is Nothing Then
For Each htmlEle In hTable.getElementsByTagName("tr")
With wsCurrent
.Range("A" & i).Value = htmlEle.Children(0).textContent
.Range("B" & i).Value = htmlEle.Children(1).textContent
.Range("C" & i).Value = htmlEle.Children(2).textContent
.Range("D" & i).Value = htmlEle.Children(3).textContent
End With
i = i + 1
Next htmlEle
End If
Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
Next
End With
errHand:
Application.ScreenUpdating = True
If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
ie.Quit
End Sub