Search code examples
excelvbaweb-scrapingxmlhttprequestqueryselector

How to get the content of this website to Excel with VBA and selectors?


My problem with a website:

Although the data changes regularly, the structure of the data always remains the same. I try to transfer the content (only the last two columns with the headers: Aktenzeichen and Aufgehoben) to excel in 3 columns (ID-Number, Date, Time) by splitting the values of Aufgehoben in date and time.

My problem is that the values in "Bundesland" and "Amtsgericht" columns (even though I don't need those) have a different frequency of occurrence than the rest of the data and mess up all the trs and tds in the html-structure and so I don't understand how to use the selectors! Any ideas? THX.

my...mmm...code:

Sub GetData()

    Const URL = "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML"
    Dim html As New HTMLDocument
    Dim elmt As Object
    Dim x As long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .send
        html.body.innerHTML = .responseText
    End With
                   
     For x = 0 to ????.Length - 1
     Set elmt = html.querySelectorAll("???")
       ActiveSheet.Cells(y + 2, 2) = elmt.Item(?).innerText  'Aktenzeichen
       ActiveSheet.Cells(y + 2, 3) = elmt.Item(?).innerText  'Date
       ActiveSheet.Cells(y + 2, 4) = elmt.Item(?).innerText  'Time
     Next

End Sub

Solution

  • Work with arrays, touch the sheet only at the end to reduce I/O, convert to using css selectors and doing your filtering within those selectors, use typed functions and reduce the code complexity.


    Results:

    Difference over 10,000 runs (single request, multiple parses and placing data in sheet).

    enter image description here

    Median difference was 5.4 times faster to make those changes over 10,000 runs.

    No other difference in optimizations than described above. ScreenUpdating was switched off for both.


    VBA:

    Option Explicit
    
    Public r As Long
    
    Public Sub GetContents()
        
        Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
        
        Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
        
        With http
            .Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
            .send
            html.body.innerHTML = .responseText
        End With
        
        Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
        
        Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
        Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
        
        Dim headers() As Variant, results() As Variant
        
        headers = Array("ID-Number", "Date", "Time")
        ReDim results(1 To 1000, 1 To UBound(headers) + 1)
        
        With colOne
            
            For i = 0 To colOne.Length - 1
        
                UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText
    
            Next
            
        End With
    
        results = Application.Transpose(results)
        ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
        results = Application.Transpose(results)
        
        With ActiveSheet
            .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
        End With
    End Sub
    
    Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String)
        Dim arrCol1() As String, arrCol2And3() As String
        Dim i As Long, datetime() As String
        
        arrCol1 = Split(col1, Chr$(10))
        arrCol2And3 = Split(col2And3, vbCrLf)
        
        For i = LBound(arrCol1) To UBound(arrCol1)
            r = r + 1
            results(r, 1) = Trim$(arrCol1(i))
            datetime = Split(arrCol2And3(i), Chr$(32))
            results(r, 2) = datetime(0): results(r, 3) = datetime(1)
        Next
    End Sub
    

    Passing r as param in signature (rather than Public):

    Option Explicit
    
    Public Sub GetContents()
        
        Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
        
        Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
        
        With http
            .Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
            .send
            html.body.innerHTML = .responseText
        End With
        
        Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
        
        'Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
        Set colOne = html.querySelectorAll("td td + td[style*='150']")
        'Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
        Set colTwoAndThree = html.querySelectorAll("td td + td[style*='150'] + td")
        
        Dim headers() As Variant, results() As Variant
        
        headers = Array("ID-Number", "Date", "Time")
        ReDim results(1 To 1000, 1 To UBound(headers) + 1)
        
        With colOne
            
            For i = 0 To colOne.Length - 1
        
                UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText, r
    
            Next
            
        End With
    
        results = Application.Transpose(results)
        ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
        results = Application.Transpose(results)
        
        With ActiveSheet
            .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
        End With
    End Sub
    
    Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String, ByRef r As Long)
        Dim arrCol1() As String, arrCol2And3() As String
        Dim i As Long, datetime() As String
        
        arrCol1 = Split(col1, Chr$(10))
        arrCol2And3 = Split(col2And3, vbCrLf)
        
        For i = LBound(arrCol1) To UBound(arrCol1)
            r = r + 1
            results(r, 1) = Trim$(arrCol1(i))
            datetime = Split(arrCol2And3(i), Chr$(32))
            results(r, 2) = datetime(0): results(r, 3) = datetime(1)
        Next
    End Sub