My problem with a website:
Although the data changes regularly, the structure of the data always remains the same. I try to transfer the content (only the last two columns with the headers: Aktenzeichen and Aufgehoben) to excel in 3 columns (ID-Number, Date, Time) by splitting the values of Aufgehoben in date and time.
My problem is that the values in "Bundesland" and "Amtsgericht" columns (even though I don't need those) have a different frequency of occurrence than the rest of the data and mess up all the trs and tds in the html-structure and so I don't understand how to use the selectors! Any ideas? THX.
my...mmm...code:
Sub GetData()
Const URL = "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML"
Dim html As New HTMLDocument
Dim elmt As Object
Dim x As long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
html.body.innerHTML = .responseText
End With
For x = 0 to ????.Length - 1
Set elmt = html.querySelectorAll("???")
ActiveSheet.Cells(y + 2, 2) = elmt.Item(?).innerText 'Aktenzeichen
ActiveSheet.Cells(y + 2, 3) = elmt.Item(?).innerText 'Date
ActiveSheet.Cells(y + 2, 4) = elmt.Item(?).innerText 'Time
Next
End Sub
Work with arrays, touch the sheet only at the end to reduce I/O, convert to using css selectors and doing your filtering within those selectors, use typed functions and reduce the code complexity.
Results:
Difference over 10,000 runs (single request, multiple parses and placing data in sheet).
Median difference was 5.4 times faster to make those changes over 10,000 runs.
No other difference in optimizations than described above. ScreenUpdating
was switched off for both.
VBA:
Option Explicit
Public r As Long
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub
Passing r as param in signature (rather than Public):
Option Explicit
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
'Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colOne = html.querySelectorAll("td td + td[style*='150']")
'Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Set colTwoAndThree = html.querySelectorAll("td td + td[style*='150'] + td")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText, r
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String, ByRef r As Long)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub