Search code examples
jsonvbaexcelweb-scrapingxmlhttprequest

Loop through each table on javascrape webpage with VBA macro


I'm trying to webscrape multiple tables from a website. So far I have built an excel VBA macro to do this. I also figured out how to get all the data when it is on multiple pages in the website. For instance, if I have 1000 results but 50 are displayed on each page. The problem is that I have the same 5 tables on multiple pages because each table has 1000 results.

My code can only loop through each page for 1 table. I also have written code to grab each table, but I cannot figure out how to do that for each of the 50 search results (each page).

How can I loop through multiple tables and click the next page in the process to capture all the data?

Sub ETFDat()

    Dim IE As Object
    Dim i As Long
    Dim strText As String
    Dim jj As Long
    Dim hBody As Object
    Dim hTR As Object
    Dim hTD As Object
    Dim tb As Object
    Dim bb As Object
    Dim Tr As Object
    Dim Td As Object
    Dim ii As Long
    Dim doc As Object
    Dim hTable As Object
    Dim y As Long
    Dim z As Long
    Dim wb As Excel.Workbook
    Dim ws As Excel.Worksheet

    Set wb = Excel.ActiveWorkbook
    Set ws = wb.ActiveSheet
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = True
    y = 1   'Column A in Excel
    z = 1   'Row 1 in Excel
    Sheets("Fund Basics").Activate
    Cells.Select
    Selection.Clear

    IE.navigate "http://www.etf.com/channels/smart-beta-etfs/channels/smart-       beta-etfs?qt-tabs=0#qt-tabs" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
    Do While IE.busy: DoEvents: Loop
    Do While IE.ReadyState <> 4: DoEvents: Loop
    Set doc = IE.document
    Set hTable = doc.getElementsByTagName("table")    '.GetElementByID("tablePerformance")
    ii = 1
    Do While ii <= 17
        For Each tb In hTable
            Set hBody = tb.getElementsByTagName("tbody")
            For Each bb In hBody
                Set hTR = bb.getElementsByTagName("tr")
                For Each Tr In hTR
                    Set hTD = Tr.getElementsByTagName("td")
                    y = 1 ' Resets back to column A
                    For Each Td In hTD
                        ws.Cells(z, y).Value = Td.innerText
                        y = y + 1
                    Next Td
                    DoEvents
                    z = z + 1
                Next Tr
                Exit For
            Next bb
            Exit For
        Next tb
        With doc
            Set elems = .getElementsByTagName("a")
            For Each e In elems
                If (e.getAttribute("id") = "nextPage") Then
                    e.Click
                    Exit For
                End If
            Next e
        End With
        ii = ii + 1
        Application.Wait (Now + TimeValue("00:00:05"))
    Loop

    MsgBox "Done"

End Sub

Solution

  • There is the example showing how the data could be retrieved from the website using XHRs and JSON parsing, it consists of several steps.

    1. Retrieve the data.

    I looked into a little with XHRs using Chrome Developer Tools Network tab. Most relevant data I found is JSON string returned by GET XHR from http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/50/50/1 after I clicked the next page button:

    GET XHR

    The response has the following structure for single row item:

    [
      {
        "productId": 576,
        "fund": "iShares Russell 1000 Value ETF",
        "ticker": "IWD",
        "inceptionDate": "2000-05-22",
        "launchDate": "2000-05-22",
        "hasSegmentReport": "true",
        "genericReport": "false",
        "hasReport": "true",
        "fundsInSegment": 20,
        "economicDevelopment": "Developed Markets",
        "totalRows": 803,
        "fundBasics": {
          "issuer": "<a href='/channels/blackrock-etfs' alt='BlackRock'>BlackRock</a>",
          "expenseRatio": {
            "value": 20
          },
          "aum": {
            "value": 36957230250
          },
          "spreadPct": {
            "value": 0.000094
          },
          "segment": "Equity: U.S. - Large Cap Value"
        },
        "performance": {
          "priceTrAsOf": "2017-02-27",
          "priceTr1Mo": {
            "value": 0.031843
          },
          "priceTr3Mo": {
            "value": 0.070156
          },
          "priceTr1Yr": {
            "value": 0.281541
          },
          "priceTr3YrAnnualized": {
            "value": 0.099171
          },
          "priceTr5YrAnnualized": {
            "value": 0.13778
          },
          "priceTr10YrAnnualized": {
            "value": 0.061687
          }
        },
        "analysis": {
          "analystPick": null,
          "opportunitiesList": null,
          "letterGrade": "A",
          "efficiencyScore": 97.977103,
          "tradabilityScore": 99.260541,
          "fitScore": 84.915658,
          "leveragedFactor": null,
          "exposureReset": null,
          "avgDailyDollarVolume": 243848188.037378,
          "avgDailyShareVolume": 2148400.688889,
          "spread": {
            "value": 0.010636
          },
          "fundClosureRisk": "Low"
        },
        "fundamentals": {
          "dividendYield": {
            "value": 0.021543
          },
          "equity": {
            "pe": 27.529645,
            "pb": 1.964124
          },
          "fixedIncome": {
            "duration": null,
            "creditQuality": null,
            "ytm": {
              "value": null
            }
          }
        },
        "classification": {
          "assetClass": "Equity",
          "strategy": "Value",
          "region": "North America",
          "geography": "U.S.",
          "category": "Size and Style",
          "focus": "Large Cap",
          "niche": "Value",
          "inverse": "false",
          "leveraged": "false",
          "etn": "false",
          "selectionCriteria": "Multi-Factor",
          "weightingScheme": "Multi-Factor",
          "activePerSec": "false",
          "underlyingIndex": "Russell 1000 Value Index",
          "indexProvider": "Russell",
          "brand": "iShares"
        },
        "tax": {
          "legalStructure": "Open-Ended Fund",
          "maxLtCapitalGainsRate": 20,
          "maxStCapitalGainsRate": 39.6,
          "taxReporting": "1099"
        }
      }
    ]
    
    1. The property "totalRows": 803 specifies the total rows count. So to make data retrieving as fast as it possible, better to make the request to get the first row. As you can see from the URL, there is ../-aum/50/50/.. tail, which points sorting order, item to start from, and total items to return. Thus to get the only row it should be http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1

    2. Parse retrieved JSON, get the total number of rows from totalRows property.

    3. Make another one request to get the entire table.

    4. Parse the entire table JSON, convert it to 2d array and output. You can perform further processing with direct access to the array.

    For the table shown below:

    table

    The resulting table contains 803 rows and header with columns as follows:

    productId
    fund
    ticker
    inceptionDate
    launchDate
    hasSegmentReport
    genericReport
    hasReport
    fundsInSegment
    economicDevelopment
    totalRows
    fundBasics_issuer
    fundBasics_expenseRatio_value
    fundBasics_aum_value
    fundBasics_spreadPct_value
    fundBasics_segment
    performance_priceTrAsOf
    performance_priceTr1Mo_value
    performance_priceTr3Mo_value
    performance_priceTr1Yr_value
    performance_priceTr3YrAnnualized_value
    performance_priceTr5YrAnnualized_value
    performance_priceTr10YrAnnualized_value
    analysis_analystPick
    analysis_opportunitiesList
    analysis_letterGrade
    analysis_efficiencyScore
    analysis_tradabilityScore
    analysis_fitScore
    analysis_leveragedFactor
    analysis_exposureReset
    analysis_avgDailyDollarVolume
    analysis_avgDailyShareVolume
    analysis_spread_value
    analysis_fundClosureRisk
    fundamentals_dividendYield_value
    fundamentals_equity_pe
    fundamentals_equity_pb
    fundamentals_fixedIncome_duration
    fundamentals_fixedIncome_creditQuality
    fundamentals_fixedIncome_ytm_value
    classification_assetClass
    classification_strategy
    classification_region
    classification_geography
    classification_category
    classification_focus
    classification_niche
    classification_inverse
    classification_leveraged
    classification_etn
    classification_selectionCriteria
    classification_weightingScheme
    classification_activePerSec
    classification_underlyingIndex
    classification_indexProvider
    classification_brand
    tax_legalStructure
    tax_maxLtCapitalGainsRate
    tax_maxStCapitalGainsRate
    tax_taxReporting
    

    Put the below code into VBA Project standard module:

    Option Explicit
    
    Sub GetData()
    
        Dim sJSONString As String
        Dim vJSON As Variant
        Dim sState As String
        Dim lRowsQty As Long
        Dim aData()
        Dim aHeader()
    
        ' Download and parse the only first row to get total rows qty
        sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1")
        JSON.Parse sJSONString, vJSON, sState
        lRowsQty = vJSON(0)("totalRows")
        ' Download and parse the entire data
        sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/" & lRowsQty & "/1")
        JSON.Parse sJSONString, vJSON, sState
        ' Convert JSON to 2d array
        JSON.ToArray vJSON, aData, aHeader
        ' Output
        With Sheets(1)
            .Cells.Delete
            OutputArray .Cells(1, 1), aHeader
            Output2DArray .Cells(2, 1), aData
            .Cells.Columns.AutoFit
        End With
    
    End Sub
    
    Function GetXHR(sURL As String) As String
    
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", sURL, False
            .Send
            GetXHR = .responseText
        End With
    
    End Function
    
    Sub OutputArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    1, _
                    UBound(aCells) - LBound(aCells) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    

    Create one more standard module, name it JSON and put the below code into it, this code provides JSON processing functionality:

    Option Explicit
    
    Private sBuffer As String
    Private oTokens As Object
    Private oRegEx As Object
    Private bMatch As Boolean
    Private oChunks As Object
    Private oHeader As Object
    Private aData() As Variant
    Private i As Long
    
    Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
    
        ' Backus–Naur form JSON parser implementation based on RegEx
        ' Input:
        ' sSample - source JSON string
        ' Output:
        ' vJson - created object or array to be returned as result
        ' sState - string Object|Array|Error depending on processing
    
        sBuffer = sSample
        Set oTokens = CreateObject("Scripting.Dictionary")
        Set oRegEx = CreateObject("VBScript.RegExp")
        With oRegEx ' Patterns based on specification http://www.json.org/
            .Global = True
            .MultiLine = True
            .IgnoreCase = True ' Unspecified True, False, Null accepted
            .Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
            Tokenize "s"
            .Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
            Tokenize "d"
            .Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
            Tokenize "c"
            .Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
            Tokenize "n"
            .Pattern = "\s+"
            sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
            .MultiLine = False
            Do
                bMatch = False
                .Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
                Tokenize "p"
                .Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
                Tokenize "o"
                .Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
                Tokenize "a"
            Loop While bMatch
            .Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
            If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
                Retrieve sBuffer, vJSON
                sState = IIf(IsObject(vJSON), "Object", "Array")
            Else
                vJSON = Null
                sState = "Error"
            End If
        End With
        Set oTokens = Nothing
        Set oRegEx = Nothing
    
    End Sub
    
    Private Sub Tokenize(sType)
    
        Dim aContent() As String
        Dim lCopyIndex As Long
        Dim i As Long
        Dim sKey As String
    
        With oRegEx.Execute(sBuffer)
            If .Count = 0 Then Exit Sub
            ReDim aContent(0 To .Count - 1)
            lCopyIndex = 1
            For i = 0 To .Count - 1
                With .Item(i)
                    sKey = "<" & oTokens.Count & sType & ">"
                    oTokens(sKey) = .Value
                    aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
                    lCopyIndex = .FirstIndex + .Length + 1
                End With
            Next
        End With
        sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
        bMatch = True
    
    End Sub
    
    Private Sub Retrieve(sTokenKey, vTransfer)
    
        Dim sTokenValue As String
        Dim sName As String
        Dim vValue As Variant
        Dim aTokens() As String
        Dim i As Long
    
        sTokenValue = oTokens(sTokenKey)
        With oRegEx
            .Global = True
            Select Case Left(Right(sTokenKey, 2), 1)
                Case "o"
                    Set vTransfer = CreateObject("Scripting.Dictionary")
                    aTokens = Split(sTokenValue, "<")
                    For i = 1 To UBound(aTokens)
                        Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
                    Next
                Case "p"
                    aTokens = Split(sTokenValue, "<", 4)
                    Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
                    Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
                    If IsObject(vValue) Then
                        Set vTransfer(sName) = vValue
                    Else
                        vTransfer(sName) = vValue
                    End If
                Case "a"
                    aTokens = Split(sTokenValue, "<")
                    If UBound(aTokens) = 0 Then
                        vTransfer = Array()
                    Else
                        ReDim vTransfer(0 To UBound(aTokens) - 1)
                        For i = 1 To UBound(aTokens)
                            Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
                            If IsObject(vValue) Then
                                Set vTransfer(i - 1) = vValue
                            Else
                                vTransfer(i - 1) = vValue
                            End If
                        Next
                    End If
                Case "n"
                    vTransfer = sTokenValue
                Case "s"
                    vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
                        Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
                        "\""", """"), _
                        "\\", "\"), _
                        "\/", "/"), _
                        "\b", Chr(8)), _
                        "\f", Chr(12)), _
                        "\n", vbLf), _
                        "\r", vbCr), _
                        "\t", vbTab)
                    .Global = False
                    .Pattern = "\\u[0-9a-fA-F]{4}"
                    Do While .Test(vTransfer)
                        vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
                    Loop
                Case "d"
                    vTransfer = Evaluate(sTokenValue)
                Case "c"
                    Select Case LCase(sTokenValue)
                        Case "true"
                            vTransfer = True
                        Case "false"
                            vTransfer = False
                        Case "null"
                            vTransfer = Null
                    End Select
            End Select
        End With
    
    End Sub
    
    Function Serialize(vJSON As Variant) As String
    
        Set oChunks = CreateObject("Scripting.Dictionary")
        SerializeElement vJSON, ""
        Serialize = Join(oChunks.Items(), "")
        Set oChunks = Nothing
    
    End Function
    
    Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
    
        Dim aKeys() As Variant
        Dim i As Long
    
        With oChunks
            Select Case VarType(vElement)
                Case vbObject
                    If vElement.Count = 0 Then
                        .Item(.Count) = "{}"
                    Else
                        .Item(.Count) = "{" & vbCrLf
                        aKeys = vElement.Keys
                        For i = 0 To UBound(aKeys)
                            .Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
                            SerializeElement vElement(aKeys(i)), sIndent & vbTab
                            If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
                            .Item(.Count) = vbCrLf
                        Next
                        .Item(.Count) = sIndent & "}"
                    End If
                Case Is >= vbArray
                    If UBound(vElement) = -1 Then
                        .Item(.Count) = "[]"
                    Else
                        .Item(.Count) = "[" & vbCrLf
                        For i = 0 To UBound(vElement)
                            .Item(.Count) = sIndent & vbTab
                            SerializeElement vElement(i), sIndent & vbTab
                            If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
                            .Item(.Count) = vbCrLf
                        Next
                        .Item(.Count) = sIndent & "]"
                    End If
                Case vbInteger, vbLong
                    .Item(.Count) = vElement
                Case vbSingle, vbDouble
                    .Item(.Count) = Replace(vElement, ",", ".")
                Case vbNull
                    .Item(.Count) = "null"
                Case vbBoolean
                    .Item(.Count) = IIf(vElement, "true", "false")
                Case Else
                    .Item(.Count) = """" & _
                        Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
                            "\", "\\"), _
                            """", "\"""), _
                            "/", "\/"), _
                            Chr(8), "\b"), _
                            Chr(12), "\f"), _
                            vbLf, "\n"), _
                            vbCr, "\r"), _
                            vbTab, "\t") & _
                        """"
            End Select
        End With
    
    End Sub
    
    Function ToString(vJSON As Variant) As String
    
        Select Case VarType(vJSON)
            Case vbObject, Is >= vbArray
                Set oChunks = CreateObject("Scripting.Dictionary")
                ToStringElement vJSON, ""
                oChunks.Remove 0
                ToString = Join(oChunks.Items(), "")
                Set oChunks = Nothing
            Case vbNull
                ToString = "Null"
            Case vbBoolean
                ToString = IIf(vJSON, "True", "False")
            Case Else
                ToString = CStr(vJSON)
        End Select
    
    End Function
    
    Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
    
        Dim aKeys() As Variant
        Dim i As Long
    
        With oChunks
            Select Case VarType(vElement)
                Case vbObject
                    If vElement.Count = 0 Then
                        .Item(.Count) = "''"
                    Else
                        .Item(.Count) = vbCrLf
                        aKeys = vElement.Keys
                        For i = 0 To UBound(aKeys)
                            .Item(.Count) = sIndent & aKeys(i) & ": "
                            ToStringElement vElement(aKeys(i)), sIndent & vbTab
                            If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
                        Next
                    End If
                Case Is >= vbArray
                    If UBound(vElement) = -1 Then
                        .Item(.Count) = "''"
                    Else
                        .Item(.Count) = vbCrLf
                        For i = 0 To UBound(vElement)
                            .Item(.Count) = sIndent & i & ": "
                            ToStringElement vElement(i), sIndent & vbTab
                            If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
                        Next
                    End If
                Case vbNull
                    .Item(.Count) = "Null"
                Case vbBoolean
                    .Item(.Count) = IIf(vElement, "True", "False")
                Case Else
                    .Item(.Count) = CStr(vElement)
            End Select
        End With
    
    End Sub
    
    Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
    
        ' Input:
        ' vJSON - Array or Object which contains rows data
        ' Output:
        ' aData - 2d array representing JSON data
        ' aHeader - 1d array of property names
    
        Dim sName As Variant
    
        Set oHeader = CreateObject("Scripting.Dictionary")
        Select Case VarType(vJSON)
            Case vbObject
                If vJSON.Count > 0 Then
                    ReDim aData(0 To vJSON.Count - 1, 0 To 0)
                    oHeader("#") = 0
                    i = 0
                    For Each sName In vJSON
                        aData(i, 0) = "#" & sName
                        ToArrayElement vJSON(sName), ""
                        i = i + 1
                    Next
                Else
                    ReDim aData(0 To 0, 0 To 0)
                End If
            Case Is >= vbArray
                If UBound(vJSON) >= 0 Then
                    ReDim aData(0 To UBound(vJSON), 0 To 0)
                    For i = 0 To UBound(vJSON)
                        ToArrayElement vJSON(i), ""
                    Next
                Else
                    ReDim aData(0 To 0, 0 To 0)
                End If
            Case Else
                ReDim aData(0 To 0, 0 To 0)
                aData(0, 0) = ToString(vJSON)
        End Select
        aHeader = oHeader.Keys()
        Set oHeader = Nothing
        aRows = aData
        Erase aData
    
    End Sub
    
    Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
    
        Dim sName As Variant
        Dim j As Long
    
        Select Case VarType(vElement)
            Case vbObject ' collection of objects
                For Each sName In vElement
                    ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
                Next
            Case Is >= vbArray  ' collection of arrays
                For j = 0 To UBound(vElement)
                    ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
                Next
            Case Else
                If Not oHeader.Exists(sFieldName) Then
                    oHeader(sFieldName) = oHeader.Count
                    If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
                End If
                j = oHeader(sFieldName)
                aData(i, j) = ToString(vElement)
        End Select
    
    End Sub