Search code examples
htmlexcelvbaweb-scrapinghref

Scraping the data from list of href link?


I am trying to scrap a list of href link from a webpage, and then trying to scrap the value out of it. I am now facing the problem which the code only can handle up to 5 links. If the links more than 5, it will show runtime error on random line.

I am extracting the href link from these webpage:http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=SH&sub_category=all&alphabetical=All&date_from=28/09/2018

Option Explicit
Sub ScrapLink()
    Dim IE As New InternetExplorer, html As HTMLDocument

    Application.ScreenUpdating = False

    With IE

        IE.Visible = False
        IE.navigate Cells(1, 1).Value

        While .Busy Or .readyState < 4: DoEvents: Wend
        Application.Wait Now + TimeSerial(0, 0, 3)
        Application.StatusBar = "Trying to go to website?"
        DoEvents

        Dim links As Object, i As Long
        Set links = .document.querySelectorAll("#bm_ajax_container [href^='/market/listed-companies/company-announcements/']")
        For i = 1 To links.Length
            With ThisWorkbook.Worksheets("Sheet1")
                .Cells(i + 1, 1) = links.item(i - 1)
            End With
        Next i
        .Quit
    End With
End Sub

Public Sub GetInfo()
    Dim IE As New InternetExplorer, headers(), u As Long, resultCollection As Collection
    headers = Array("URL", "Name", "No", "Date of change", "# Securities", "Type of Transaction", "Nature of Interest")
    Set resultCollection = New Collection
    Dim links()
    links = Application.Transpose(ThisWorkbook.Worksheets("Sheet1").Range("A2:A100"))

    With IE
        .Visible = True

        For u = LBound(links) To UBound(links)
            If InStr(links(u), "http") > 0 Then
                .navigate links(u)

                While .Busy Or .readyState < 4: DoEvents: Wend
                Application.Wait Now + TimeSerial(0, 0, 2)
                Dim data As Object, title As Object

                With .document.getElementById("bm_ann_detail_iframe").contentDocument
                    Set title = .querySelector(".formContentData")
                    Set data = .querySelectorAll(".ven_table tr")
                End With

                Dim results(), numberOfRows As Long, i As Long, currentRow As Object, td As Object, c As Long, r As Long

                numberOfRows = Round(data.Length / 4, 0)
                ReDim results(1 To numberOfRows, 1 To 7)

                For i = 0 To numberOfRows - 1
                    r = i + 1
                    results(r, 1) = links(u): results(r, 2) = title.innerText
                    Set currentRow = data.item(i * 4 + 1)
                    c = 3
                    For Each td In currentRow.getElementsByTagName("td")
                        results(r, c) = Replace$(td.innerText, "document.write(rownum++);", vbNullString)
                        c = c + 1
                    Next td
                Next i
                resultCollection.Add results
                Set data = Nothing: Set title = Nothing
            End If
        Next u
        .Quit
    End With
    Dim ws As Worksheet, item As Long
    If Not resultCollection.Count > 0 Then Exit Sub

    If Not Evaluate("ISREF('Results'!A1)") Then  '<==Credit to @Rory for this test
        Set ws = Worksheets.Add
        ws.NAME = "Results"
    Else
        Set ws = ThisWorkbook.Worksheets("Results")
        ws.Cells.Clear
    End If

    Dim outputRow As Long: outputRow = 2
    With ws
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        For item = 1 To resultCollection.Count
            Dim arr()
            arr = resultCollection(item)
            For i = LBound(arr, 1) To UBound(arr, 1)
                .Cells(outputRow, 1).Resize(1, 7) = Application.WorksheetFunction.Index(arr, i, 0)
                outputRow = outputRow + 1
            Next
        Next
    End With
End Sub

Solution

  • Discussion:

    The problem is likely, at least from my testing, due to one of the links not having the table Details of changes, so the numberOfRows variable is set to 0, and this line:

    ReDim results(1 To numberOfRows, 1 To 7)
    

    fails with an index error as you have (1 To 0, 1 To 7).

    Using this link in A1 there are 30 URLs retrieved. This retrieved link does not have that table whereas the others do.

    You have a choice of how to handle this scenario. Here are some example options:

    Option 1: Only process the page if the numberOfRows > 0. This is the example I give.

    Option 2: Have a Select Case with numberOfRows and if Case 0 then handle page in one way, Case Else handle as normal.


    Note:

    1) You also want to reset the status bar with:

    Application.StatusBar = False
    

    2) I temporarily fixed the links range for testing with:

    ThisWorkbook.Worksheets("Sheet1").Range("A2:A31")
    

    TODO:

    1. Refactor to be more modular and run the whole process with the same IE instance. Creating a class to hold the IE object would be a good idea. Provide it with methods for extracting your data, testing number of result rows etc.
    2. Add some basic error handling, for example, to handle failed website connection.

    Example handling using test of numberOfRows > 0:

    Option Explicit
    Sub ScrapeLink()
        Dim IE As New InternetExplorer
    
        Application.ScreenUpdating = False
    
        With IE
            IE.Visible = True
            IE.navigate Cells(1, 1).Value
    
            While .Busy Or .readyState < 4: DoEvents: Wend
           ' Application.Wait Now + TimeSerial(0, 0, 3)
            Application.StatusBar = "Trying to go to website?"
            DoEvents
    
            Dim links As Object, i As Long
            Set links = .document.querySelectorAll("#bm_ajax_container [href^='/market/listed-companies/company-announcements/']")
            For i = 1 To links.Length
                With ThisWorkbook.Worksheets("Sheet1")
                    .Cells(i + 1, 1) = links.item(i - 1)
                End With
            Next i
            .Quit
        End With
        Application.StatusBar = false
    End Sub
    
    Public Sub GetInfo()
        Dim IE As New InternetExplorer, headers(), u As Long, resultCollection As Collection
        headers = Array("URL", "Name", "No", "Date of change", "# Securities", "Type of Transaction", "Nature of Interest")
        Set resultCollection = New Collection
        Dim links()
        links = Application.Transpose(ThisWorkbook.Worksheets("Sheet1").Range("A2:A31")) '<== I have fixed the range here for testing 
    
        With IE
            .Visible = True
    
            For u = LBound(links) To UBound(links)
                If InStr(links(u), "http") > 0 Then
                    .navigate links(u)
    
                    While .Busy Or .readyState < 4: DoEvents: Wend
                    Application.Wait Now + TimeSerial(0, 0, 2)
                    Dim data As Object, title As Object
    
                    With .document.getElementById("bm_ann_detail_iframe").contentDocument
                        Set title = .querySelector(".formContentData")
                        Set data = .querySelectorAll(".ven_table tr")
                    End With
    
                    Dim results(), numberOfRows As Long, i As Long, currentRow As Object, td As Object, c As Long, r As Long
    
                    numberOfRows = Round(data.Length / 4, 0)
    
                    If numberOfRows > 0 Then
    
                        ReDim results(1 To numberOfRows, 1 To 7)
    
                        For i = 0 To numberOfRows - 1
                            r = i + 1
                            results(r, 1) = links(u): results(r, 2) = title.innerText
                            Set currentRow = data.item(i * 4 + 1)
                            c = 3
                            For Each td In currentRow.getElementsByTagName("td")
                                results(r, c) = Replace$(td.innerText, "document.write(rownum++);", vbNullString)
                                c = c + 1
                            Next td
                        Next i
                        resultCollection.Add results
                        Set data = Nothing: Set title = Nothing
                    End If
                End If
            Next u
            .Quit
        End With
        Dim ws As Worksheet, item As Long
        If Not resultCollection.Count > 0 Then Exit Sub
    
        If Not Evaluate("ISREF('Results'!A1)") Then  '<==Credit to @Rory for this test
            Set ws = Worksheets.Add
            ws.NAME = "Results"
        Else
            Set ws = ThisWorkbook.Worksheets("Results")
            ws.Cells.Clear
        End If
    
        Dim outputRow As Long: outputRow = 2
        With ws
            .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            For item = 1 To resultCollection.Count
                Dim arr()
                arr = resultCollection(item)
                For i = LBound(arr, 1) To UBound(arr, 1)
                    .Cells(outputRow, 1).Resize(1, 7) = Application.WorksheetFunction.Index(arr, i, 0)
                    outputRow = outputRow + 1
                Next
            Next
        End With
    End Sub
    

    Sample results:

    enter image description here