Search code examples
web-scrapingweb-crawlerscrapegoogle-crawlers

How to scrape all possible results from a search bar of a website


This is my first web scraping task. I have been tasked with scraping a website

It is a site that contains the names of lawyers in Denmark. My difficulty is that I can only retrieve names based on the particular name query i put in the search bar. Is there an online web tool I can use to scrape all the names that the website contains? I have used tools like Import.io with no success so far. I am super confused on how all of this works.


Solution

  • Please scroll down to UPDATE 2

    The website enforces you to enter at least one search parameter, so you may loop through all items for Arbejdsområde list, making request for each of them. Here is the example, showing how that could be done in Excel VBA (open VBE, create standard module, paste the code and run Test()):

    Option Explicit
    
    Sub Test()
    
        Dim sResponse As String
        Dim oItems As Object
        Dim vItem
        Dim aData
        Dim sContent As String
        Dim lPage As Long
        Dim i As Long
        Dim j As Long
    
        ' Retrieve search page HTML content
        XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
        ' Extract work areas items
        ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$AreaSelect", oItems
        oItems.Remove oItems.Keys()(0)
        sContent = ""
        ' Process each work area item
        For Each vItem In oItems.Items()
            Debug.Print "Item [" & vItem & "]"
            lPage = 0
            ' Process each results page
            Do
                Debug.Print vbTab & "Page [" & lPage & "]"
                ' Retrieve result page HTML content
                XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&a=" & vItem & "&p=" & lPage, "", "", "", sResponse
                ' Extract result table
                ParseResponse _
                    "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                    sResponse, _
                    aData, _
                    False
                ' Store parsed table
                sContent = sContent & aData(0)
                Debug.Print vbTab & "Parsed " & Len(sContent)
                lPage = lPage + 1
                DoEvents
            Loop Until InStr(sResponse, "<a class=""next""") = 0
        Next
        ' Extract data from the whole content
        ParseResponse _
            "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "</tr>", _
            sContent, _
            aData, _
            False
        ' Rebuild nested arrays to 2d array for output
        aData = Denestify(aData)
        ' Decode HTML
        For i = 1 To UBound(aData, 1)
            For j = 2 To 4
                aData(i, j) = GetInnerText((aData(i, j)))
            Next
        Next
        ' Output
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            Output2DArray .Cells(1, 1), aData
            .Columns.AutoFit
            .Rows.AutoFit
        End With
        MsgBox "Completed"
    
    End Sub
    
    Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
    
        Dim aHeader
    
        'With CreateObject("MSXML2.ServerXMLHTTP")
            '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        With CreateObject("MSXML2.XMLHTTP")
            .Open sMethod, sUrl, False ' , "u051772", "fy17janr"
            If IsArray(aSetHeaders) Then
                For Each aHeader In aSetHeaders
                    .SetRequestHeader aHeader(0), aHeader(1)
                Next
            End If
            .Send (sFormData)
            sRespHeaders = .GetAllResponseHeaders
            sRespText = .ResponseText
        End With
    
    End Sub
    
    Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
    
        Dim aTmp0
        Dim vItem
    
        ' Escape RegEx special characters
        For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
            sName = Replace(sName, vItem, "\" & vItem)
        Next
        ' Extract the whole <select> for parameter
        ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
        ' Extract each parameter <option>
        ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
        ' Put each parameter and value into dictionary
        Set oOptions = CreateObject("Scripting.Dictionary")
        For Each vItem In aTmp0
            oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
        Next
    
    End Sub
    
    Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
    
        Dim oMatch
        Dim aTmp0()
        Dim sSubMatch
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        With CreateObject("VBScript.RegExp")
            .Global = bGlobal
            .MultiLine = bMultiLine
            .IgnoreCase = bIgnoreCase
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                If oMatch.SubMatches.Count = 1 Then
                    PushItem aData, oMatch.SubMatches(0)
                Else
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                End If
            Next
        End With
    
    End Sub
    
    Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        ReDim Preserve aData(UBound(aData) + 1)
        aData(UBound(aData)) = vItem
    
    End Sub
    
    Function GetInnerText(sText As String) As String
    
        Static oHtmlfile As Object
        Static oDiv As Object
    
        If oHtmlfile Is Nothing Then
            Set oHtmlfile = CreateObject("htmlfile")
            oHtmlfile.Open
            Set oDiv = oHtmlfile.createElement("div")
        End If
        oDiv.innerHTML = sText
        GetInnerText = oDiv.innerText
    
    End Function
    
    Function Denestify(aRows)
    
        Dim aData()
        Dim aItems()
        Dim i As Long
        Dim j As Long
    
        If UBound(aRows) = -1 Then Exit Function
        ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
        For j = 0 To UBound(aRows)
            If IsArray(aRows(j)) Then
                aItems = aRows(j)
                For i = 0 To UBound(aItems)
                    If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                    aData(j + 1, i + 1) = aItems(i)
                Next
            Else
                aData(j + 1, 1) = aRows(j)
            End If
        Next
        Denestify = aData
    
    End Function
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    

    It takes few minutes to retrieve all data for the first time (after that when launched again all requests are loaded from the cache that makes process significantly faster, to get a latest data from the server you need to clean up the cache in IE settings). The output for me is as follows:

    output

    Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor.

    BTW there are another answers using the similar approach: 1, 2, 3 and 4.

    UPDATE

    The above suggested scraping is based on parsing search results filtered by Arbejdsområde parameter, and as it turned out, actually returned results are inaccurate. Those lawyers, which have multiply Arbejdsområder are present multiply times in results, and which have empty Arbejdsområder are not in results at all.

    Another parameter instead of Arbejdsområde, that can be used for such scraping is Retskreds. All lawyers records contain address, and only single address, so results are full and don't contain duplicates. Note, one lawyer can relate to several offices, so that will be several records in results.

    There is the code that allows to scrape detailed info for each entry within loop:

    Option Explicit
    
    Sub Test()
    
        Dim sResponse As String
        Dim oItems As Object
        Dim vKey
        Dim sItem As String
        Dim aTmp
        Dim aData
        Dim lPage As Long
        Dim i As Long
        Dim j As Long
    
        ' Retrieve search page HTML content
        XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
        ' Extract Retskreds items
        ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$CourtSelect", oItems
        oItems.Remove oItems.Keys()(0)
        i = 0
        ' Process each Retskreds item
        For Each vKey In oItems
            sItem = oItems(vKey)
            Debug.Print "Area " & sItem & " " & vKey
            lPage = 0
            ' Process each results page
            Do
                Debug.Print vbTab & "Page " & lPage
                ' Retrieve results page
                XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&c=" & sItem & "&p=" & lPage, "", "", "", sResponse
                ' Extract table
                ParseResponse _
                    "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                    sResponse, _
                    aTmp, _
                    False
                ' Extract data from the table
                ParseResponse _
                    "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
                    "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                    "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                    "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                    "</tr>", _
                    aTmp(0), _
                    aData, _
                    True
                ' Add Retskreds name
                For i = i To UBound(aData)
                    aTmp = aData(i)
                    PushItem aTmp, vKey
                    aData(i) = aTmp
                Next
                Debug.Print vbTab & "Parsed " & UBound(aData)
                lPage = lPage + 1
                DoEvents
            Loop Until InStr(sResponse, "<a class=""next""") = 0
        Next
        ' Retrieve detailed info for each entry
        For i = 0 To UBound(aData)
            aTmp = aData(i)
            ' Retrieve details page
            aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
            ' Extract details
            XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
            ParseResponse _
                DecodeUriComponent( _
                    "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                    "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                    "F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?" & _
                    "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                    "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                    "E-mail\: [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?" & _
                    "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
                sResponse, _
                aTmp, _
                True, _
                False
            aTmp(9) = StrReverse(aTmp(9))
            aData(i) = aTmp
            Debug.Print vbTab & "Details " & i
            DoEvents
        Next
        ' Rebuild nested arrays to 2d array for output
        aData = Denestify(aData)
        ' Decode HTML
        For i = 1 To UBound(aData, 1)
            For j = 2 To 4
                aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
            Next
        Next
        ' Output
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            OutputArray .Cells(1, 1), _
                Array("URL", _
                    "Navn", _
                    "Firma", _
                    DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                    DecodeUriComponent("Retskreds"), _
                    DecodeUriComponent("Beskikkelses%C3%A5r"), _
                    DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                    DecodeUriComponent("M%C3%B8deret for landsret"), _
                    DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                    "E-mail", _
                    "Mobiltlf." _
                )
            Output2DArray .Cells(2, 1), aData
            .Columns.AutoFit
            .Rows.AutoFit
        End With
        MsgBox "Completed"
    
    End Sub
    
    Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
    
        Dim aHeader
    
        'With CreateObject("MSXML2.ServerXMLHTTP")
            '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        With CreateObject("MSXML2.XMLHTTP")
            .Open sMethod, sUrl, False
            If IsArray(aSetHeaders) Then
                For Each aHeader In aSetHeaders
                    .SetRequestHeader aHeader(0), aHeader(1)
                Next
            End If
            .Send (sFormData)
            sRespHeaders = .GetAllResponseHeaders
            sRespText = .ResponseText
        End With
    
    End Sub
    
    Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
    
        Dim aTmp0
        Dim vItem
    
        ' Escape RegEx special characters
        For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
            sName = Replace(sName, vItem, "\" & vItem)
        Next
        ' Extract the whole <select> for parameter
        ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
        ' Extract each parameter <option>
        ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
        ' Put each parameter and value into dictionary
        Set oOptions = CreateObject("Scripting.Dictionary")
        For Each vItem In aTmp0
            oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
        Next
    
    End Sub
    
    Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
    
        Dim oMatch
        Dim aTmp0()
        Dim sSubMatch
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        With CreateObject("VBScript.RegExp")
            .Global = bGlobal
            .MultiLine = bMultiLine
            .IgnoreCase = bIgnoreCase
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                If oMatch.SubMatches.Count = 1 Then
                    PushItem aData, oMatch.SubMatches(0)
                Else
                    If bNestSubMatches Then
                        aTmp0 = Array()
                        For Each sSubMatch In oMatch.SubMatches
                            PushItem aTmp0, sSubMatch
                        Next
                        PushItem aData, aTmp0
                    Else
                        For Each sSubMatch In oMatch.SubMatches
                            PushItem aData, sSubMatch
                        Next
                    End If
                End If
            Next
        End With
    
    End Sub
    
    Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        ReDim Preserve aData(UBound(aData) + 1)
        aData(UBound(aData)) = vItem
    
    End Sub
    
    Function DecodeUriComponent(sEncoded As String) As String
    
        Static objHtmlfile As Object
    
        If objHtmlfile Is Nothing Then
            Set objHtmlfile = CreateObject("htmlfile")
            objHtmlfile.parentWindow.execScript "function decode(s) {return decodeURIComponent(s)}", "jscript"
        End If
        DecodeUriComponent = objHtmlfile.parentWindow.decode(sEncoded)
    
    End Function
    
    Function GetInnerText(sText As String) As String
    
        Static oHtmlfile As Object
        Static oDiv As Object
    
        If oHtmlfile Is Nothing Then
            Set oHtmlfile = CreateObject("htmlfile")
            oHtmlfile.Open
            Set oDiv = oHtmlfile.createElement("div")
        End If
        oDiv.innerHTML = sText
        GetInnerText = oDiv.innerText
    
    End Function
    
    Function Denestify(aRows)
    
        Dim aData()
        Dim aItems()
        Dim i As Long
        Dim j As Long
    
        If UBound(aRows) = -1 Then Exit Function
        ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
        For j = 0 To UBound(aRows)
            If IsArray(aRows(j)) Then
                aItems = aRows(j)
                For i = 0 To UBound(aItems)
                    If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                    aData(j + 1, i + 1) = aItems(i)
                Next
            Else
                aData(j + 1, 1) = aRows(j)
            End If
        Next
        Denestify = aData
    
    End Function
    
    Sub OutputArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
    
        With oDstRng
            .Parent.Select
            With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
                .NumberFormat = sFormat
                .Value = aCells
            End With
        End With
    
    End Sub
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = sFormat
                .Value = aCells
            End With
        End With
    
    End Sub
    

    There are 4896 entries total for 4689 lawyers:

    output

    UPDATE 2

    Seems to get complete list you may just make search with set (space) as Firma parameter: http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20, there are 6511 entries at the moment. The Sub Test() code for parse that results should be changed then as shown below:

    Option Explicit
    
    Sub Test()
    
        Dim sResponse As String
        Dim aTmp
        Dim aData
        Dim lPage As Long
        Dim i As Long
        Dim j As Long
    
        lPage = 0
        ' Process each results page
        Do
            Debug.Print vbTab & "Page " & lPage
            ' Retrieve results page
            XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20&p=" & lPage, "", "", "", sResponse
            ' Extract table
            ParseResponse _
                "<table\b[^>]*?id=""ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                sResponse, _
                aTmp, _
                False
            ' Extract data from the table
            ParseResponse _
                "<tr.*?onclick=""location.href=&#39;(.*?)&#39;"">\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "</tr>", _
                aTmp(0), _
                aData, _
                True
            Debug.Print vbTab & "Parsed " & (UBound(aData) + 1)
            lPage = lPage + 1
            DoEvents
        Loop Until InStr(sResponse, "<a class=""next""") = 0
        ' Retrieve detailed info for each entry
        For i = 0 To UBound(aData)
            aTmp = aData(i)
            ' Retrieve details page
            aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
            ' Extract details
            Do
                XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
                If InStr(sResponse, "<title>Runtime Error</title>") = 0 Then Exit Do
                DoEvents
            Loop
            ParseResponse _
                DecodeUriComponent( _
                    "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                    "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                    "(:?F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?)?" & _
                    "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                    "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                    "(:?E-mail [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?)?" & _
                    "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
                sResponse, _
                aTmp, _
                True, _
                False
            aTmp(8) = StrReverse(aTmp(8))
            aData(i) = aTmp
            Debug.Print vbTab & "Details " & i
            DoEvents
        Next
        ' Rebuild nested arrays to 2d array for output
        aData = Denestify(aData)
        ' Decode HTML
        For i = 1 To UBound(aData, 1)
            For j = 2 To 4
                aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
            Next
        Next
        ' Output
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            OutputArray .Cells(1, 1), _
                Array("URL", _
                    "Navn", _
                    "Firma", _
                    DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                    DecodeUriComponent("Beskikkelses%C3%A5r"), _
                    DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                    DecodeUriComponent("M%C3%B8deret for landsret"), _
                    DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                    "E-mail", _
                    "Mobiltlf." _
                )
            Output2DArray .Cells(2, 1), aData
            .Columns.AutoFit
            .Rows.AutoFit
        End With
        MsgBox "Completed"
    
    End Sub