Search code examples
excelvbaweb-scrapingxmlhttprequest

Sub and Function Work Independently but not together


This question is part of a small series I have posted to try and webscrape brief profiles of https://echa.europa.eu/information-on-chemicals

The code uses the Public function GetUrl() to retrieve the url of the desired brief profile. This is then used but the SubRoutine GetContents() to scrape the desired data for physical and chemical properties.

Puzzulingly I get a runtime error 91. This is strange because both GetContents() and GetUrl() Work when independent of one another.

Is someone wouldn't mind taking a look that would be great.

        Sub GetContents()

Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
            
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
                    
Url = GetUrl()
                    
xmlReq.Open "Get", Url, False
xmlReq.send
            
           
If xmlReq.Status <> 200 Then
            
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub

End If
            
HTMLDoc.body.innerHTML = xmlReq.responseText
            
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
            
Set SubSects = SubSectList.getElementsByTagName("dt")
              
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect

End Sub


Public Function GetUrl() As String
        Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
        Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
        Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
        
        Set oHtml = New HTMLDocument
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
        Set MyDict = CreateObject("Scripting.Dictionary")
        Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
        searchKeyword = "Acetone" '
        
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
        MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
        MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
        payload = ""
        For Each DictKey In MyDict
            payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
            payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
        Next DictKey
        
        With oHttp
            .Open "POST", Url, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
            .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
            .send (payload)
            oHtml.body.innerHTML = .responseText
        End With
        
        Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
        GetUrl = oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
    
    End Function

References:

enter image description here

Update: What's particularly strange is that the following code works when given an exact url in the code:

Sub GetContents()

Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
            
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
                    
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
            
            
If xmlReq.Status <> 200 Then
            
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub

End If
            
HTMLDoc.body.innerHTML = xmlReq.responseText
            
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
            
Set SubSects = SubSectList.getElementsByTagName("dt")

            
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect

End Sub

However Replacing

xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send

with

Url = GetUrl()
xmlReq.Open "Get", url, False
    xmlReq.send

where GetUrl() refers to the working Public funtion as above

causes the code to break.. with Set SubSects = SubSectList.getElementsByTagName("dt") as the problematic line when debugging.

Update Screenshot of result when code provided in answer is run: enter image description here


Solution

  • You are extracting the wrong url, and there are no dt elements in the html of that URI. Change the css selector and simplify as follows:

    Option Explicit
    
    Public Sub GetContents()
        Dim XMLReq As New MSXML2.XMLHTTP60
        Dim HTMLDoc As New MSHTML.HTMLDocument
            
        XMLReq.Open "Get", GetUrl, False
        XMLReq.send
                      
        HTMLDoc.body.innerHTML = XMLReq.responseText
        
        Dim i As Long
        
        With HTMLDoc.querySelectorAll(".EndpointContent dt")
            For i = 0 To .Length - 1
                Debug.Print .Item(i).innerText & " : " & .Item(i).NextSibling.NextSibling.innerText
            Next
        End With
    End Sub
    
    Public Function GetUrl() As String
        Const url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
        Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, i&, R&
        Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
            
        Set oHtml = New HTMLDocument
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
        Set MyDict = CreateObject("Scripting.Dictionary")
        Set Ws = ThisWorkbook.Worksheets("Sheet1")
        
        'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
        searchKeyword = "Acetone"                    '
            
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
        MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
        MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
        
        payload = ""
            
        For Each DictKey In MyDict
            payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
                          payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
        Next DictKey
            
        With oHttp
            .Open "POST", url, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
            .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
            .send (payload)
            oHtml.body.innerHTML = .responseText
        End With
            
        Debug.Print oHtml.querySelector(".briefProfileLink").getAttribute("href")
        GetUrl = oHtml.querySelector(".briefProfileLink").getAttribute("href")
        
    End Function