This question is part of a small series I have posted to try and webscrape brief profiles
of https://echa.europa.eu/information-on-chemicals
The code uses the Public function GetUrl
() to retrieve the url of the desired brief profile. This is then used but the SubRoutine GetContents() to scrape the desired data for physical and chemical properties.
Puzzulingly I get a runtime error 91. This is strange because both GetContents() and GetUrl() Work when independent of one another.
Is someone wouldn't mind taking a look that would be great.
Sub GetContents()
Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
Url = GetUrl()
xmlReq.Open "Get", Url, False
xmlReq.send
If xmlReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = xmlReq.responseText
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
Set SubSects = SubSectList.getElementsByTagName("dt")
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect
End Sub
Public Function GetUrl() As String
Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
Set oHtml = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
Set Ws = ThisWorkbook.Worksheets("Sheet1")
'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
searchKeyword = "Acetone" '
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
Next DictKey
With oHttp
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send (payload)
oHtml.body.innerHTML = .responseText
End With
Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
GetUrl = oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Function
References:
Update: What's particularly strange is that the following code works when given an exact url in the code:
Sub GetContents()
Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
If xmlReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = xmlReq.responseText
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
Set SubSects = SubSectList.getElementsByTagName("dt")
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect
End Sub
However Replacing
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
with
Url = GetUrl()
xmlReq.Open "Get", url, False
xmlReq.send
where GetUrl() refers to the working Public funtion as above
causes the code to break.. with Set SubSects = SubSectList.getElementsByTagName("dt")
as the problematic line when debugging.
Update Screenshot of result when code provided in answer is run:
You are extracting the wrong url, and there are no dt elements in the html of that URI. Change the css selector and simplify as follows:
Option Explicit
Public Sub GetContents()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
XMLReq.Open "Get", GetUrl, False
XMLReq.send
HTMLDoc.body.innerHTML = XMLReq.responseText
Dim i As Long
With HTMLDoc.querySelectorAll(".EndpointContent dt")
For i = 0 To .Length - 1
Debug.Print .Item(i).innerText & " : " & .Item(i).NextSibling.NextSibling.innerText
Next
End With
End Sub
Public Function GetUrl() As String
Const url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, i&, R&
Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
Set oHtml = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
Set Ws = ThisWorkbook.Worksheets("Sheet1")
'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
searchKeyword = "Acetone" '
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
Next DictKey
With oHttp
.Open "POST", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send (payload)
oHtml.body.innerHTML = .responseText
End With
Debug.Print oHtml.querySelector(".briefProfileLink").getAttribute("href")
GetUrl = oHtml.querySelector(".briefProfileLink").getAttribute("href")
End Function