Search code examples
excelvbaweb-scrapingautomationxmlhttprequest

Converting Web Browser Automation to XmlHTTP Request


I have created a macro which scrapes relevant information from Brief profiles (BP) that can be searched for at: https://echa.europa.eu/information-on-chemicals

This works using an XMLHTTP request to the URL of the Brief Profile and works fine.

I now wish to create a macro which searches the same website to find the URL(href) of the brief profile.

As a beginner to VBA I have successfully achieved this using a browser but I wish to convert this to XML HTTP request to improve efficiency.

Using IE Browser Automation:

Sub Gethref()

Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
IE.Visible = True
IE.navigate "https://echa.europa.eu/information-on-chemicals"

'Check Website is ready for search and set HTMLDoc to IE.Document for elements
Do While IE.readyState <> READYSTATE_COMPLETE
Loop

Set HTMLDoc = IE.document

'Set value of Searchbox to keyword
Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
HTMLInput.Value = "Potassium mercaptoacetate"

'Search for Result
Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
HTMLButton.Click

'Check page has loaded
Do While IE.readyState = READYSTATE_COMPLETE or IE.Busy
Loop
Set HTMLDoc = IE.document

'Find Desired href
Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
Debug.Print HTMLhref.getAttribute("href")

End Sub

This should print the href for Potassium mercaptoacetate as https://echa.europa.eu/brief-profile/-/briefprofile/100.000.602

I have started attempted to convert as much as I can using XML HTTP but Im running into issues which I dont quite understand

Using XML HTTP Request (Not working)

Sub Gethref()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLInput As MSHTML.IHTMLElement
    Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
    XMLPage.Open "GET", "https://echa.europa.eu/information-on-chemicals", False
    XMLPage.send
    
'Set value of Searchbox to keyword
    Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
    HTMLInput.Value

'Search for Result
    Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
    HTMLButton.Click

'Check page has loaded
HTMLDoc.body.innerHTML = IE.document.responseText

'Find Desired href
    Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
    Debug.Print HTMLhref.getAttribute("href")

End Sub

I will update as I make progress with this but if anyone can offer help it will be great.


Solution

  • Okay, this should do it. Turn out that you need to issue post http requests with appropriate parameters to get required response containing desired links.

    Public Sub GetContent()
        Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
        Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
        Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
        
        Set oHtml = New HTMLDocument
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
        Set MyDict = CreateObject("Scripting.Dictionary")
        Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
        searchKeyword = "Acetone"
        
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
        MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
        MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
        payload = ""
        For Each DictKey In MyDict
            payload = IIf(Len(DictKey) = 0, WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)), _
            payload & "&" & WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)))
        Next DictKey
        
        With oHttp
            .Open "POST", Url, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
            .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
            .send (payload)
            oHtml.body.innerHTML = .responseText
        End With
        
        With oHtml.querySelectorAll("table.table > tbody > tr > td > a.substanceNameLink")
            For I = 0 To .Length - 1
                R = R + 1: Ws.Cells(R, 1) = .item(I).getAttribute("href")
            Next I
        End With
    End Sub
    

    If you are interested in the first link only, try the following instead of the last with block:

    MsgBox oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
    

    Or you can directly copy those parameters from dev tool and use them:

    Public Sub GetContent()
        Const Url = "https://echa.europa.eu/search-for-chemicals?"
        Dim oHttp As Object, oHtml As HTMLDocument
        Dim payload$, Ws As Worksheet, urlSuffix$
        
        Set oHtml = New HTMLDocument
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
        Set Ws = ThisWorkbook.Worksheets("Sheet1")
        
        urlSuffix = "p_auth=69hDou3E&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=" & _
                    "_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=" & _
                    "doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals" & _
                    "%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview" & _
                    "%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
    
        payload = "_disssimplesearchhomepage_WAR_disssearchportlet_formDate=1621042609544&_disssimplesearch_WAR_disssearchportlet_searchOccurred=" & _
                  "true&_disssimplesearch_WAR_disssearchportlet_sskeywordKey=Acetone&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer" & _
                  "=true&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox=on"
        
        With oHttp
            .Open "POST", Url & urlSuffix, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
            .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
            .send (payload)
            oHtml.body.innerHTML = .responseText
        End With
        
        Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
    End Sub