I'm trying to scrape the href of each model from this webpage: https://www.aprilia.com/en_EN/index.
The html showing the href data can be seen only after clicking on two buttons (the one on the top right corner and the one on the left called "Models"), one after the other.
Sub get_info()
Dim ie As Object
Dim address, str_chk As String
Dim my_data As Object
Dim oHTML_Element As IHTMLElement
Dim i As Long
address = "https://www.aprilia.com/en_EN/index"
Set ie = CreateObject("InternetExplorer.Application")
ie.navigate address 'the one mentioned above
ie.Visible = False
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
For Each oHTML_Element In ie.document.getElementsByName("button")
If oHTML_Element.className = "header__menu-services__nav button button--icon" Then
oHTML_Element.Click
End If
Next
Application.Wait Now + #12:00:05 AM#
For Each oHTML_Element In ie.document.getElementsByName("Models")
oHTML_Element.Click
Next
Application.Wait Now + #12:00:05 AM#
'==>
Set my_data = html.getElementsByClassName("card-product card-product--family")
For Each elem In my_data
For i = 0 To elem.getElementsByTagName("a").Length - 1
str_chk = elem.getElementsByTagName("a")(i).href
ws.Cells(9 + j, 7).Value = str_chk
j = j + 1
Next i
Next elem
ie.Quit
Set ie = Nothing
End Sub
I got
Error '424' - Object Required
where I set my_data
.
I guess that means that I'm not able to click on the two buttons and, as a consequence, html code is not available.
***************** Revised code:
Sub get_info22()
Dim address As String
Dim ie, ELE, nodes As Object
Dim i As Long
Dim t As Date
Const MAX_WAIT_SEC As Long = 10
address = "https://www.aprilia.com/en_EN/index"
Set ie = CreateObject("InternetExplorer.Application")
ie.navigate address 'the one mentioned above
ie.Visible = False
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
'************** click on first button
t = Timer
Do
On Error Resume Next
Set ELE = ie.document.querySelector(".header__menu-services__nav")
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While ELE Is Nothing
If Not ELE Is Nothing Then
ELE.Click
End If
While ie.Busy Or ie.readyState <> 4: DoEvents: Wend
'************** click on second button
Do
On Error Resume Next
Set ELE = ie.document.querySelector("li > button")
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While ELE Is Nothing
If Not ELE Is Nothing Then
ELE.Click
End If
While ie.Busy Or ie.readyState <> 4: DoEvents: Wend
'************** get href for each model
Set nodes = ie.document.querySelectorAll(".card-product--family")
ReDim hrefs(nodes.Length - 1)
For i = 0 To nodes.Length - 1
hrefs(i) = nodes.Item(i).href
ActiveSheet.Cells(9 + i, 7).Value = hrefs(i)
Next
Try first to use more precise selectors. For the first button use:
ie.document.querySelector(".header__menu-services__nav").click
That targets the element by one of its classes. Then have a pause e.g.
While ie.Busy Or ie.ReadyState<>4:DoEvents:Wend
Or, use an explicit wait time or loop until next desired element is present.
Then target the next element with type selectors and child combinator as you want the first child button
within a li
element:
ie.document.querySelector("li > button").click
Then you need another wait.
Finally, you can use a single class from the target elements, with the links, and extract the href
attributes and store in an array (for example)
Dim nodes As Object, hrefs(), i As Long
Set nodes = ie.Document.querySelectorAll(".card-product")
ReDim hrefs(nodes.Length - 1)
For i = 0 To nodes.Length - 1
hrefs(i) = nodes.Item(i).href
Next
EDIT:
Seems page uses ajax to retrieve the listings which makes this easier. I show to versions. The first where I grab just those links you describe after two button clicks; the second, where I grab model subtype links as well.
In both I mimic the request the page makes to get that info. In the first I then parse the returned json with a json parser and pull out the model links. With the second, I regex out all href info ie. all submodels.
Json library:
I use jsonconverter.bas. Download raw code from here and add to standard module called JsonConverter . You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
1)
Option Explicit
Public Sub ScrapeModelLinks1()
Dim data As Object, links() As Variant, s As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.aprilia.com/en_EN/aprilia/en/index?ajax=true", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
End With
Set data = JsonConverter.ParseJson(s)("pageData")("main")("component-06")("items")
ReDim links(data.Count)
Dim item As Long, base As String
base = "https://www.aprilia.com"
For item = 1 To data.Count
links(item) = base & data(item)("href")
Next
Stop
End Sub
Public Sub ScrapeModelLinks2()
'grab all href which will include model subtypes
Dim s As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.aprilia.com/en_EN/aprilia/en/index?ajax=true", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
End With
Dim re As Object, matches As Object, links() As Variant
Set re = CreateObject("VBScript.RegExp")
re.Pattern = """href"":""(.*?)"""
re.Global = True
Set matches = re.Execute(s)
ReDim links(matches.Count - 1)
Dim item As Long, base As String
base = "https://www.aprilia.com"
For item = 0 To matches.Count - 1
links(item) = base & matches(item).submatches(0)
Next
Stop
End Sub
Regex explanation: