Search code examples
excelvbaweb-scrapingscreen-scraping

Ebay Scraper, missing date for first line and then evey loop


I am having issues with my eBAY Scraper and can not work out why. Although it is pulling the data off fine, it misses SOME of the data OFF for the first row and then for each first row of every Loop and therefore the data is not in the correct row.

Q) Why is it missing the data at the start and then for each loop?

I think It may have something to do with the title extracting slower that the rest of the items, however I can not work it out as I am very limited with vba. I have attached a demo, for your viewing.

I am not looking for a full rewite of the code, just pointing in the right direction or a SLIGHT change to MY code. As I stated I and very limited in vba, I can understand my code, anything more advanced will be out of my depth.

Demo Download - Download Excel File

WebSite - Ebay.co.uk

Ebay Product Page - Prodcts Shown may vary browser to browser

I have colour coded it so you can see better

This is what it is doing Image1 Error

When It Should be This Image 2 Correct

For some reason it misses out Price, Condition, Former Price & Discount for the first item on start and EVERY Loop. For every loop that it misses the items out the Price, Condition, Former Price & Discount become MORE out of line

1st Loop - Items are NOW 2 rows out of line Loop1

2nd Loop - Items are NOW 3 rows out of line Loop2

As I searched 3 pages (2 pages + 1 extra) and it looped 3 time it has missed the first row on each loop. I am 3 rows out. I think this may have too do with the Title of the item as it extracts a bit slower then the rest of the items

End Of Extraction End Results

This is my code

Dim HTML As HTMLDocument
Dim objIE As Object
Dim result As String
Dim pageNumber As Long ' page no.
Dim nextPageElement As Object 'page element
Dim HtmlText As Variant
Dim wsSheet As Worksheet
Dim wb As Workbook
Dim sht As Worksheet

    Set wb = ThisWorkbook
    Set wsSheet = wb.Sheets("Sheet1")
    Set sht = ThisWorkbook.Worksheets("Sheet1")

'+++++ Internet Explorer ++++++
    Set objIE = New InternetExplorer
        objIE.Visible = False
        objIE.navigate Sheets("Sheet1").Range("A2").Value & Replace(Worksheets("Sheet1").Range("B2") & Range("C2").Value, " ", "+") 'navigate IE to this web page

    Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop 'wait here a few seconds while the browser is busy
    Set ie = CreateObject("InternetExplorer.Application")

'######################################## RESTART CODE FROM HERE ####################################
StartForLoop_Restart: 'Restart the code HERE, this is the key part
'######################################## RESTART CODE FROM HERE ####################################

    '  Application.ScreenUpdating = False
        Set HTML = objIE.document
        Set elements = HTML.getElementsByClassName("s-item__wrapper clearfix") ' parent CLASS
        'FOR LOOP
        For Each element In elements
    
''' Element 1
        If element.getElementsByClassName("s-item__link")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "A").End(xlUp).Row + 1, "A").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("s-item__link")(0).href
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "A").End(xlUp).Row + 1, "A").Value = HtmlText
        End If
''' Element 2
        If element.getElementsByClassName("s-item__link")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("s-item__link")(0).innerText  'src
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = HtmlText
        End If
''' Element 3
        If element.getElementsByClassName("s-item__price")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("s-item__price")(0).innerText
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = HtmlText
        End If
''' Element 4
        If element.getElementsByClassName("SECONDARY_INFO")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "D").End(xlUp).Row + 1, "D").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("SECONDARY_INFO")(0).innerText
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "D").End(xlUp).Row + 1, "D").Value = HtmlText
        End If
''' Element 5
        If element.getElementsByClassName("STRIKETHROUGH")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "E").End(xlUp).Row + 1, "E").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("STRIKETHROUGH")(0).innerText
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "E").End(xlUp).Row + 1, "E").Value = HtmlText
        End If
''' Element 6
        If element.getElementsByClassName("s-item__discount s-item__discount")(0) Is Nothing Then
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "F").End(xlUp).Row + 1, "F").Value = "-"
        Else
            HtmlText = element.getElementsByClassName("s-item__discount s-item__discount")(0).innerText
            wsSheet.Cells(sht.Cells(sht.Rows.Count, "F").End(xlUp).Row + 1, "F").Value = HtmlText
        End If
   ' Application.ScreenUpdating = True
    Next element

    Do
    'Searches Number of Pages entered in
    If pageNumber >= Replace(Worksheets("Sheet1").Range("d2").Value, "", "+") Then Exit Do
        Set nextPageElement = HTML.getElementsByClassName("pagination__next")(0) ' CLICK TO NEXT PAGE
    If nextPageElement Is Nothing Then Exit Do
        objIE.document.parentWindow.Scroll 0&, 99999 ' Scrolls Down the Browser
        nextPageElement.Click 'next web page
    Do While objIE.Busy = True Or objIE.readyState <> 4
   
    Loop
        Set HTML = objIE.document
        pageNumber = pageNumber + 1

'##################################### Restart Loop ##################################
 GoTo StartForLoop_Restart ' use GOTo command and label to reinitiate the CODE AS WOULD NOT LOOP TO NEXT PAGE
'##################################### Restart Loop ##################################

     Loop

        objIE.Quit ' end and clear browser
            Set objIE = Nothing
            Set HTML = Nothing
            Set nextPageElement = Nothing
            Set HtmlText = Nothing
            Set element = Nothing

End Sub

As always thanks in advance.


Solution

  • Make sure to skip the first element within your returned collection. Keeping to your code.

    Private Sub CommandButton1_Click()
    
        Dim HTML As HTMLDocument
        Dim objIE As Object
        Dim result As String
        Dim pageNumber As Long                       ' page no.
        Dim nextPageElement As Object                'page element
        Dim HtmlText As Variant
        Dim wsSheet As Worksheet
        Dim wb As Workbook
        Dim sht As Worksheet
    
        Set wb = ThisWorkbook
        Set wsSheet = wb.Sheets("Sheet1")
        Set sht = ThisWorkbook.Worksheets("Sheet1")
    
        '+++++ Internet Explorer ++++++
        Set objIE = New InternetExplorer
        objIE.Visible = True
        objIE.navigate Sheets("Sheet1").Range("A2").Value & Replace(Worksheets("Sheet1").Range("B2") & Range("C2").Value, " ", "+") 'navigate IE to this web page
    
        Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop 'wait here a few seconds while the browser is busy
        'Set ie = CreateObject("InternetExplorer.Application")
    
        '######################################## RESTART CODE FROM HERE ####################################
    StartForLoop_Restart:                            'Restart the code HERE, this is the key part
        '######################################## RESTART CODE FROM HERE ####################################
    
        '  Application.ScreenUpdating = False
        Set HTML = objIE.document
        Set elements = HTML.getElementsByClassName("s-item__wrapper clearfix") ' parent CLASS
        'FOR LOOP
        Dim counter As Long
        counter = 0
        
        For Each element In elements
            If counter > 0 Then
            ''' Element 1
            If element.getElementsByClassName("s-item__link")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "A").End(xlUp).Row + 1, "A").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("s-item__link")(0).href
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "A").End(xlUp).Row + 1, "A").Value = HtmlText
            End If
            ''' Element 2
            If element.getElementsByClassName("s-item__link")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("s-item__link")(0).innerText 'src
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = HtmlText
            End If
            ''' Element 3
            If element.getElementsByClassName("s-item__price")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("s-item__price")(0).innerText
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = HtmlText
            End If
            ''' Element 4
            If element.getElementsByClassName("SECONDARY_INFO")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "D").End(xlUp).Row + 1, "D").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("SECONDARY_INFO")(0).innerText
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "D").End(xlUp).Row + 1, "D").Value = HtmlText
            End If
            ''' Element 5
            If element.getElementsByClassName("STRIKETHROUGH")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "E").End(xlUp).Row + 1, "E").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("STRIKETHROUGH")(0).innerText
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "E").End(xlUp).Row + 1, "E").Value = HtmlText
            End If
            ''' Element 6
            If element.getElementsByClassName("s-item__discount s-item__discount")(0) Is Nothing Then
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "F").End(xlUp).Row + 1, "F").Value = "-"
            Else
                HtmlText = element.getElementsByClassName("s-item__discount s-item__discount")(0).innerText
                wsSheet.Cells(sht.Cells(sht.Rows.Count, "F").End(xlUp).Row + 1, "F").Value = HtmlText
            End If
            ' Application.ScreenUpdating = True
            End If
            counter = counter + 1
        Next element
        
        Do
            'Searches Number of Pages entered in
            If pageNumber >= Replace$(Worksheets("Sheet1").Range("d2").Value, "", "+") Then Exit Do
            
            Set nextPageElement = HTML.getElementsByClassName("pagination__next")(0) ' CLICK TO NEXT PAGE
            
            If nextPageElement Is Nothing Then Exit Do
            
            objIE.document.parentWindow.Scroll 0&, 99999 ' Scrolls Down the Browser
            nextPageElement.Click                    'next web page
            
            Do While objIE.Busy = True Or objIE.readyState <> 4
            DoEvents
            Loop
            
            Set HTML = objIE.document
            
            pageNumber = pageNumber + 1
            counter = 0
            '##################################### Restart Loop ##################################
            GoTo StartForLoop_Restart                ' use GOTo command and label to reinitiate the CODE AS WOULD NOT LOOP TO NEXT PAGE
            '##################################### Restart Loop ##################################
    
        Loop
    
        objIE.Quit                                   ' end and clear browser
        Set objIE = Nothing
        Set HTML = Nothing
        Set nextPageElement = Nothing
        Set HtmlText = Nothing
        Set element = Nothing
    
    End Sub