Search code examples
html excel vba web-scraping screen-scraping

How to isolate multiple innerText entries when using getElementById


I'm trying to isolate 2 different innerText strings from a webpage, but cannot single them out. The innerText for all the tags comes as a whole. The date and season number are the issue.

I am using getElementById and this gives me a single element. The div with id "next_episode" has what looks like 2 different entries for inner text that I'm interested in. When I loop through the inner text of its children, these 2 entries are skipped. I can't figure out how to isolate the 2 different innerText entries of just the "next_episode" tag. I'm isolating the text I require by using the index number in the arrays my code returns.

'Opens the show page in a visible Internet Explorer window and prints the
'inner text of the "next_episode" element, first child by child and then
'as a whole.
Dim IE_00 As SHDocVw.InternetExplorer
Dim HTMLDoc_00 As MSHTML.HTMLDocument
Set IE_00 = New SHDocVw.InternetExplorer
IE_00.Visible = True

IE_00.navigate "https://next-episode.net/final-space"
'Wait for the page to finish loading; DoEvents yields to the host so the
'application does not freeze while polling
Do While IE_00.Busy Or IE_00.readyState <> READYSTATE_COMPLETE
    DoEvents
Loop
Set HTMLDoc_00 = IE_00.document

Dim NETC_05 As MSHTML.IHTMLElementCollection
Dim NET_05 As MSHTML.IHTMLElement

'Can loop through the inner text of the children one by one and find what
'I need

Set NETC_05 = HTMLDoc_00.getElementById("next_episode").Children

For Each NET_05 In NETC_05
    Debug.Print NET_05.innerText
Next NET_05

'This just gives a big block of text that includes the missing inner text
'I need

Set NET_05 = HTMLDoc_00.getElementById("next_episode")
Debug.Print NET_05.innerText

Solution

  • 'Setting XML 05 as an Object
        Dim XML_05 As New MSXML2.XMLHTTP60
    'Setting HTML Document 05 as an Object
        Dim HTML_05 As New MSHTML.HTMLDocument
    
    'Fetch the page synchronously (third argument False) and load the
    'response into HTML_05 so it can be queried without a browser window
        XML_05.Open "GET", Cells(Row, NextEpisodeURL).Value, False
        XML_05.send
        HTML_05.body.innerHTML = XML_05.responseText
    
    'Setting Net Element Tag Collection 05 as an Object
        Dim NETC_05 As MSHTML.IHTMLElementCollection
    'Setting Net Element Tag 05 as an Object
        Dim NET_05 As MSHTML.IHTMLElement
    'Setting Next Episode Element 05 as an Object
        Dim NextEp_05 As MSHTML.IHTMLElement
    'Setting Reg EX 05 as an Object
        Dim REO_05 As VBScript_RegExp_55.RegExp
    'Setting Match Object 05 as Object
        Dim MO_05 As Object
    'Setting Block Text 05 as String (inner text of "previous_episode", read once)
        Dim BlockText_05 As String
    'Setting Season array as Array
        Dim SN_05() As String
    'Setting Episode Name 05 as Array
        Dim ENA_05() As String
    'Setting Episode Number 05 as Array
        Dim EN_05() As String
    
    'Getting Episode Name Episode Number and Season Number From Net
    
    'Set NETC_05 = HTML_05.getElementsByClassName("sub_main")
        Set NET_05 = HTML_05.getElementById("previous_episode")
        Set REO_05 = New VBScript_RegExp_55.RegExp
            REO_05.Global = True
            REO_05.IgnoreCase = True
    
        If NET_05 Is Nothing Then
            'getElementById returns Nothing when the id is absent; skip the
            'regex section instead of raising "Object variable not set"
            Debug.Print "Element 'previous_episode' not found"
        Else
            BlockText_05 = NET_05.innerText
    
    'Getting Episode Name
            REO_05.Pattern = "(Name:(.*))"
            Set MO_05 = REO_05.Execute(BlockText_05)
            Debug.Print MO_05.Count
            If MO_05.Count > 0 Then
                Debug.Print MO_05(0).Value
                'Limit of 2 keeps any colon inside the episode name itself
                ENA_05 = Split(MO_05(0).Value, ":", 2)
                Debug.Print ENA_05(1)
                Cells(Row, NextEpName).Value = ENA_05(1)
            End If
        
    'Getting Episode Number
            REO_05.Pattern = "(Episode:([0-9]*))"
            Set MO_05 = REO_05.Execute(BlockText_05)
            Debug.Print MO_05.Count
            If MO_05.Count > 0 Then
                Debug.Print MO_05(0).Value
                EN_05 = Split(MO_05(0).Value, ":", 2)
                Debug.Print EN_05(1)
                Cells(Row, EpisodeNet).Value = EN_05(1)
            End If
        
    'Getting Season Number
            REO_05.Pattern = "(Season:([0-9]*))"
            Set MO_05 = REO_05.Execute(BlockText_05)
            Debug.Print MO_05.Count
            If MO_05.Count > 0 Then
                Debug.Print MO_05(0).Value
                SN_05 = Split(MO_05(0).Value, ":", 2)
                Debug.Print SN_05(1)
                Cells(Row, SeasonNet).Value = SN_05(1)
            End If
        End If
        
    'Getting Countdown From Net
    'The countdown is read from the sixth child (index 5) of "next_episode";
    'guard both the element and the index so a page-layout change does not
    'raise a runtime error
        Set NextEp_05 = HTML_05.getElementById("next_episode")
        If NextEp_05 Is Nothing Then
            Debug.Print "Element 'next_episode' not found"
        Else
            Set NETC_05 = NextEp_05.Children
            If NETC_05.Length > 5 Then
                Cells(Row, Countdown).Value = NETC_05(5).innerText
                Debug.Print NETC_05(5).innerText
            Else
                Debug.Print "Element 'next_episode' has fewer than 6 children"
            End If
        End If
    
    'Downloads the show page with a synchronous XMLHTTP GET, loads the
    'response into an in-memory HTMLDocument, and prints to the Immediate
    'window the text of every "subheadline" element inside "next_episode",
    'followed by the text of the element that comes right after it.
    Public Sub GetShowInfo()
        Dim html As MSHTML.HTMLDocument
        Dim container As Object, aCollection As Object
        Dim ele As Object, sibling As Object
    
        Set html = New HTMLDocument
    
        With CreateObject("Msxml2.xmlhttp")
            .Open "GET", "https://next-episode.net/chicago-fire", False
            .send
            html.body.innerHTML = .responseText
        End With
    
        'getElementById returns Nothing when the id is absent
        Set container = html.getElementById("next_episode")
        If container Is Nothing Then
            Debug.Print "Element 'next_episode' not found"
            Exit Sub
        End If
    
        Set aCollection = container.getElementsByClassName("subheadline")
        For Each ele In aCollection
            Debug.Print ele.innerText
            Debug.Print ele.outerText
    
            'Only the sibling lookup is treated as best-effort: the last
            'element has no sibling, and the property may be unavailable.
            'Error handling is restored immediately afterwards so other
            'failures are not silently swallowed.
            Set sibling = Nothing
            On Error Resume Next
            Set sibling = ele.nextElementSibling
            On Error GoTo 0
    
            If Not sibling Is Nothing Then
                Debug.Print sibling.innerText
            End If
        Next ele
       
    End Sub