Search code examples
htmlexcelvba

How to get full and correct URL from element?


I'm trying to find a way to get the correct and full URL of a link.

Sometimes, with item.href, I get the full url and sometimes only shows about:something or about:..something or about:../something

In this case with my current code, I iterate all links in this URL and want to get the full URL of the link related with text "VLOOKUP". The answer I get with my current code is about:excel_vlookup.php and if I want to simply concatenate the base_url (https://www.w3schools.com/excel/index.php) with Replace("about:excel_vlookup.php","about:",""), I would get a wrong answer like this https://www.w3schools.com/excel/index.phpexcel_vlookup.php instead of https://www.w3schools.com/excel/excel_vlookup.php that is the correct full URL when I put mouse over that link.

May someone know how to do this?

Sub full_url()
Dim htmlDoc As New HTMLDocument
Dim links As Object
Dim i As Integer

    With New ServerXMLHTTP60
        .Open "Get", "https://www.w3schools.com/excel/index.php", False
        .send
        htmlDoc.body.innerHTML = .responseText
    End With
        
    Set links = htmlDoc.body.getElementsByTagName("a")
    
    With links
        For i = 0 To .Length - 1
            If .Item(i).innerText Like "*VLOOKUP*" Then
                Debug.Print .Item(i).href
            End If
        Next
    End With

End Sub

UPDATE Based on @taller's code, it works when href is like "about:something". Still need some tweak if href is a full url.

Sub full_url1()
    Dim htmlDoc As New HTMLDocument
    Dim links As Object
    Dim i As Integer, sUrl As String, aTxt, iCnt As Long
    Const PREFIX = "about:"
    sUrl = "https://www.w3schools.com/excel/index.php"
    aTxt = Split(sUrl, "/")
    iCnt = UBound(aTxt)
    With New ServerXMLHTTP60
        .Open "Get", sUrl, False
        .send
        htmlDoc.body.innerHTML = .responseText
    End With
    
    Set links = htmlDoc.body.getElementsByTagName("a")
    
    With links
        For i = 0 To .Length - 1
            If .Item(i).innerText Like "*Spaces*" Then
                href = .Item(i).href
                If InStr(1, href, "https://") = 1 Then
                    Debug.Print .Item(i).href
                Else
                    aTxt(iCnt) = Mid(href, Len(PREFIX) + 1)
                    Debug.Print Join(aTxt, "/")
                End If
                Exit For
            End If
        Next
    End With
End Sub

Solution

    • Split the URL into an array using "/" as the delimiter and replace the last segment.

    Microsoft documentation:

    Split function

    Join function

    Sub full_url()
        Dim htmlDoc As New HTMLDocument
        Dim links As Object
        Dim i As Integer, sUrl As String, aTxt, iCnt As Long
        Const PREFIX = "about:"
        sUrl = "https://www.w3schools.com/excel/index.php"
        aTxt = Split(sUrl, "/")
        iCnt = UBound(aTxt)
        With New ServerXMLHTTP60
            .Open "Get", sUrl, False
            .send
            htmlDoc.body.innerHTML = .responseText
        End With
        
        Set links = htmlDoc.body.getElementsByTagName("a")
        
        With links
            For i = 0 To .length - 1
                If .Item(i).innerText Like "*Spaces*" Then
                    href = .Item(i).href
                    If InStr(1, href, "https://") = 1 Then
                        Debug.Print .Item(i).href
                    ElseIf InStr(1, href, PREFIX) = 1 Then
                        aTxt(iCnt) = Mid(href, Len(PREFIX) + 1)
                        If Left(aTxt(iCnt), 1) = "/" Then aTxt(iCnt) = Mid(aTxt(iCnt), 2)
                        Debug.Print Join(aTxt, "/")
                    End If
                    Exit For
                End If
            Next
        End With
    End Sub