Getting Website Info Without Using a WebBrowser Control

In my original post:

I demonstrated how we could use Internet Explorer automation to get information back from any site.

Then I demonstrated how you could do the same using the WebBrowser control in my article

Now these techniques are great if you want user interaction and to give users control over certain aspects, since they can interact with the IE/WebBrowser control.

But what if you don’t need any user interaction?

There is another, simple, approach that can do exactly that all in VBA without any need for Internet Explorer or a form/webbrowser control!

Let’s dive in!

Using The MSHTML createDocumentFromUrl Method

This approach is quite simple, we create an HTMLDocument object and then use it to execute the createDocumentFromUrl method to create another HTMLDocument object that we can work with.  Sounds complicated, but it truly isn’t.

Here’s what it looks like in practice:

'Requires Reference to: Microsoft HTML Object Library
'Loads a web page via MSHTML's createDocumentFromUrl (no Internet Explorer,
'no WebBrowser control) and prints its META tags, H1/H2 headings and the
'article published date to the Immediate window.
Sub MSHTML_GetURLInfo()
    Dim oHTMLDoc              As MSHTML.HTMLDocument
    Dim oIHTMLDoc             As MSHTML.IHTMLDocument2
    Dim oURLHTMLDoc           As MSHTML.HTMLDocument
    Dim oElements             As MSHTML.IHTMLElementCollection
    Dim i                     As Long
    Const sURL                As String = "https://www.devhut.net/avoiding-the-followhyperlink-security-warning/"

    Set oHTMLDoc = New MSHTML.HTMLDocument
    'createDocumentFromUrl loads the page asynchronously into a new document
    Set oIHTMLDoc = oHTMLDoc.createDocumentFromUrl(sURL, vbNullString)
    'Wait until the page has fully loaded before reading from it
    Do Until oIHTMLDoc.ReadyState = "complete"
        DoEvents
    Loop

    '    oIHTMLDoc.title                                'Page Title
    '    oIHTMLDoc.location                             'URL
    Set oURLHTMLDoc = oIHTMLDoc.body.Document

    '    oURLHTMLDoc.documentElement.outerHTML          'Full Page source

    'Extract all the META tag content
    'Cache each collection once rather than re-querying the DOM every iteration
    Set oElements = oURLHTMLDoc.getElementsByTagName("meta")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).Name, oElements(i).content
    Next

    Debug.Print
    'Extract Heading1 Elements
    Set oElements = oURLHTMLDoc.getElementsByTagName("h1")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).innerHTML
    Next

    Debug.Print
    'Extract Heading2 Elements
    Set oElements = oURLHTMLDoc.getElementsByTagName("h2")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).innerHTML
    Next

    Debug.Print
    'Extract Article Published Date
    '(the published date is the <time> element carrying class="entry-date")
    Set oElements = oURLHTMLDoc.getElementsByTagName("time")
    For i = 0 To oElements.length - 1
        If oElements(i).getAttribute("class") = "entry-date" Then
            Debug.Print i + 1, "Published", oElements(i).innerHTML
        End If
    Next

    Set oElements = Nothing
    Set oURLHTMLDoc = Nothing
    Set oIHTMLDoc = Nothing
    Set oHTMLDoc = Nothing
End Sub

 

Using MSXML2.XMLHTTP

Another similar approach would be to use MSXML2.XMLHTTP and the code for that would look like:

'Requires Reference to: Microsoft HTML Object Library
'Fetches a web page with MSXML2.XMLHTTP (synchronous GET) and prints its
'META tags, H1/H2 headings and the article published date to the
'Immediate window.
Sub XMLHTTP_GetURLInfo()
    Dim oHttp                 As Object
    Dim oURLHTMLDoc           As MSHTML.HTMLDocument
    Dim oElements             As MSHTML.IHTMLElementCollection
    Dim i                     As Long
    Const sURL                As String = "https://www.devhut.net/avoiding-the-followhyperlink-security-warning/"

    On Error GoTo Error_Handler

    '    Set oHttp = CreateObject("MSXML2.ServerXMLHTTP")
    Set oHttp = CreateObject("MSXML2.XMLHTTP.6.0")

    Call oHttp.Open("GET", sURL, False)    'False = synchronous request
    Call oHttp.Send
    'Check for any errors reported by the server (4xx client / 5xx server)
    If oHttp.Status >= 400 And oHttp.Status <= 599 Then
        GoTo Error_Handler
    End If

    'Load the returned HTML into a document object we can query.
    '(No ReadyState wait is needed: the request was synchronous and we are
    'assigning the markup directly, not navigating to a URL.)
    Set oURLHTMLDoc = New MSHTML.HTMLDocument    'CreateObject("HTMLFile")
    oURLHTMLDoc.body.innerHTML = oHttp.ResponseText

    '    oURLHTMLDoc.documentElement.outerHTML          'Full Page source

    'Extract all the META tag content
    'Cache each collection once rather than re-querying the DOM every iteration
    Set oElements = oURLHTMLDoc.getElementsByTagName("meta")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).Name, oElements(i).content
    Next

    Debug.Print
    'Extract Heading1 Elements
    Set oElements = oURLHTMLDoc.getElementsByTagName("h1")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).innerHTML
    Next

    Debug.Print
    'Extract Heading2 Elements
    Set oElements = oURLHTMLDoc.getElementsByTagName("h2")
    For i = 0 To oElements.length - 1
        Debug.Print i + 1, oElements(i).innerHTML
    Next

    Debug.Print
    'Extract Article Published Date
    '(the published date is the <time> element carrying class="entry-date")
    Set oElements = oURLHTMLDoc.getElementsByTagName("time")
    For i = 0 To oElements.length - 1
        If oElements(i).getAttribute("class") = "entry-date" Then
            Debug.Print i + 1, "Published", oElements(i).innerHTML
        End If
    Next

Error_Handler_Exit:
    On Error Resume Next
    Set oElements = Nothing
    Set oURLHTMLDoc = Nothing
    'IXMLHTTPRequest has no Close method; releasing the reference is enough
    Set oHttp = Nothing
    Exit Sub

Error_Handler:
    'Err.Number = 0 means we jumped here manually after an HTTP error status;
    'anything else is a genuine VBA runtime error (e.g. no network, bad URL)
    If Err.Number = 0 Then
        MsgBox "The following error has occurred." & vbCrLf & vbCrLf & _
               "Error Number: " & oHttp.Status & vbCrLf & _
               "Error Source: XMLHTTP_GetURLInfo" & vbCrLf & _
               "Error Description: " & oHttp.StatusText, _
               vbCritical, "An Error has Occurred!"
    Else
        MsgBox "The following error has occurred." & vbCrLf & vbCrLf & _
               "Error Number: " & Err.Number & vbCrLf & _
               "Error Source: XMLHTTP_GetURLInfo" & vbCrLf & _
               "Error Description: " & Err.Description & _
               Switch(Erl = 0, "", Erl <> 0, vbCrLf & "Line No: " & Erl) _
               , vbOKOnly + vbCritical, "An Error has Occurred!"
    End If
    Resume Error_Handler_Exit
End Sub

A Few Resources on the Subject

5 responses on “Getting Website Info Without Using a WebBrowser Control