In my original post:
I demonstrated how we could use Internet Explorer automation to get information back from any site.
Then I demonstrated how you could do the same using the WebBrowser control in my article
Now these techniques are great if you want user interaction and them give them control over certain aspects as they can interact with IE/webbrowser control.
But what if you don’t need any user interaction?
There is another, simple, approach that can do exactly that all in VBA without any need for Internet Explorer or a form/webbrowser control!
Let’s dive in!
Using The MSHTML createDocumentFromUrl Method
This approach is quite simple, we create an HTMLDocument object and then use it to execute the createDocumentFromUrl method to create another HTMLDocument object that we can work with. Sounds complicated, but it truly isn’t.
Here’s what is looks like in practice:
'Requires Reference to: Microsoft HTML Object Library
Sub MSHTML_GetURLInfo()
Dim oHTMLDoc As MSHTML.HTMLDocument
Dim oIHTMLDoc As MSHTML.IHTMLDocument2
Dim oURLHTMLDoc As MSHTML.HTMLDocument
Dim i As Long
Const sURL As String = "https://www.devhut.net/avoiding-the-followhyperlink-security-warning/"
Set oHTMLDoc = New MSHTML.HTMLDocument
Set oIHTMLDoc = oHTMLDoc.createDocumentFromUrl(sURL, vbNullString)
Do Until oIHTMLDoc.ReadyState = "complete"
DoEvents
Loop
' oIHTMLDoc.title 'Page Title
' oIHTMLDoc.location 'URL
Set oURLHTMLDoc = oIHTMLDoc.body.Document
' oURLHTMLDoc.documentElement.outerHTML 'Full Page source
'Extract all the META tag content
For i = 0 To oURLHTMLDoc.getElementsByTagName("meta").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("meta")(i).Name, _
oURLHTMLDoc.getElementsByTagName("meta")(i).content
Next
Debug.Print
'Extract Heading1 Elements
For i = 0 To oURLHTMLDoc.getElementsByTagName("h1").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("h1")(i).innerHTML
Next
Debug.Print
'Extract Heading2 Elements
For i = 0 To oURLHTMLDoc.getElementsByTagName("h2").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("h2")(i).innerHTML
Next
Debug.Print
'Extract Article Published Date
For i = 0 To oURLHTMLDoc.getElementsByTagName("time").length - 1
If oURLHTMLDoc.getElementsByTagName("time")(i).getAttribute("class") = "entry-date" Then
Debug.Print i + 1, "Published", oURLHTMLDoc.getElementsByTagName("time")(i).innerHTML
End If
Next
Set oURLHTMLDoc = Nothing
Set oIHTMLDoc = Nothing
Set oHTMLDoc = Nothing
End Sub
Using MSXML2.XMLHTTP
Another similar approach would be to use MSXML2.XMLHTTP and the code for that would look like:
'Requires Reference to: Microsoft HTML Object Library
Sub XMLHTTP_GetURLInfo()
Dim oHttp As Object
Dim oURLHTMLDoc As MSHTML.HTMLDocument
Dim i As Long
Const sURL As String = "https://www.devhut.net/avoiding-the-followhyperlink-security-warning/"
On Error GoTo Error_Handler
' Set oHttp = CreateObject("MSXML2.ServerXMLHTTP")
Set oHttp = CreateObject("MSXML2.XMLHTTP.6.0")
Call oHttp.Open("GET", sURL, False)
Call oHttp.Send
'Check for any errors reported by the server
If oHttp.Status >= 400 And oHttp.Status <= 599 Then
GoTo Error_Handler
End If
Set oURLHTMLDoc = New MSHTML.HTMLDocument 'CreateObject("HTMLFile")
Do Until oURLHTMLDoc.ReadyState = "complete"
DoEvents
Loop
Set oURLHTMLDoc = New MSHTML.HTMLDocument 'CreateObject("HTMLFile")
oURLHTMLDoc.body.innerHTML = oHttp.ResponseText
' oURLHTMLDoc.documentElement.outerHTML 'Full Page source
'Extract all the META tag content
For i = 0 To oURLHTMLDoc.getElementsByTagName("meta").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("meta")(i).Name, _
oURLHTMLDoc.getElementsByTagName("meta")(i).content
Next
Debug.Print
'Extract Heading1 Elements
For i = 0 To oURLHTMLDoc.getElementsByTagName("h1").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("h1")(i).innerHTML
Next
Debug.Print
'Extract Heading2 Elements
For i = 0 To oURLHTMLDoc.getElementsByTagName("h2").length - 1
Debug.Print i + 1, oURLHTMLDoc.getElementsByTagName("h2")(i).innerHTML
Next
Debug.Print
'Extract Article Published Date
For i = 0 To oURLHTMLDoc.getElementsByTagName("time").length - 1
If oURLHTMLDoc.getElementsByTagName("time")(i).getAttribute("class") = "entry-date" Then
Debug.Print i + 1, "Published", oURLHTMLDoc.getElementsByTagName("time")(i).innerHTML
End If
Next
Error_Handler_Exit:
On Error Resume Next
Set oURLHTMLDoc = Nothing
Call oHttp.Close
Set oHttp = Nothing
Exit Sub
Error_Handler:
If oHttp.Status >= 400 And oHttp.Status <= 599 Then
MsgBox "The following error has occurred." & vbCrLf & vbCrLf & _
"Error Number: " & oHttp.Status & vbCrLf & _
"Error Source: XMLHTTP_GetURLInfo" & vbCrLf & _
"Error Description: " & oHttp.StatusText, _
vbCritical, "An Error has Occurred!"
Else
MsgBox "The following error has occured" & vbCrLf & vbCrLf & _
"Error Number: " & Err.Number & vbCrLf & _
"Error Source: XMLHTTP_GetURLInfo" & vbCrLf & _
"Error Description: " & Err.Description & _
Switch(Erl = 0, "", Erl <> 0, vbCrLf & "Line No: " & Erl) _
, vbOKOnly + vbCritical, "An Error has Occured!"
End If
Resume Error_Handler_Exit
End Sub
hello
i was using this method : https://www.tek-tips.com/faqs.cfm?fid=7299
that is working for me , but take so much time to load the page and meanwhile Microsoft Access would not respond untill it is done… with your method, it would same a lot of time right ?
Access is single threaded, so it will always delay, become unresponsive, when performing any action.
i tried to change the url to : https://www.ebay.fr/itm/133892366265
it opens internet explorer with url : about:blank#https://www.ebay.fr/itm/133892366265
any advice to avoid that ?
Without seeing your code it is impossible to comment, but it seems like you are appending the url rather than overwriting it.
Hello!
Could you figured out how to solve your problem?