Tweaked Internet Explorer web scrape code will not work using Get request

Indominus

Board Regular
Joined
Jul 11, 2020
Messages
98
Office Version
  1. 2016
Platform
  1. Windows
Hi. So I have this web scrape code that pulls data from a work website using Internet Explorer. Works fine. However, when there is a lot of data it is slow. So I have been trying to convert it using Get request since it seems to be quicker. However, it pulls nothing. Running the code step by step it skips all lines after “For Each Div in Divs” and goes straight to End sub. Here are my the two codes. I thought I only changed what needed to in order to switch it form IE to Get request. Thank you in advance!

VBA Code:
[/
Dim Document    As HTMLDocument
'
    Dim Div         As IHTMLElement
    Dim H3          As IHTMLElement
    Dim Table       As IHTMLElement
    Dim TD          As IHTMLElement
    Dim TR          As IHTMLElement
'
    Dim Divs        As IHTMLElementCollection
    Dim Tables      As IHTMLElementCollection
    Dim TDs         As IHTMLElementCollection
    Dim TRs         As IHTMLElementCollection
'
    Dim Column      As Integer
    Dim Row         As Integer
'
    Dim Browser     As InternetExplorer
'
    Dim URL         As String
'
    Dim ws          As Worksheet
'
    Set ws = ThisWorkbook.Worksheets("Setup")
'
    Row = 1
    Column = 1
'
    Set ws = Sheets("PROCESS")
'
    ws.Cells.Clear
'
    URL = "Work URL"
'
    Set Browser = New InternetExplorerMedium
'
    Browser.navigate URL
'
'   Wait for page to load
    Do While Browser.Busy Or Browser.readyState <> READYSTATE_COMPLETE
        DoEvents
    Loop
'
'   Scan the document
    Set Document = Browser.Document
'
    Set Divs = Document.getElementById("secondaryProductivityList").getElementsByTagName("div")
'
    For Each Div In Divs
        Set H3 = Div.getElementsByTagName("h3")(0)
'
        If Not Div.className = "floatHeader" And Not H3 Is Nothing Then
            ws.Cells(Row, 1).Value = H3.innerText
            Row = Row + 1
'
            Set Tables = Div.getElementsByTagName("table")
            Set Table = Tables(0)
            Set TRs = Table.getElementsByTagName("tr")
'
            For Each TR In TRs
                Column = 1
'
                Set TDs = TR.getElementsByTagName("th")
'
                For Each TD In TDs
                    ws.Cells(Row, Column).Value = TD.innerText
                    ws.Cells(Row, Column).Font.Bold = True
'
                    If TD.getAttribute("colspan") Then
                        Column = Column + TD.getAttribute("colspan")
                    Else
                        Column = Column + 1
                    End If
                Next
'
                Set TDs = TR.getElementsByTagName("td")
'
                For Each TD In TDs
                    ws.Cells(Row, Column).Value = TD.innerText
                    Column = Column + 1
                Next
'
                Row = Row + 1
            Next
        End If
'
        Row = Row + 1
    Next
'
    Browser.Quit
'
  

Get Request Code


Dim Table As IHTMLElement
Dim Tables As IHTMLElementCollection
Dim Div As IHTMLElement
Dim Divs As IHTMLElementCollection
Dim H3 As IHTMLElement
Dim TR As IHTMLElement
Dim TRs  As IHTMLElementCollection
Dim TD As IHTMLElement
Dim TDs  As IHTMLElementCollection
Dim Row As Integer
Dim Column As Integer

Dim ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Setup")
  
Row = 1
Column = 1
Set ws = Sheets("PROCESS")
ws.Cells.Clear

Dim H As Object, doc As New HTMLDocument
Set H = CreateObject("WinHTTP.WinHTTPRequest.5.1")

URL = "WORK URL"

Debug.Print URL

ReTry:

H.SetAutoLogonPolicy 0
H.setTimeouts 0, 0, 0, 0
H.Open "GET", URL, False
H.send
H.waitForResponse

If H.Status <> 200 Then
    MsgBox H.Status & " - " & H.statusText
    Exit Sub
End If

Debug.Print URL

doc.body.innerHTML = H.responseText

Set Divs =doc.getElementById("secondaryProductivityList").getElementsByTagName("div")

For Each Div In Divs
    Set H3 = Div.getElementsByTagName("h3")(0)

    If Not Div.className = "floatHeader" And Not H3 Is Nothing Then
        ws.Cells(Row, 1).Value = H3.innerText
        Row = Row + 1

        Set Tables = Div.getElementsByTagName("table")
        Set Table = Tables(0)
        Set TRs = Table.getElementsByTagName("tr")
        For Each TR In TRs
            Column = 1
            Set TDs = TR.getElementsByTagName("th")
            For Each TD In TDs
                ws.Cells(Row, Column).Value = TD.innerText
                ws.Cells(Row, Column).Font.Bold = True
                If TD.getAttribute("colspan") Then
                    Column = Column + TD.getAttribute("colspan")
                Else
                    Column = Column + 1
                End If
            Next
       
            Set TDs = TR.getElementsByTagName("td")
            For Each TD In TDs
                ws.Cells(Row, Column).Value = TD.innerText
                Column = Column + 1
            Next
  
            Row = Row + 1
        Next
    End If
    Row = Row + 1
Next


End Sub
 
Last edited:

Excel Facts

Which Excel functions can ignore hidden rows?
The SUBTOTAL and AGGREGATE functions ignore hidden rows. AGGREGATE can also exclude error cells and more.

diddi

Well-known Member
Joined
May 20, 2004
Messages
3,297
Office Version
  1. 2010
Platform
  1. Windows
you need to check your URL network inspector to see if it should be a POST rather than a GET

heres a sample xmlhttp to look at also

VBA Code:
Sub ScrapeNow2()
    Dim xmlhttp As XMLHTTP60
    Dim oDom  As HTMLDocument

    Set xmlhttp = New MSXML2.ServerXMLHTTP60
    Set oDom = New HTMLDocument
    
    MyURL = "https://..."
            With xmlhttp
                .Open "POST", MyURL, False
                .send
                oDom.body.innerHTML = .responseText
            End With
etc.... this gives access the the DOM model for a html document
 

Indominus

Board Regular
Joined
Jul 11, 2020
Messages
98
Office Version
  1. 2016
Platform
  1. Windows
you need to check your URL network inspector to see if it should be a POST rather than a GET

heres a sample xmlhttp to look at also

VBA Code:
Sub ScrapeNow2()
    Dim xmlhttp As XMLHTTP60
    Dim oDom  As HTMLDocument

    Set xmlhttp = New MSXML2.ServerXMLHTTP60
    Set oDom = New HTMLDocument
   
    MyURL = "https://..."
            With xmlhttp
                .Open "POST", MyURL, False
                .send
                oDom.body.innerHTML = .responseText
            End With
etc.... this gives access the the DOM model for a html document
So if it is a Post do I do this and put the rest of my same variables and coding?
 

diddi

Well-known Member
Joined
May 20, 2004
Messages
3,297
Office Version
  1. 2010
Platform
  1. Windows
change GET to POST
 

Indominus

Board Regular
Joined
Jul 11, 2020
Messages
98
Office Version
  1. 2016
Platform
  1. Windows

ADVERTISEMENT

change GET to POST
So upon inspecting the page it used GET. I watched a tutorial video on Youtube and got this code. However, it is giving me error 13 "mismatch" on line. I verified every line is exactly the same on the IE code starting on this line. So not sure why the error.
Set Divs = Document.getElementById("secondaryProductivityList").getElementsByTagName("div"). Any ideas? Here is the new code. Thank you by the way.

VBA Code:
    Dim Table As MSHTML.IHTMLElement
    Dim Tables As MSHTML.HTMLElementCollection
    Dim Div As MSHTML.IHTMLElement
    Dim Divs As MSHTML.HTMLElementCollection
    Dim H3 As MSHTML.IHTMLElement
    Dim TR As MSHTML.IHTMLElement
    Dim TRs  As MSHTML.HTMLElementCollection
    Dim TD As MSHTML.IHTMLElement
    Dim TDs  As MSHTML.HTMLElementCollection
    Dim Row As Integer
    Dim Column As Integer
 

    Row = 1
    Column = 1
 

    Dim req As New MSXML2.XMLHTTP60
    Dim reqURL As String
    Dim Document As New MSHTML.HTMLDocument
 
    reqURL = "Work email"
 
  
                req.Open "GET", reqURL, False
                req.send
             
                If req.Status <> 200 Then
    MsgBox req.Status & " - " & req.statusText
    Exit Sub
End If

        Document.body.innerHTML = req.responseText
         
 

 
    Set Divs = Document.getElementById("secondaryProductivityList").getElementsByTagName("div")
    For Each Div In Divs
        Set H3 = Div.getElementsByTagName("h3")(0)
     
        If Not Div.className = "floatHeader" And Not H3 Is Nothing Then
            ws.Cells(Row, 1).Value = H3.innerText
            Row = Row + 1
         
            Set Tables = Div.getElementsByTagName("table")
            Set Table = Tables(0)
            Set TRs = Table.getElementsByTagName("tr")
            For Each TR In TRs
                Column = 1
                Set TDs = TR.getElementsByTagName("th")
                For Each TD In TDs
                    ws.Cells(Row, Column).Value = TD.innerText
                    ws.Cells(Row, Column).Font.Bold = True
                    If TD.getAttribute("colspan") Then
                        Column = Column + TD.getAttribute("colspan")
                    Else
                        Column = Column + 1
                    End If
                    Next
                 
                Set TDs = TR.getElementsByTagName("td")
                For Each TD In TDs
                    ws.Cells(Row, Column).Value = TD.innerText
                    Column = Column + 1
                    Next
             
            Row = Row + 1
            Next
        End If
    Row = Row + 1
    Next
 
 
End Sub
 

Marc L

Active Member
Joined
Apr 5, 2021
Messages
423
Office Version
  1. 2010
Platform
  1. Windows
Hi, as it seems obvious the code can't be the same when piloting IE​
as when using a HTMLDocument two 'getElements' statements can't be combined in the same codeline !​
A 'how to' sample in this thread :​
 

Indominus

Board Regular
Joined
Jul 11, 2020
Messages
98
Office Version
  1. 2016
Platform
  1. Windows

ADVERTISEMENT

Hi, as it seems obvious the code can't be the same when piloting IE​
as when using a HTMLDocument two 'getElements' statements can't be combined in the same codeline !​
A 'how to' sample in this thread :​
Hi, I looked through it. I see that that code only has one Get Elements statement per line but overall it is a little different from mine. Not a lot of those lines overall. So how exactly would I separate those two Get Element statements in mine to have it function correctly? Thank you so much for your help by the way. Been working on this for 3 days now
 

Marc L

Active Member
Joined
Apr 5, 2021
Messages
423
Office Version
  1. 2010
Platform
  1. Windows
Wrong so you misread / misunderstood as it's exactly the same case !​
As well shown in the sample each 'getElements' must have its own HTMLDocument …​
 

Indominus

Board Regular
Joined
Jul 11, 2020
Messages
98
Office Version
  1. 2016
Platform
  1. Windows
Wrong so you misread / misunderstood as it's exactly the same case !​
As well shown in the sample each 'getElements' must have its own HTMLDocument …​
Oh okay. I see it now. I edited it to separate them. However, now I get error code 91, “object variable or with block variable not set”. On the new line. Set LPs = LP.getElementsByTagName("div"). Here is the new code I have. Thanks!

VBA Code:
Dim Table As MSHTML.IHTMLElement

Dim Tables As MSHTML.HTMLElementCollection

Dim Div As MSHTML.IHTMLElement

Dim Divs As MSHTML.HTMLElementCollection

Dim H3 As MSHTML.IHTMLElement

Dim TR As MSHTML.IHTMLElement

Dim TRs As MSHTML.HTMLElementCollection

Dim TD As MSHTML.IHTMLElement

Dim TDs As MSHTML.HTMLElementCollection

Dim LPs As MSHTML.HTMLElementCollection

Dim LP As MSHTML.IHTMLElement

Dim Row As Integer

Dim Column As Integer





Row = 1

Column = 1





Dim req As New MSXML2.XMLHTTP60

Dim reqURL As String

Dim Document As New MSHTML.HTMLDocument



reqURL = "Work URL”


req.Open "GET", reqURL, False

req.send



If req.Status <> 200 Then

MsgBox req.Status & " - " & req.statusText

Exit Sub

End If



Document.body.innerHTML = req.responseText


Set Divs = Document.getElementById("secondaryProductivityList")

Set LPs = LP.getElementsByTagName("div")



For Each Div In Divs

Set H3 = Div.getElementsByTagName("h3")(0)



If Not Div.className = "floatHeader" And Not H3 Is Nothing Then

ws.Cells(Row, 1).Value = H3.innerText

Row = Row + 1



Set Tables = Div.getElementsByTagName("table")

Set Table = Tables(0)

Set TRs = Table.getElementsByTagName("tr")

For Each TR In TRs

Column = 1

Set TDs = TR.getElementsByTagName("th")

For Each TD In TDs

ws.Cells(Row, Column).Value = TD.innerText

ws.Cells(Row, Column).Font.Bold = True

If TD.getAttribute("colspan") Then

Column = Column + TD.getAttribute("colspan")

Else

Column = Column + 1

End If

Next



Set TDs = TR.getElementsByTagName("td")

For Each TD In TDs

ws.Cells(Row, Column).Value = TD.innerText

Column = Column + 1

Next



Row = Row + 1

Next

End If

Row = Row + 1

Next





End Sub
 

Marc L

Active Member
Joined
Apr 5, 2021
Messages
423
Office Version
  1. 2010
Platform
  1. Windows
Use a break point or the step by step debug mode to check in the VBE Locals window if the variable LP is different than Nothing​
as I guess you just have forgotten to allocate / initialize it !​
 

Watch MrExcel Video

Forum statistics

Threads
1,129,362
Messages
5,635,825
Members
416,885
Latest member
hs11

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Top