Updating web scraping code


Well-known Member
Apr 22, 2007
Office Version
Re: Need help on updating this code

Its ok i have got it sorted, just need to add your line on deleting duplicates,

Option Explicit
'VBE > Tools > References > HTML Object Library
Public Sub test()
    Dim ie As Object, ws As Worksheet, wsUrls As Worksheet, urls()
    Set ie = CreateObject("InternetExplorer.Application")
    Set ws = ThisWorkbook.Worksheets("Scraper")
    Set wsUrls = ThisWorkbook.Worksheets("Url List")

    With wsUrls
        urls = Application.Transpose(.Range("A2:A" & .Cells(.rows.Count, "A").End(xlUp).Row).Value)
    End With
    Dim results(), r As Long
    ReDim results(1 To UBound(urls), 1 To 2)

    With ie
        .Visible = True

        For r = LBound(urls) To UBound(urls)
            .Navigate2 urls(r)

            While .Busy Or .readyState < 4: DoEvents: Wend

            With .document

                Dim email As String, website As String, iconCssSelector As String
                'iconCssSelector for website icon in this instance
                iconCssSelector = "[src='https://static.xx.fbcdn.net/rsrc.php/v3/yV/r/EaDvTjOwxIV.png']"

                If ElementIsPresent(ie.document, "[href^=mailto]") Then
                    email = ie.document.querySelector("[href^=mailto]").innerText
                    email = "Not found"
                End If

                Dim parents As Object, sharedParentCssSelector As String, childOfSiblingCssSelector As String
                sharedParentCssSelector = "._5aj7" 'target parent of both icon and the website link
                childOfSiblingCssSelector = "._50f4" '< to target website address after finding right parent

                If ElementIsPresent(ie.document, iconCssSelector) _
        And ElementIsPresent(ie.document, sharedParentCssSelector) Then

                    Set parents = ie.document.querySelectorAll(sharedParentCssSelector) 'css selector used to allow for greater flexibility in element matching
                    website = GetText(ie.document, parents, iconCssSelector, childOfSiblingCssSelector)
                    website = "Not found"
                End If
            End With
            'Assumes headers already present
            Dim nextRow As Long
            results(r, 1) = email
            results(r, 2) = website
    End With
    nextRow = GetLastRow(ws, 1) + 1
    ws.Cells(nextRow, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

Public Function ElementIsPresent(ByVal document As HTMLDocument, ByVal cssSelector As String) As Boolean
    ElementIsPresent = document.querySelectorAll(cssSelector).length > 0
End Function

Public Function GetText(ByVal document As HTMLDocument, ByVal parents As Object, ByVal iconCssSelector As String, ByVal childOfSiblingCssSelector As String) As String
    'in this instance and with microsoft IE DOM you cannot select for parent of an element with pseudo class _
    of :has(>child); nor use :contains... instead pass expected parent selector, that houses _
    both the icon element for website and the website address itself, and loop all matches checking for website icon _
    if found use childOfSiblingCssSelector to extract
    Dim i As Long, html As HTMLDocument
    Set html = New HTMLDocument

    For i = 0 To parents.length - 1
        html.body.innerHTML = parents.item(i).innerHTML
        If ElementIsPresent(html, iconCssSelector) Then
            GetText = html.querySelector(childOfSiblingCssSelector).innerText
            Exit Function
        End If
    GetText = "Not found"
End Function

Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.rows.Count, columnNumber).End(xlUp).Row
    End With
End Function
Last edited:

Some videos you may like

Excel Facts

Square and cube roots
The =SQRT(25) is a square root. For a cube root, use =125^(1/3). For a fourth root, use =625^(1/4).

Watch MrExcel Video

Forum statistics

Latest member

This Week's Hot Topics