Updating web scraping code

Sharid

Well-known Member
Joined
Apr 22, 2007
Messages
847
Office Version
2016
Platform
Windows
Re: Need help on updating this code

Its ok i have got it sorted, just need to add your line on deleting duplicates,

Code:
Option Explicit
'VBE > Tools > References > HTML Object Library
Public Sub test()
    Dim ie As Object, ws As Worksheet, wsUrls As Worksheet, urls()
    Set ie = CreateObject("InternetExplorer.Application")
    Set ws = ThisWorkbook.Worksheets("Scraper")
    Set wsUrls = ThisWorkbook.Worksheets("Url List")

    With wsUrls
        urls = Application.Transpose(.Range("A2:A" & .Cells(.rows.Count, "A").End(xlUp).Row).Value)
    End With
    Dim results(), r As Long
    ReDim results(1 To UBound(urls), 1 To 2)

    With ie
        .Visible = True

        For r = LBound(urls) To UBound(urls)
            .Navigate2 urls(r)

            While .Busy Or .readyState < 4: DoEvents: Wend

            With .document

                Dim email As String, website As String, iconCssSelector As String
                'iconCssSelector for website icon in this instance
                iconCssSelector = "[src='https://static.xx.fbcdn.net/rsrc.php/v3/yV/r/EaDvTjOwxIV.png']"

                If ElementIsPresent(ie.document, "[href^=mailto]") Then
                    email = ie.document.querySelector("[href^=mailto]").innerText
                Else
                    email = "Not found"
                End If

                Dim parents As Object, sharedParentCssSelector As String, childOfSiblingCssSelector As String
                sharedParentCssSelector = "._5aj7" 'target parent of both icon and the website link
                childOfSiblingCssSelector = "._50f4" '< to target website address after finding right parent

                If ElementIsPresent(ie.document, iconCssSelector) _
        And ElementIsPresent(ie.document, sharedParentCssSelector) Then

                    Set parents = ie.document.querySelectorAll(sharedParentCssSelector) 'css selector used to allow for greater flexibility in element matching
                    website = GetText(ie.document, parents, iconCssSelector, childOfSiblingCssSelector)
                Else
                    website = "Not found"
                End If
            End With
            'Assumes headers already present
            Dim nextRow As Long
            results(r, 1) = email
            results(r, 2) = website
        Next
        .Quit
    End With
    nextRow = GetLastRow(ws, 1) + 1
    ws.Cells(nextRow, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

Public Function ElementIsPresent(ByVal document As HTMLDocument, ByVal cssSelector As String) As Boolean
    ElementIsPresent = document.querySelectorAll(cssSelector).length > 0
End Function

Public Function GetText(ByVal document As HTMLDocument, ByVal parents As Object, ByVal iconCssSelector As String, ByVal childOfSiblingCssSelector As String) As String
    'in this instance and with microsoft IE DOM you cannot select for parent of an element with pseudo class _
    of :has(>child); nor use :contains... instead pass expected parent selector, that houses _
    both the icon element for website and the website address itself, and loop all matches checking for website icon _
    if found use childOfSiblingCssSelector to extract
    Dim i As Long, html As HTMLDocument
    Set html = New HTMLDocument

    For i = 0 To parents.length - 1
        html.body.innerHTML = parents.item(i).innerHTML
        If ElementIsPresent(html, iconCssSelector) Then
            GetText = html.querySelector(childOfSiblingCssSelector).innerText
            Exit Function
        End If
    Next
    GetText = "Not found"
End Function

Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.rows.Count, columnNumber).End(xlUp).Row
    End With
End Function
 
Last edited:

Some videos you may like

Excel Facts

Copy PDF to Excel
Select data in PDF. Paste to Microsoft Word. Copy from Word and paste to Excel.

Watch MrExcel Video

Forum statistics

Threads
1,102,292
Messages
5,485,964
Members
407,523
Latest member
Talicius

This Week's Hot Topics

Top