Page 2 of 2 FirstFirst 12
Results 11 to 11 of 11

Thread: Updating web scraping code
Thanks Thanks: 0 Likes Likes: 0

  1. #11
    Board Regular Sharid's Avatar
    Join Date
    Apr 2007
    Posts
    479
    Post Thanks / Like
    Mentioned
    0 Post(s)
    Tagged
    0 Thread(s)

    Default Re: Need help on updating this code

    Its ok i have got it sorted, just need to add your line on deleting duplicates,

    Code:
    Option Explicit
    'VBE > Tools > References > HTML Object Library
    Public Sub test()
        Dim ie As Object, ws As Worksheet, wsUrls As Worksheet, urls()
        Set ie = CreateObject("InternetExplorer.Application")
        Set ws = ThisWorkbook.Worksheets("Scraper")
        Set wsUrls = ThisWorkbook.Worksheets("Url List")
    
        With wsUrls
            urls = Application.Transpose(.Range("A2:A" & .Cells(.rows.Count, "A").End(xlUp).Row).Value)
        End With
        Dim results(), r As Long
        ReDim results(1 To UBound(urls), 1 To 2)
    
        With ie
            .Visible = True
    
            For r = LBound(urls) To UBound(urls)
                .Navigate2 urls(r)
    
                While .Busy Or .readyState < 4: DoEvents: Wend
    
                With .document
    
                    Dim email As String, website As String, iconCssSelector As String
                    'iconCssSelector for website icon in this instance
                    iconCssSelector = "[src='https://static.xx.fbcdn.net/rsrc.php/v3/yV/r/EaDvTjOwxIV.png']"
    
                    If ElementIsPresent(ie.document, "[href^=mailto]") Then
                        email = ie.document.querySelector("[href^=mailto]").innerText
                    Else
                        email = "Not found"
                    End If
    
                    Dim parents As Object, sharedParentCssSelector As String, childOfSiblingCssSelector As String
                    sharedParentCssSelector = "._5aj7" 'target parent of both icon and the website link
                    childOfSiblingCssSelector = "._50f4" '< to target website address after finding right parent
    
                    If ElementIsPresent(ie.document, iconCssSelector) _
            And ElementIsPresent(ie.document, sharedParentCssSelector) Then
    
                        Set parents = ie.document.querySelectorAll(sharedParentCssSelector) 'css selector used to allow for greater flexibility in element matching
                        website = GetText(ie.document, parents, iconCssSelector, childOfSiblingCssSelector)
                    Else
                        website = "Not found"
                    End If
                End With
                'Assumes headers already present
                Dim nextRow As Long
                results(r, 1) = email
                results(r, 2) = website
            Next
            .Quit
        End With
        nextRow = GetLastRow(ws, 1) + 1
        ws.Cells(nextRow, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End Sub
    
    Public Function ElementIsPresent(ByVal document As HTMLDocument, ByVal cssSelector As String) As Boolean
        ElementIsPresent = document.querySelectorAll(cssSelector).length > 0
    End Function
    
    Public Function GetText(ByVal document As HTMLDocument, ByVal parents As Object, ByVal iconCssSelector As String, ByVal childOfSiblingCssSelector As String) As String
        'in this instance and with microsoft IE DOM you cannot select for parent of an element with pseudo class _
        of :has(>child); nor use :contains... instead pass expected parent selector, that houses _
        both the icon element for website and the website address itself, and loop all matches checking for website icon _
        if found use childOfSiblingCssSelector to extract
        Dim i As Long, html As HTMLDocument
        Set html = New HTMLDocument
    
        For i = 0 To parents.length - 1
            html.body.innerHTML = parents.item(i).innerHTML
            If ElementIsPresent(html, iconCssSelector) Then
                GetText = html.querySelector(childOfSiblingCssSelector).innerText
                Exit Function
            End If
        Next
        GetText = "Not found"
    End Function
    
    Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
        With ws
            GetLastRow = .Cells(.rows.Count, columnNumber).End(xlUp).Row
        End With
    End Function
    Last edited by Sharid; May 26th, 2019 at 12:52 PM.
    If I could code in VB, life wouldn't be such a pain in the A$$

Some videos you may like

User Tag List

Tags for this Thread

Like this thread? Share it with others

Like this thread? Share it with others

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •