Private Sub EmailExtractBut()
'Extract emails only from urls
'Columns for both tables
Const colUrl As Long = 1 'Must always be the first column
Const colMail As Long = 2 'Must always be the first column before SoMe platforms
Const colFacebook As Long = 3
Const colInstagram As Long = 4
Const colTwitter As Long = 5
Const colYouTube As Long = 6
Const colLinkedIn As Long = 7 'Must always be the last column of Some platforms
Const colError As Long = 9 'Must always be the last column
Dim url As String
Dim http As Object
Dim htmlDoc As Object
Dim nodeAllLinks As Object
Dim nodeOneLink As Object
Dim pageLoadSuccessful As Boolean
Dim tableUrlsOneAddressLeft As String
Dim tableAllAddresses As String
Dim currentRowTableUrls As Long
Dim lastRowTableUrls As Long
Dim currentRowsTableAll(colUrl To colLinkedIn) As Long
Dim lastRowTableAll As Long
Dim addressCounters(colMail To colLinkedIn) As Long
Dim checkCounters As Long
Dim myCounter As Long
'Initialize variables
tableUrlsOneAddressLeft = "Sheet9" 'change to sheet9
currentRowTableUrls = 2 'First row for content
tableAllAddresses = "Sheet8" 'chanhe to sheet8
DoEvents
For checkCounters = colUrl To colLinkedIn
currentRowsTableAll(checkCounters) = 2 'First rows for content
DoEvents
Next checkCounters
Set htmlDoc = CreateObject("htmlfile")
On Error Resume Next
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
'Clear all contents and comments in the URL source sheet from email column to error column
DoEvents
With Sheets(tableUrlsOneAddressLeft)
lastRowTableUrls = .Cells(Rows.Count, colUrl).End(xlUp).Row
.Range(.Cells(currentRowTableUrls, colMail), .Cells(lastRowTableUrls, colError)).ClearContents
.Range(.Cells(currentRowTableUrls, colMail), .Cells(lastRowTableUrls, colError)).ClearComments
End With
'Delete all rows except headline in the sheet with all addresses
lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
Sheets(tableAllAddresses).Rows(currentRowsTableAll(colUrl) & ":" & lastRowTableAll).Delete Shift:=xlUp
'add new headings
ThisWorkbook.Worksheets("Sheet8").Range("A1").Value = "Domain Urls"
ThisWorkbook.Worksheets("Sheet8").Range("B1").Value = "Emails Found"
ThisWorkbook.Worksheets("Sheet8").Range("C1").Value = "Facebook Urls "
ThisWorkbook.Worksheets("Sheet8").Range("D1").Value = "Instagram Urls"
ThisWorkbook.Worksheets("Sheet8").Range("E1").Value = "Twitter Urls"
ThisWorkbook.Worksheets("Sheet8").Range("F1").Value = "Youtube Urls"
ThisWorkbook.Worksheets("Sheet8").Range("G1").Value = "LinkedIn Urls"
'ThisWorkbook.Worksheets("Sheet10").Range("j5").Value = "True"
Application.ScreenUpdating = False
'Loop over all URLs in column A in the URL source sheet
'DoEvents
Do While Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Value <> ""
'Scroll for visual monitoring, if
'the sheet with the URLs are the
'active one
If ActiveSheet.Name = tableUrlsOneAddressLeft Then
If currentRowTableUrls > 1 Then 'change this variable
ActiveWindow.SmallScroll down:=1
End If
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Select
End If
'Get next url from the URL source sheet
url = Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colUrl).Value
'Try to load page
'Temporarily disable error handling if
'there is a timeout or onother error
On Error Resume Next
http.Open "GET", url, False
On Error Resume Next
http.send
'Check if page loading was successful
DoEvents
If Err.Number = 0 Then
pageLoadSuccessful = True
End If
On Error GoTo 0
If pageLoadSuccessful Then
'Build html document for DOM operations
On Error Resume Next
htmlDoc.body.innerHTML = http.responseText
'Create node list from all links of the page
On Error Resume Next
Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
'Walk through all links of the node list
For Each nodeOneLink In nodeAllLinks
Application.ScreenUpdating = False
DoEvents
If InStr(1, nodeOneLink.href, "mailto:") Then
'Write mail address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colMail), colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colMail) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment mail counters
currentRowsTableAll(colMail) = currentRowsTableAll(colMail) + 1
addressCounters(colMail) = addressCounters(colMail) + 1
End If
'Check for Facebook address
If InStr(1, UCase(nodeOneLink.href), "FACEBOOK") Then
'Write Facebook address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colFacebook).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colFacebook), colFacebook).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colFacebook) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment Facebook counters
currentRowsTableAll(colFacebook) = currentRowsTableAll(colFacebook) + 1
addressCounters(colFacebook) = addressCounters(colFacebook) + 1
End If
'Check for Instagram address
If InStr(1, UCase(nodeOneLink.href), "INSTAGRAM") Then
'Write INSTAGRAM address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colInstagram).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colInstagram), colInstagram).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colInstagram) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
currentRowsTableAll(colInstagram) = currentRowsTableAll(colInstagram) + 1
addressCounters(colInstagram) = addressCounters(colInstagram) + 1
End If
'Check for Twitter address
If InStr(1, UCase(nodeOneLink.href), "TWITTER") Then
'Write Twitter address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colTwitter).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colTwitter), colTwitter).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colTwitter) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment Twitter counters
currentRowsTableAll(colTwitter) = currentRowsTableAll(colTwitter) + 1
addressCounters(colTwitter) = addressCounters(colTwitter) + 1
End If
'Check for YouTube address
If InStr(1, UCase(nodeOneLink.href), "YOUTUBE") Then
'Write YouTube address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colYouTube).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colYouTube), colYouTube).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colYouTube) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment YouTube counters
currentRowsTableAll(colYouTube) = currentRowsTableAll(colYouTube) + 1
addressCounters(colYouTube) = addressCounters(colYouTube) + 1
End If
'Check for LinkedIn address
If InStr(1, UCase(nodeOneLink.href), "LINKEDIN") Then
'Write LinkedIn address to both tables
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colLinkedIn).Value = nodeOneLink.href
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colLinkedIn), colLinkedIn).Value = nodeOneLink.href
'Check if it is a new line in the sheet with all addresses
If currentRowsTableAll(colLinkedIn) >= currentRowsTableAll(colUrl) Then
'Write URL in the new line of the sheet with all addresses
Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
'Increment url counter
currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
End If
'Increment LinkedIn counters
currentRowsTableAll(colLinkedIn) = currentRowsTableAll(colLinkedIn) + 1
addressCounters(colLinkedIn) = addressCounters(colLinkedIn) + 1
End If
Next nodeOneLink
'Check address counters
For checkCounters = colMail To colLinkedIn
'Set comment if more than 1 link were found
If addressCounters(checkCounters) > 1 Then
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).AddComment Text:=CStr(addressCounters(checkCounters))
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).Comment.Shape.TextFrame.AutoSize = True
End If
Next checkCounters
Else
'Page not loaded
'Write message URL table
Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colError).Value = "Error with URL or timeout"
End If
DoEvents
'Prepare for next page
pageLoadSuccessful = False
Erase addressCounters
lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
For checkCounters = colUrl To colLinkedIn
currentRowsTableAll(checkCounters) = lastRowTableAll + 1 'First rows for next page content
DoEvents
Next checkCounters
currentRowTableUrls = currentRowTableUrls + 1
'''Email + Social email tab controle source
With ExcelWebScraper.EmailSocialListBox1
Dim t As Double
.ColumnCount = 7
.ColumnWidths = "150;100;100;100;100;100;100"
.RowSource = "'" & Sheet8.Name & "'!$A$1:$i$" & Sheet8.Cells(Sheet8.Rows.Count, 1).End(xlUp).Row
t = Timer
Do Until Timer > t + 0.17
Loop
End With
'''loop counter, results in sheet10
myCounter = myCounter + 1
Worksheets("Sheet10").Range("G6").Value = myCounter
DoEvents
Application.ScreenUpdating = True
Loop
'Clean up
Set http = Nothing
Set htmlDoc = Nothing
Set nodeAllLinks = Nothing
Set nodeOneLink = Nothing
'Check if this works now
Complete.Show
Sheet10.Range("G6").Value = ""
'''delete duplicates in sheet8 column B
Dim Cl As Range, Rng As Range
With CreateObject("scripting.dictionary")
For Each Cl In Sheets("Sheet8").Range("B2", Sheets("Sheet8").Range("B" & Rows.Count).End(xlUp))
If Cl <> "" Then
If Not .Exists(Cl.Value) Then
.Add Cl.Value, Nothing
Else
If Rng Is Nothing Then Set Rng = Cl Else Set Rng = Union(Rng, Cl)
End If
End If
Next Cl
End With
If Not Rng Is Nothing Then Rng.EntireRow.Delete
''' remove blank rows from listbox view
LastRow = Sheet8.Cells(Sheet8.Rows.Count, "A").End(xlUp).Row
Sheet10.Range("G21").Value = LastRow - 1
'''Email + Social email tab controle source
With ExcelWebScraper.EmailSocialListBox1
.ColumnCount = 7
.ColumnWidths = "150;100;100;100;100;100;100"
.RowSource = "'" & Sheet8.Name & "'!$A$1:$i$" & Sheet8.Cells(Sheet8.Rows.Count, 1).End(xlUp).Row
End With
End Sub