Scrape what after STRING tag

YasserKhalil

Well-known Member
Joined
Jun 24, 2010
Messages
852
Hello everyone
I am trying to get the text after each STRONG tag but I can't figure it out
Here's the code so far
Code:
Sub Final()
    Dim xmlP As New MSXML2.XMLHTTP60
    Dim htmlSubyDoc As New MSHTML.HTMLDocument
    Dim htmlSubyResults As MSHTML.IHTMLElementCollection
    Dim htmlSubyResult As MSHTML.IHTMLElement
    Dim htmlStrongs As MSHTML.IHTMLElementCollection
    Dim htmlStrong As MSHTML.IHTMLElement
    
    Dim str As String
    Dim r As Long
    Dim i As Long
    Dim counter As Long


    Application.ScreenUpdating = False




    xmlP.Open "GET", "http://plants.newgarden.com/12190005/Plant/3394/Deodar_Cedar", False
    xmlP.send


    If xmlP.Status <> 200 Then
        MsgBox "Problem" & vbNewLine & xmlP.Status & " - " & xmlP.statusText
        Exit Sub
    End If


    htmlSubyDoc.body.innerHTML = xmlP.responseText


    Set htmlSubyResults = htmlSubyDoc.getElementsByClassName("pdpBox")


    For Each htmlSubyResult In htmlSubyResults


        'Debug.Print htmlSubyResult.innerHTML
        Set htmlStrongs = htmlSubyResult.getElementsByTagName("STRONG")
        For Each htmlStrong In htmlStrongs
            Debug.Print htmlStrong.innerHTML
            
        Next htmlStrong
    Next htmlSubyResult


    Application.ScreenUpdating = True
End Sub
 

Excel Facts

Test for Multiple Conditions in IF?
Use AND(test, test, test, test) or OR(test, test, test, ...) as the logical_test argument of IF.
I tried this class "CCPageText" but I got undesired results within my desired results ...


Have you tried the code that I posted in message #10 ? You need to set reference to Microsoft Internet Controls and Microsoft Internet object library before executing the code.
 
Upvote 0
Have you tried the code that I posted in message #10 ? You need to set reference to Microsoft Internet Controls and Microsoft Internet object library before executing the code.

That's great Ombir .. I just tested it. sorry for not testing it earlier .. but is there a way to do the task without IE as I have hundreds of similar links and using ie is slower ... Do you agree with me? but as for results are very excellent
Thanks a lot for great help
 
Upvote 0
is there a way to do the task without IE as I have hundreds of similar links and using ie is slower
You should be able to replace the IE lines with your XMLhttp code.

Here's my solution to your request:
Code:
Public Sub Test()
    
    Dim httpReq As MSXML2.XMLHTTP60
    Dim HTMLdoc As HTMLDocument
    Dim pElements As IHTMLElementCollection
    Dim pElement As HTMLParaElement
    Dim parasText As String
    Dim URL As String
    
    URL = "http://plants.newgarden.com/12190005/Plant/3394/Deodar_Cedar"
    
    Set httpReq = New MSXML2.XMLHTTP60
    With httpReq
        .Open "GET", URL, False
        .send
        If .Status <> 200 Then
            MsgBox URL & " http request error" & vbNewLine & .Status & " - " & .statusText
            Exit Sub
        End If
        Set HTMLdoc = New MSHTML.HTMLDocument
        HTMLdoc.body.innerHTML = .responseText
    End With
    
    Set pElements = HTMLdoc.getElementsByTagName("P")
            
    Set pElement = getPElement(pElements, "Height:")
    Debug.Print pElement.ChildNodes(1).NodeValue
    
    Set pElement = getPElement(pElements, "Spread:")
    Debug.Print pElement.ChildNodes(1).NodeValue
    
    Set pElement = getPElement(pElements, "Sunlight:")
    Debug.Print pElement.Children(1).Title
    
    Set pElement = getPElement(pElements, "Hardiness Zone:")
    Debug.Print pElement.ChildNodes(1).NodeValue
    
    Set pElement = getPElement(pElements, "Other Names:")
    Debug.Print pElement.ChildNodes(1).NodeValue
    
    Set pElement = getPElement(pElements, "Description:")
    parasText = getNextParasText(pElement)
    Debug.Print parasText
    
    Set pElement = getPElement(pElements, "Ornamental Features:")
    parasText = getNextParasText(pElement)
    Debug.Print parasText
    
    Set pElement = getPElement(pElements, "Landscape Attributes:")
    parasText = getNextParasText(pElement)
    Debug.Print parasText
    
    Set pElement = getPElement(pElements, "Plant Characteristics:")
    parasText = getNextParasText(pElement)
    Debug.Print parasText

End Sub


Private Function getPElement(pElements As IHTMLElementCollection, findText As String)
    
    Dim i As Long
    Dim pElement As HTMLParaElement
    
    Set getPElement = Nothing
    i = 0
    While i < pElements.Length And getPElement Is Nothing
        If InStr(pElements(i).innerText, findText) Then Set getPElement = pElements(i)
        i = i + 1
    Wend
    
End Function


Private Function getNextParasText(pElement As HTMLParaElement) As String

    Dim exitLoop As Boolean
    
    exitLoop = False
    getNextParasText = ""
    Do
        Set pElement = pElement.NextSibling
        If Not pElement Is Nothing Then
            If pElement.className = "CCPageText" Then
                If pElement.FirstChild.nodeName <> "STRONG" Then
                    getNextParasText = getNextParasText & Replace(pElement.innerText, vbCrLf, ", ") & " "
                Else
                    exitLoop = True
                End If
            End If
        Else
            exitLoop = True
        End If
    Loop Until exitLoop

End Function
 
Upvote 0
That's really fascinating and wonderful
Thank you very very much for that special code and thanks a lot for all who shares us the issue
Happy new year
 
Upvote 0
but is there a way to do the task without IE as I have hundreds of similar links and using ie is slower ... Do you agree with me?

Here is the solution using Xml approach and it is very fast.


Sub ByeBye2016()
Dim indx1 As Long
Dim indx2 As Long
Dim htmltext As String
Dim output As Variant
Dim reptags As Variant
Dim details As Variant
Dim req As MSXML2.XMLHTTP60
Dim doc As HTMLDocument
Dim plant As HTMLDivElement

reptags = Array(" class=CCPageText", "<STRONG>", "</STRONG>", "</P>", _
"<!-- Plant Descriptions -->", "<P", "<LI>", "</LI>", "<UL", "</UL>", ">")

details = Array("Other Names:", "Description:", "Ornamental Features:", "Landscape Attributes:", "Plant Characteristics:")

Set req = New MSXML2.XMLHTTP60
With req
.Open "GET", "http://plants.newgarden.com/12190005/Plant/3394/Deodar_Cedar", False
.send
If .Status <> 200 Then
MsgBox "Http Request Error"
Exit Sub
End If
Set doc = New MSHTML.HTMLDocument
doc.body.innerHTML = .responseText
End With

ReDim output(1 To 11)

Set plant = doc.getElementsByClassName("pdpBox")(0)
htmltext = plant.innerHTML

htmltext = Trim(htmltext)

For Each ele In reptags
htmltext = Replace(htmltext, ele, "")
Next

output(1) = doc.getElementsByClassName("pdpPlantName")(0).getElementsByTagName("p")(0).innerText
output(2) = doc.getElementsByClassName("pdpPlantName")(0).getElementsByTagName("p")(1).innerText
output(3) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(0).innerText, ":")(1)
output(4) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(1).innerText, ":")(1)
output(5) = doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("img")(0).Title
output(6) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(3).innerText, ":")(1)

For i = 0 To UBound(details)
indx1 = InStr(htmltext, details(i))
If i = UBound(details) Then
indx2 = 19999
Else
indx2 = InStr(htmltext, details(i + 1))
End If
output(i + 1 + 6) = Split(Mid(htmltext, indx1, indx2 - indx1), ":")(1)
Next

Range("A1:K1") = Array("Name", "Sci Name", "Height", "Spread", "Sunlight", "Hardiness Zone", "Other Names", _
"Description", "Ornamental Features", "Landscape Attributes", "Plant Characteristics")
Range("A2").Resize(, UBound(output)) = output
End Sub


<ul", "<="" ul=""></ul",></p",>
 
Upvote 0
That's incredible Ombir. Really incredible and very fast
Can you tell me the secret behind the speed if you don't mind?
You are a LEGEND
 
Upvote 0
Last edited:
Upvote 0
Modified it as per your new requirement.

MSXML method writes the html source of webpage to a local variable and all manipulation is done on local variable. Therefore it is faster than IE.

Sub ByeBye2016()
Dim i As Long
Dim indx1 As Long
Dim indx2 As Long
Dim htmltext As String
Dim output As Variant
Dim reptags As Variant
Dim details As Variant
Dim req As MSXML2.XMLHTTP60
Dim doc As HTMLDocument
Dim plant As HTMLDivElement

reptags = Array(" class=CCPageText", "<STRONG>", "</STRONG>", "</P>", _
"<!-- Plant Descriptions -->", "<P", "<LI>", "</LI>", "<UL", "</UL>", "<B", "</B", ">")

details = Array("Other Names:", "Description:", "Ornamental Features:", "Landscape Attributes:", "Plant Characteristics:")

Set req = New MSXML2.XMLHTTP60
With req
.Open "GET", "http://plants.newgarden.com/12190005/Plant/1663/Moonfire_Japanese_Maple", False
.send
If .Status <> 200 Then
MsgBox "Http Request Error"
Exit Sub
End If
Set doc = New MSHTML.HTMLDocument
doc.body.innerHTML = .responseText
End With

ReDim output(1 To 11)

Set plant = doc.getElementsByClassName("pdpBox")(0)
htmltext = plant.innerHTML

htmltext = Trim(htmltext)

For Each ele In reptags
htmltext = Replace(htmltext, ele, "")
Next

output(1) = doc.getElementsByClassName("pdpPlantName")(0).getElementsByTagName("p")(0).innerText
output(2) = doc.getElementsByClassName("pdpPlantName")(0).getElementsByTagName("p")(1).innerText
output(3) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(0).innerText, ":")(1)
output(4) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(1).innerText, ":")(1)
output(5) = doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("img")(0).Title
output(6) = Split(doc.getElementsByClassName("pdpQuickFactsBox")(0).getElementsByTagName("p")(3).innerText, ":")(1)

For i = 0 To UBound(details)
indx1 = InStr(htmltext, details(i))
If i = UBound(details) Then
indx2 = 19999
Else
indx2 = InStr(htmltext, details(i + 1))
End If
If indx1 <> 0 And indx2 <> 0 Then
output(i + 1 + 6) = Split(Mid(htmltext, indx1, indx2 - indx1), ":")(1)
End If
Next

Range("A1:K1") = Array("Name", "Sci Name", "Height", "Spread", "Sunlight", "Hardiness Zone", "Other Names", _
"Description", "Ornamental Features", "Landscape Attributes", "Plant Characteristics")
Range("A2").Resize(, UBound(output)) = output
End Sub<ul", "<="" ul=""><b", "<="" b",="" "="">
<ul", "<="" ul=""><b", "<="" b",="" "=""></b",></ul",></p",></b",></ul",>
 
Last edited:
Upvote 0

Forum statistics

Threads
1,214,982
Messages
6,122,580
Members
449,089
Latest member
Motoracer88

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top