Search string in PDF return page number

masterc

New Member
Joined
Feb 12, 2019
Messages
5


Dear all,





Please excuse me as I am new to this forum however here ismy question.




I have a excel file with strings in column A.

A button on that sheet that is linked to the code below.

What it should do is find each string in a pdf file I selectafter using the button.



I don’t see the pdf file when the ribbon open file menu opens.

I am using excel 2016 and adobe reader dc.

I am no expert in vba,found this code

Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()
Sheets("Sheet1").Range("C:D").ClearContents
strFileName = selectFile()
Folder = GetFolder()
'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
If objPDDoc.Open(strFileName) Then
Set objjso = objPDDoc.GetJSObject
ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long
For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
For c = 0 To objjso.GetPageNumWords(Page - 1)
PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
Next c
For i = 1 To Len(PDFCharacters)
Select Case Asc(Mid(PDFCharacters, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
Case Else
PDFCharacters2 = PDFCharacters2 & ""
End Select
Next
PDFCharacterCount(Page) = Len(PDFCharacters2)
Next Page
lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
strResult = ""
strSource = Sheets("Sheet2").Cells(Cell, 1).Text
PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
For i = 1 To Len(strSource)
Select Case Asc(Mid(strSource, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
strResult = strResult & (Mid(strSource, i, 1))
Case Else
strResult = strResult & ""
End Select
Next
CharacterCount = CharacterCount + Len(strResult)
If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If
Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
For PasteDataPage = 1 To objPDDoc.GetNumPages
If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
If FileExist(Folder & "" & LookUpCell.Offset(0, 1) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
End If
End If
Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
If Check(1, PasteDataPage) <> 1 Then
Sheets("Sheet1").Cells(x, 3) = PasteDataPage
Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
x = x + 1
End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
If fd.SelectedItems(1) <> vbNullString Then
fileName = fd.SelectedItems(1)
End If
Else
'Exit code if no file is selected
End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
Dim fldr As FileDialog
Dim sItem As String
Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
With fldr
.Title = "Select the Folder where you want you new PDFs to go"
.AllowMultiSelect = False
.InitialFileName = Application.DefaultFilePath
If .Show <> -1 Then GoTo NextCode
sItem = .SelectedItems(1)
End With
NextCode:
GetFolder = sItem
Set fldr = Nothing
End Function

 

Excel Facts

Enter current date or time
Ctrl+: enters current time. Ctrl+; enters current date. Use Ctrl+: Ctrl+; Enter for current date & time.
I got it to work with following code however I need help to change it to select a pdf manually and input a list of strings.
Please also help me with to return page numbers when found that would be appreciated

Code:
Sub AcrobatFindText2() 'variables
Dim Resp 'For message box responses
Dim gPDFPath As String
Dim sText As String 'String to search for
Dim sStr As String 'Message string
Dim foundText As Long 'Holds return value from "FindText" method
Dim caseSensitive As Long
caseSensitive = 1000
Dim WholeWords As Long
WholeWords = 1000
Dim bReset As Long
bReset = 1000
   
'Dim ln0 As Long
'Dim ln1 As Long
'ln0 = 0
'ln1 = 1
  
'hard coding for a PDF to open, it can be changed when needed.
gPDFPath = "C:\Users\320053373\Desktop\Search\DFU\421335439514_114.pdf"
'Initialize Acrobat by creating App object
Set gApp = CreateObject("AcroExch.App", "")
gApp.Hide
'Set AVDoc object
Set gAvDoc = CreateObject("AcroExch.AVDoc")
' open the PDF
If gAvDoc.Open(gPDFPath, "") Then
sText = "environment"
  

'Function FindText(szText As String, bCaseSensitive As Long, bWholeWordsOnly As Long, bReset As Long) As Boolean
foundText = gAvDoc.FindText(sText, caseSensitive, WholeWords, bReset)  'Returns -1 if found, 0 otherwise

  
Debug.Print foundText
Else ' if failed, show error message
Resp = MsgBox("Cannot open" & gPDFPath, vbOKOnly)
End If
If foundText = -1 Then
'compose a message
sStr = "Found: " & sText
Resp = MsgBox(sStr, vbOKOnly)
Else ' if failed, 'show error message
Resp = MsgBox("Cannot find: " & sText, vbOKOnly)
End If
'Visa dokument
gApp.Show
gAvDoc.BringToFront
  
End Sub
 
Upvote 0
The FindText function searches the whole file and doesn't return the page number where the string was found. Therefore a different method is needed.

This code prompts for the PDF file and searches all the strings in column A starting at A2 (one string per cell) and puts the page number(s) where found in the adjacent cell.

Code:
Public Sub Search_Strings_in_PDF_Pages()

    Dim AcroApp As CAcroApp
    Dim AcroPDDoc As CAcroPDDoc
    Dim AcroHiliteList As CAcroHiliteList
    Dim TextSelect As CAcroPDTextSelect
    Dim Page As CAcroPDPage
    Dim PageContent As CAcroHiliteList
    Dim PDFfile As String
    Dim searchStringCells As Range, searchString As Range
    Dim p As Long, i As Long
    Dim PageText As String
    Dim foundPageNumbers As String
    
    'Search selected cells
    'Set searchStringCells = Selection
    
    'Or column A cells
    With ActiveSheet
        Set searchStringCells = .Range("A2", .Cells(.Rows.Count, "A").End(xlUp))
    End With
    
    PDFfile = Application.GetOpenFilename("PDF Files (*.pdf), *.pdf", Title:="Select the PDF file to search")
    If PDFfile = "False" Then Exit Sub
        
    Set AcroApp = CreateObject("AcroExch.App")
    Set AcroPDDoc = CreateObject("AcroExch.PDDoc")
    
    'Open the PDF
    
    If Not AcroPDDoc.Open(PDFfile) Then
        MsgBox PDFfile & " couldn't be opened - macro exiting"
        Exit Sub
    End If
    
    searchStringCells.Offset(, 1).Cells.Clear
    
    'Read each page
    
    For p = 0 To AcroPDDoc.GetNumPages - 1
    
        Set Page = AcroPDDoc.AcquirePage(p)
        Set PageContent = CreateObject("AcroExch.HiliteList")
    
        'Get up to 9,999 words from this page
        
        If PageContent.Add(0, 9999) Then
           
            Set TextSelect = Page.CreatePageHilite(PageContent)
       
            If Not TextSelect Is Nothing Then
            
                'Extract all page content into a text string
                
                PageText = ""
                For i = 0 To TextSelect.GetNumText - 1
                    PageText = PageText & TextSelect.GetText(i)
                Next
                
                'Search for each cell string in this page
                
                For Each searchString In searchStringCells
                    foundPageNumbers = searchString.Offset(, 1).Value
                    If InStr(1, PageText, searchString.Value, vbTextCompare) > 0 Then
                        If foundPageNumbers = "" Then
                            foundPageNumbers = p + 1
                        Else
                            foundPageNumbers = foundPageNumbers & ", " & p + 1
                        End If
                        searchString.Offset(, 1).Value = foundPageNumbers
                    End If
                Next
                
            End If
            
        Else
    
            MsgBox "PageContent.Add error on page " & p + 1 & " - page not searched"
   
        End If
    
    Next
    
    AcroPDDoc.Close
    AcroApp.Exit

    MsgBox "Finished"
    
End Sub
 
Upvote 0
I get a compile error user-defined type not defined.
HQhdiDS


HQhdiDS
 
Upvote 0
Sorry, set a reference (via Tools > References in the VBA editor) to Adobe Acrobat XX.0 Type Library, where XX.0 is your version of Acrobat.
 
Upvote 0
Excellent! Glad it works for you :)

John,

Thanks for the super useful code. I'm unfortunately gettnig the same "compile error user-defined type not defined" error. I have tried to set the reference but only have "Acrobat Access 3.0 Type Library" "Adobe Acrobat 7.0 Browser Conrol Type Libary 1.0" and "Adobe Reader File Preview Type Library" listed as available references and none seem to do the job.

Any ideas on how I would go about solving this (apologies I have an extremely basic understanding of the area). Excel 2016 and Adobe Acrobat Reader DC. Thanks!
 
Upvote 0
@John_w - This code is really helpful for a use case I am exploring. One issue I've noticed is that the code returns strings that contain the contents of the cell rather than exactly match it (i.e. if A1 contains "compl" it will return the pages on which "completed" and "completing" exist. Would you be able to help me narrow it down so that it only returns exact matches? Thank you so much again.
 
Upvote 0

Forum statistics

Threads
1,216,101
Messages
6,128,842
Members
449,471
Latest member
lachbee

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top