just extract the alphanumeric characters

vacation

Board Regular
Joined
Dec 6, 2003
Messages
56
Hi,

In column C, there are over 30,000 items of data that look like this:

...
R#CS2719==
R#CS2653=
R#CS2720=
R#CS2654=
R#CS2770===
BMHH235.5===#
BMFA63023.1=%
BMFA63023===)
BMHP286.1===$
BMHP286=====!
BMRS192A1===~
BMFD280.1===#
BMFD281.1===#
BMHD196======
BMRS283=====(
BMH34641====~
...

They contain all kinds of non-alphanumeric characters.
Note: All possible non-alphanumeric characters are not shown above.
The above is just a small sample.

I would like column E to contain data from the corresponding cells in
column C but exclude all the non-alphanumeric characters.
So the above column C data would appear in column E as:

...
RCS2719
RCS2653
RCS2720
RCS2654
RCS2770
BMHH2355
BMFA630231
BMFA63023
BMHP2861
BMHP286
BMRS192A1
BMFD2801
BMFD2811
BMHD196
BMRS283
BMH34641
...


What combination of nested functions should I assign to each cell in column E to achieve this?


Thanks.
 
Thank you, DreamBoat.

Dave,

Since I ultimately need to use my entire string, I adapted your excellent examples (you saved me hours of work) as below. Basically, I just extract the strings into an array an then replace them when I am done with my other processing.

Comparing your earlier example with the most recent one, why did you change the patterns in this manner?

1a. First example Dec 9, 2003: "([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})"

b. Yesterday example: "(\$?)([A-Z]{1,2})(\$?)(\d{1,5})" - I understood the first example but not the second one, or did you change it because of the embedded string part of my posts over the weekend?

2. Yesterday example: .Pattern = """.+?""" - I had gotten to """.*""" just working on it myself, but of course that matched everything from the first double quote to the last one - I believe ? matches 0 or 1 instance, but what does the + do?

I appreciate you bearing with my questions. I can follow what you are doing most of the time. This has been a fantastic learning experience, not only for my current project but also for some other things looming on the horizon for me. Many, Many Thanks!

Code:
Option Explicit

Private mMatch As Variant
Private mMatches As Variant
Private mSubmatch As Variant
Private mRegExpr As Variant

Private Const mCELL_ADDRESS_PATTERN = "([$]{0,1})([A-Z]{1,COL_LEN})([$]{0,1})(\d{1,NUM_LEN})"
Private Const mTOKEN_STRING = "T_O_K_E_N_"

Public Sub BuildFormula(Optional CurrentSheet As Variant)
Dim rng As Range, cell As Range
Dim lngTemp As Long
Dim varTemp As Variant, varArray As Variant
Dim strFormula As String, strPattern As String
    On Error Resume Next
    'Default to active sheet
    If IsMissing(CurrentSheet) Then
        Set CurrentSheet = ActiveSheet
    'Allow sheet object or sheet name or sheet index
    ElseIf Not IsObject(CurrentSheet) Then
        Set CurrentSheet = Sheets(CurrentSheet)
    End If
    'Look at formulas only, error check prevents error if no formulas in the sheet
    Set rng = CurrentSheet.Cells.SpecialCells(xlFormulas)
    On Error GoTo 0
    'Exit if there are no formulas
    If IsNothing(rng) Then
        Exit Sub
    End If
    'Set application properties
    With Application
        .ScreenUpdating = False
        .Calculation = xlCalculationManual
    End With
    'Create module level regular expression objects
    Call CreateRegExpr
    strPattern = GetCellAddressPattern
    For Each cell In rng.Cells
        varArray = varTemp
        'Tokenize any valid cell addresses which may be embedded within string
        strFormula = TokenizeFormula(cell.Formula, varArray)
        'Do my work here
        Debug.Print
        Debug.Print "Before: " & strFormula
        strFormula = ReplaceRef(strFormula, strPattern, "Abs", 2)
        Debug.Print "After:  " & strFormula
        Call ListMatches(strFormula, strPattern)
        'If embedded strings processed then reverse the tokenization process
        If IsArray(varArray) Then
            cell.Formula = ReverseTokenizeFormula(strFormula, varArray)
        End If
    Next
    With Application
        .ScreenUpdating = True
        .Calculation = xlCalculationAutomatic
    End With
    Set rng = Nothing: Set cell = Nothing
    Set mMatch = Nothing: Set mMatches = Nothing: Set mSubmatch = Nothing: Set mRegExpr = Nothing
End Sub 'BuildFormula

Private Sub CreateRegExpr(Optional PatternText As String, _
                          Optional SearchAll As Boolean, _
                          Optional IgnoreCase As Boolean)
    'Create regular expression object
    Set mRegExpr = CreateObject("vbscript.regexp")
    With mRegExpr
        mRegExpr.Global = SearchAll
        mRegExpr.Pattern = PatternText
        mRegExpr.IgnoreCase = IgnoreCase
    End With
End Sub 'CreateRegExpr

Private Function TokenizeFormula(ByVal FormulaText As String, _
                                 ByRef ArrayText As Variant) As String
Dim lngTemp As Long
Dim strTemp As String
    With mRegExpr
        .Global = True
        .IgnoreCase = True
'glw4        .Pattern = """.*"""
        .Pattern = """.+?"""
        Set mMatches = .Execute(FormulaText)
    End With
    If mMatches.Count Then
        For Each mMatch In mMatches
            If IsArray(ArrayText) Then
                lngTemp = lngTemp + 1
                ReDim Preserve ArrayText(lngTemp)
            Else
                ReDim ArrayText(lngTemp)
            End If
            With mMatch
                ArrayText(lngTemp) = .Value
                strTemp = mTOKEN_STRING & lngTemp
                FormulaText = Replace$(FormulaText, .Value, strTemp)
            End With
        Next mMatch
    End If
    TokenizeFormula = FormulaText
End Function 'TokenizeFormula

'AddressType = Abs $A$1, Rel A1, Row A$1, Col $A1
Private Function ReplaceRef(SearchText As String, _
                            PatternText As String, _
                            AddressType As String, _
                            FormulaRef As Integer)
Dim NewForm As String
    With mRegExpr
        .Global = True
        .Pattern = PatternText
        .IgnoreCase = True
        Set mMatches = .Execute(SearchText)
    End With
    'If not enough matches we are finished
    If mMatches.Count < FormulaRef Then
        ReplaceRef = SearchText
        Exit Function
    End If
    'Submatches are zero based so 1st Submatch = Match(n-1)
    Set mSubmatch = mMatches(FormulaRef - 1).Submatches
    Select Case Application.WorksheetFunction.Proper(AddressType)
        Case "Rel"
            NewForm = mSubmatch(1) & mSubmatch(3)
        Case "Abs"
            NewForm = "$" & mSubmatch(1) & "$" & mSubmatch(3)
        Case "Col"
            NewForm = "$" & mSubmatch(1) & mSubmatch(3)
        Case "Row"
            NewForm = mSubmatch(1) & "$" & mSubmatch(3)
        Case Else
            'conversion input was invalid
            ReplaceRef = SearchText
            Exit Function
    End Select
    ReplaceRef = Application.WorksheetFunction.Replace(SearchText, mMatches(FormulaRef - 1).FirstIndex + 1, Len(mMatches(FormulaRef - 1)), NewForm)
End Function 'ReplaceRef

Private Sub ListMatches(SearchText As String, PatternText As String)
Dim RegEx, Submatch, Match, Matches
Dim NewForm As String
    With mRegExpr
        .Global = True
        .Pattern = PatternText
        .IgnoreCase = True
        Set Matches = .Execute(SearchText)
    End With
    If Matches.Count Then
        Debug.Print
        Debug.Print SearchText, PatternText
        For Each Match In Matches
            With Match
                Debug.Print "Match found at " & .FirstIndex & ", value is " & .Value & ", Length is " & .Length
            End With
        Next Match
    End If
End Sub 'ListMatches

Private Function GetCellAddressPattern() As String
Dim lngTemp As Long
Dim strTemp As String
    strTemp = mCELL_ADDRESS_PATTERN
    'Calculate width of maximum row number
    lngTemp = Len(Trim$(Str$(Cells.Rows.Count)))
    strTemp = Replace$(strTemp, "NUM_LEN", lngTemp)
    'Calculate width of maximum columns
    lngTemp = 2
    'First set includes single values (eg a thru z) + 26 sets of doubles (eg aa thru zz)
    Do While 27 * 26 ^ (lngTemp - 1) < Cells.Columns.Count
        lngTemp = lngTemp + 1
    Loop
    strTemp = Replace$(strTemp, "COL_LEN", lngTemp)
    GetCellAddressPattern = strTemp
End Function 'GetCellAddressPattern

Private Function ReverseTokenizeFormula(ByVal FormulaText As String, _
                                        ByRef ArrayText As Variant) As String
Dim lngTemp As Long
Dim strTemp As String
    For lngTemp = LBound(ArrayText) To UBound(ArrayText)
        strTemp = mTOKEN_STRING & lngTemp
        FormulaText = Replace$(FormulaText, strTemp, ArrayText(lngTemp))
    Next lngTemp
    ReverseTokenizeFormula = FormulaText
End Function 'ReverseTokenizeFormula

Private Function IsNothing(Parm As Variant)
    On Error Resume Next
    IsNothing = (Parm Is Nothing)
    Err.Clear
    On Error GoTo 0
End Function 'IsNothing
 
Upvote 0

Excel Facts

Convert text numbers to real numbers
Select a column containing text numbers. Press Alt+D E F to quickly convert text to numbers. Faster than "Convert to Number"
sbendbuckeye said:
Thank you, DreamBoat.

Dave,

Since I ultimately need to use my entire string, I adapted your excellent examples (you saved me hours of work) as below. Basically, I just extract the strings into an array an then replace them when I am done with my other processing.

Comparing your earlier example with the most recent one, why did you change the patterns in this manner?

1a. First example Dec 9, 2003: "([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})"

b. Yesterday example: "(\$?)([A-Z]{1,2})(\$?)(\d{1,5})" - I understood the first example but not the second one, or did you change it because of the embedded string part of my posts over the weekend?

2. Yesterday example: .Pattern = """.+?""" - I had gotten to """.*""" just working on it myself, but of course that matched everything from the first double quote to the last one - I believe ? matches 0 or 1 instance, but what does the + do?

I appreciate you bearing with my questions. I can follow what you are doing most of the time. This has been a fantastic learning experience, not only for my current project but also for some other things looming on the horizon for me. Many, Many Thanks!

No problem :)

Q1

"([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})"
is identical to
"(\$?)([A-Z]{1,2})(\$?)(\d{1,5})"

but the second pattern is a little tidier

The difference is the optional match for the absolute reference "$" which I changed from
([$]{0,1})
to
(\$?)

The "\$" says find a "$", the "\" portion is necessary to distinguish finding a "$" character as opposed to the special RegExp charcter match for the end of a string. My earlier pattern used "[$]" for the same ends

Then the "?" portion is equivalent to matching the $ zero or once, ie the same as "{0,1}" that I used on Dec 9 2003.

Q2

The """.+?""" is a sneakier pattern which I learnt when experimenting with removing html tags.

If you use a pattern such as
""".+"""
then this says match one or more characters surrounded by quotes. For a a string with multiple matches (as you found with your similar pattern) it returns everything between the first and last quotes.

ie, applying this pattern to

"fred" XX "Jones"
gives
"fred" XX "Jones"

But if you try
""".+?"""
it makes a non greedy match that returns two matches and ignores the XX portion

"fred"
"Jones"

Note that your pattern
""".*""" matches a set of quotes with none or more characters whereas
the pattern I used
""".+""" matches a set of quotes that contain 1 or more characters

Cheers

Dave
 
Upvote 0
Thanks for all of the help Dave! I have really learned a lot from this thread. For now, I am using your first version because it is a little bit more intuitive to me since this is all new.

"([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})"

I have encountered one small problem with named ranges when using the above code. Here is what I am doing:

A. Create a "tokenized" string to remove embedded double quotes, etc. I keep the Match.Value and its "tokenized" equivalent in an array so I can reverse the process at the end.

=A1&"B1&C1"&D1&"E1&F1"&G1 becomes
=A1&T_O_K_N_0&D1&T_O_K_E_N_1

Its pretty ugly but it works

B. The problem I am having is that some of the client's named ranges in formulas cause an erroneous match using the above Patter string (eg ProdSpeed100 matches ED100).
I believe what I need is a Not alphabetic piece first to prevent matches like the named range example since a valid cell address can be preceeded by = ( ) + - * / ^ [ ] ! ', etc.

I am not quite sure how to go about this so if you could point me once again I would be grateful.

C. In addition to the quoted things we discussed earlier, I am also checking for """" before any other strings. Should I also be checking for """ and "".

Thanks for any ideas, suggestions, constructive criticism, etc!
 
Upvote 0
No problem Gary :)

A)

Sounds reasonable

B)

"ProdSpeed100" won't match if you set ignorecase property to False as the RegExp pattern is looking for A-Z

But yes, PRODSPEED100 will give a false match

So, how about
(?:[^A-Z$]|^)([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})

The (?:[^A-Z$]|^)

Says ensure that the character before a valid Column reference (A-ZZ for the RegExp) is either not alphabetic, is not a "$" sign, or is the start of a string. It is necessary to include the "$" criteria as otherwise
A$ED100 is valid.

Alternatively the match could be made on valid characters, ie =()[]+-*^/: etc
(?:[\^\-\+\*\/:,\]\[\(\)])([$]{0,1})([A-Z]{1,2})([$]{0,1})(\d{1,5})

The ?: portion at the front says do not return this submatch for later use so you dont need to change the portion of the code that refers to submatch position

(C)

I'm not sure that I understand your here

The Pattern
.Pattern = """.+?"""
is looking for single quote marks within a string. If you wanted to replace the contents of double quote marks then try
.Pattern = """".+?""""

Cheers

Dave
 
Upvote 0
Thanks, Dave!

I must be getting a little better at this, I had come up with [^A-Za-z_-] as my first attemp before reading your post this morning.

How do you go about debugging a pattern string? As an example, in your all possible list above you escaped multiple characters with the \. But, if I did that list I'm sure I would have forgotten to do that for at least one of them and then the match would just fail. Thanks again for all of your help!
 
Upvote 0
Yours was close, you don't need the a-z if ignorecase is set to False.

I use http://www.electrified.net/dev/RegexEvaluate.aspx to test patterns

I used the "/" character to denote a literal match when I wanted to match the special character. This is called "escaping" in RegExp speak

For example if we use [^-] then this is RegExp for a negative character set that would match a character that wasn't "-" whereas
[/^-]
means match a "^" or "-"

Likewise I needed to use "/]" to match "]" rather than have the pattern think I had closed my range of characters

As the ":" and "," dont have any special meaning then they don't need the "/" preceding them

I did use a couple more "/" than I strictly needed to. :)
maybe this
(?:[\^\/\])\-+*:,[(])(\$?)([A-Z]{1,2})(\$?)(\d{1,5})
to match ^/-+*:,()[] if they are the first characters you want.

Maybe we should kick off a new thread from here, its deviated somewhat from the orginal question :LOL:

Cheers

Dave
 
Upvote 0
Gary,

I updated the pattern to
(?:[\^\])-/+*:,="[(])(\$?)([A-Z]{1,2})(\$?)(\d{1,5})

I also built in a check into the ReplaceRef Function to ensure that the row number was less than 65537

This changes mean that the string match length is changed and the ReplaceRef Function needs updating as below

Cheers

Dave


Code:
Sub RegExp_Early_Execute_5()

Dim Myrange As Range, Cel As Range

    On Error Resume Next
    ' Look at formulas only, the error check prevents an error if there are no formulas in the selection
    ' set the Excel range to parse the ActiveSheet B3:B12
    Set Myrange = Intersect(ActiveSheet.Range("B3:B12"), ActiveSheet.Cells.SpecialCells(xlFormulas))
    On Error GoTo 0

    ' Leave Main sub if there are no formulas
    If Myrange Is Nothing Then Exit Sub
    With Application
        .ScreenUpdating = False
        .Calculation = xlCalculationManual
    End With

    For Each Cel In Myrange.Cells
        ' Four valid arguments
        ' "Rel" = Relative Referenced
        ' "Abs" = Absolute Referenced
        ' "Row" = Absolute Row Referenced
        ' "Col" = Absolute Column Referenced

        'This example converts every second cell reference of each formula in the selection to Absolute
        'ReplaceRef Function = (cell formula, Desired Reference Type, formula ref to be changed)
        'Cel.Formula = ReplaceRef(Cel.Formula, "Abs", 2)

        'Now set to be run from the sheet drop down validation
        Cel.Formula = ReplaceRef(Cel.Formula, Range("B15"), Range("B16"))

    Next
    With Application
        .ScreenUpdating = True
        .Calculation = xlCalculationAutomatic
    End With
End Sub

Function ReplaceRef(Str1 As String, ReplStr As String, FormulaRef As Integer)
'Reference must be set to Microsoft VbScript Regular Expression 5.5
'Dimension the RegExp objects
Dim RegEx As VBScript_RegExp_55.RegExp, RegMatchCollection As VBScript_RegExp_55.MatchCollection
Dim SubMatch   As VBScript_RegExp_55.SubMatches
Dim NewStr     As String
Dim PosMatch As Integer, LenMatch As Integer
Dim EndString As Boolean, LongString As Boolean

    ' create the RegExp Object with early binding
    Set RegEx = New VBScript_RegExp_55.RegExp

    With RegEx
        'Look for all formula references
        .Global = True
        'Match case to avoid matching lower case alphabetic charcaters
        .IgnoreCase = False
        'Look for cell reference
        'Dave's original pattern was (?:[\^\])-/+*:,="[(])(\$?)([A-Z]{1,2})(\$?)(\d{1,5})
        .Pattern = (?:[\^\])-/+*:,="[(])(\$?)([A-Z]{1,2})(\$?)(\d{1,5})

    End With
    Set RegMatchCollection = RegEx.Execute(Str1)
    'If the formula is not long enough then dont look at submatches
    If RegMatchCollection.Count >= FormulaRef Then
        'Parse the appropriate formula reference
        Set SubMatch = RegMatchCollection(FormulaRef - 1).SubMatches
        'If row number exceeds 65536 then the parsed string is not a valid formula
        If SubMatch(3) < 65537 Then

            ' Check to see whether the the first character after the submatch of the row number
            ' is the end of the string or is numeric
            ' If the first character is numeric then the row match is invalid

            PosMatch = RegMatchCollection(FormulaRef - 1).FirstIndex + 2
            LenMatch = RegMatchCollection(FormulaRef - 1).Length - 1

            EndString = Len(Str1) = PosMatch + LenMatch - 1
            If Not (EndString) Then
                LongString = Not (IsNumeric(Mid(Str1, PosMatch + LenMatch, 1)))
            End If

            If EndString Or LongString Then
                ' Change the formula to Relative, Absolute, Column Absolute or Row Absolute
                Select Case Application.WorksheetFunction.Proper(ReplStr)
                    Case "Rel"
                        NewStr = SubMatch(1) & SubMatch(3)
                    Case "Abs"
                        NewStr = "$" & SubMatch(1) & "$" & SubMatch(3)
                    Case "Col"
                        NewStr = "$" & SubMatch(1) & SubMatch(3)
                    Case "Row"
                        NewStr = SubMatch(1) & "$" & SubMatch(3)
                    Case Else
                        'conversion input was invalid so return initial string
                End Select

            End If

        End If
    End If

    If NewStr = "" Then
        ReplaceRef = Str1
    Else
        'Make the change to the formula using the position and length of the match
        ReplaceRef = Application.WorksheetFunction.Replace(Str1, PosMatch, LenMatch, NewStr)
    End If

End Function
 
Upvote 0
I have a similar problem, can you help me to extract the column G values which has text in Alphanumeric form e.g

"Level P2, Final Inspection for electrical and ELV service piror to closing of Gypsum wall in room No. AP2L013. Grid 19-19.5/AD-AE,"

to extract only Room No. only which is always between 6-8 Character Alphanumeric digits.

"AP2L013" (Required Answer)

thanks in advance
 
Upvote 0
Code:
Function RemoveSymbols(cellValue As String) As String
        valueLen = Len(cellValue)
        
        For pos = 1 To valueLen
            cellChar = UCase(Left(cellValue, 1))
            If cellChar = "A" Or cellChar = "B" Or cellChar = "C" Or cellChar = "D" Or cellChar = "E" Or cellChar = "F" Or cellChar = "G" Or _
                cellChar = "H" Or cellChar = "I" Or cellChar = "J" Or cellChar = "K" Or cellChar = "L" Or cellChar = "M" Or cellChar = "N" Or _
                cellChar = "O" Or cellChar = "P" Or cellChar = "Q" Or cellChar = "R" Or cellChar = "S" Or cellChar = "T" Or cellChar = "U" Or _
                cellChar = "V" Or cellChar = "W" Or cellChar = "X" Or cellChar = "Y" Or cellChar = "Z" Or IsNumeric(cellChar) Then
            
                newCellValue = newCellValue & cellChar
            End If
            valueLen = valueLen - 1
            cellValue = Right(cellValue, valueLen)
        Next pos
            RemoveSymbols = newCellValue
            newCellValue = ""
End Function
Tana-Lee

Great formula! I'm trying to modify it so that instead of removing all non alphanumeric characters it replaces them with a dash, like "-".

I'm competent with Excel formulas but VB code is beyond me! My 10 minutes of trying to change the code to make it work has got me nowhere.

Is anyone able to help with the required change?

Thanks in advance.

Liam
 
Upvote 0

Forum statistics

Threads
1,216,099
Messages
6,128,822
Members
449,470
Latest member
Subhash Chand

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top