Search code examples
excelvbanumbersextract

Extracting groups of numbers from a text string and separate into columns


I have found a script that does this task however it seems slow and does not deal with decimal points. I am using a mac and cannot use regular expressions.

Examples of the data set (quite messy)are as follows

$2900 + Slab ($550) + P/Prep ($350) + Xover ($500)
$2600 + Pave Prep ($350) + Slab ($550) + Crossover ($500)
$2900 + $350 P/Prep + $500 Xover
2800+Slab$480+PavePrep$315+ Pave Prep$7.00+XOver$450
$3350(inc prep up to 50m2)+Slab$650+Extra P/prep$224+Xover$450
WM02-3050 XO-450 PPEX-128 SS-650
WM01-2850 XO-450 SS-650
XO-450 PPEX-307.68 SS-650 WM03-3350 DRYLINE-0
XO-450 PPEX-126.08 SS-650 WM01-2850

Don't need the numbers that are identifier eg. WM01, WM03, 50m2 Happy to remove small numbers <10

Would like the output to be

2900 550 350 500
2600 350 550 500
2900 350 500
2800 480 315 7 450
3350 650 224 450
3050 450 128 650
2850 450 650
450 307.68 3350
450 126.08 650 2850

I can then use the text to columns function. Thanks

Function NDigits(ByVal SourceString As String, _
    Optional ByVal NumberOfDigits As Long = 0, _
    Optional ByVal TargetDelimiter As String = " ") As String

Dim i As Long         ' SourceString Character Counter
Dim strDel As String  ' Current Target String

' Check if SourceString is empty (""). Exit if. NDigits = "".
If SourceString = "" Then Exit Function

' Loop through characters of SourceString.
For i = 1 To Len(SourceString)
    ' Check if current character is not a digit (#), then replace with " ".
    If Not Mid(SourceString, i, 1) Like "#" Then _
            Mid(SourceString, i, 1) = " "
Next

' Note: While VBA's Trim function removes spaces before and after a string,
'       Excel's Trim function additionally removes redundant spaces, i.e.
'       doesn't 'allow' more than one space, between words.
' Remove all spaces from SourceString except single spaces between words.
strDel = Application.WorksheetFunction.Trim(SourceString)

' Check if current TargetString is empty (""). Exit if. NDigits = "".
If strDel = "" Then Exit Function

' Replace (Substitute) " " with TargetDelimiter if it is different than
' " " and is not a number (#).
If TargetDelimiter <> " " And Not TargetDelimiter Like "#" Then
    strDel = WorksheetFunction.Substitute(strDel, " ", TargetDelimiter)
End If

' Check if NumberOfDigits is greater than 0.
If NumberOfDigits > 0 Then

    Dim vnt As Variant  ' Number of Digits Array (NOD Array)
    Dim k As Long       ' NOD Array Element Counter

    ' Write (Split) Digit Groups from Current Target String to NOD Array.
    vnt = Split(strDel, TargetDelimiter)
    ' Reset NOD Array Element Counter to -1, because NOD Array is 0-based.
    k = -1
    ' Loop through elements (digit groups) of NOD Array.
    For i = 0 To UBound(vnt)
        ' Check if current element has number of characters (digits)
        ' equal to NumberOfDigits.
        If Len(vnt(i)) = NumberOfDigits Then
           ' Count NOD Array Element i.e. prepare for write.
           k = k + 1
           
           ' Write i-th element of NOD Array to k-th element.
           ' Note: Data (Digit Groups) are possibly being overwritten.
           vnt(k) = vnt(i)
        End If
    Next
    ' Check if no Digit Group of size of NumberOfDigits was found.
    ' Exit if. NDigits = "".
    If k = -1 Then Exit Function
    ' Resize NOD Array to NOD Array Element Count, possibly smaller,
    ' due to fewer found Digit Groups with the size of NumberOfDigits.
    ReDim Preserve vnt(k)
    ' Join elements of NOD Array to Current Target String.
    strDel = Join(vnt, TargetDelimiter)
End If

' Write Current Target String to NDigits.
NDigits = strDel

End Function

Solution

  • Note: The RegExp pattern has been tested against the sample data provided. You might need to fine-tune it to match your specific data set accurately.

    Option Explicit
    Sub GetDigits()
        Dim objRegEx As Object, sRes As String
        Dim objMH As Object, rngData As Range
        Dim arrData, arrRes()
        Dim i As Long, j As Integer
        ' Create RegExp object
        Set objRegEx = CreateObject("vbscript.regexp")
        objRegEx.Pattern = "(^|[\s\+])\D*(\d+\-)*(\d{2,}(\.\d+)*)(?![A-Z])"
        ' Case-insensitive
        objRegEx.IgnoreCase = True
        objRegEx.Global = True
        ' Load source data
        Set rngData = Range("A1", Cells(Rows.Count, "A").End(xlUp))
        arrData = rngData.Value
        ReDim arrRes(1 To UBound(arrData), 1 To 1)
        ' Loop through data
        For i = LBound(arrData) To UBound(arrData)
            sRes = ""
            ' RegExp matching
            Set objMH = objRegEx.Execute(Trim(arrData(i, 1)))
            If objMH.Count > 0 Then
                ' Collect match result
                For j = 0 To objMH.Count - 1
                    sRes = sRes & Chr(32) & objMH(j).submatches(2)
                Next
                arrRes(i, 1) = Mid(sRes, 2)
            End If
        Next
        ' Write output to sheet
        With rngData.Offset(0, 1)
            .ClearContents
            .Value = arrRes
        End With
        Set objMH = Nothing
        Set objRegEx = Nothing
    End Sub
    
    

    enter image description here