Search code examples
stringdelphireplacedelphi-run-time-library

Remove non-numeric characters from string in Delphi


I have these three functions that successfully remove all non-numeric characters from a given string:

The first function loops through the characters of the input string, and if the current character is a number, it adds it to a new string that is returned as the result of the function.

  function RemoveNonNumericChars(const s: string): string;
  begin
    Result := '';
    for var i := 1 to Length(s) do
    begin
      if s[i] in ['0'..'9'] then
        Result := Result + s[i];
    end;
  end;

The second function loops through the characters of the input string from right to left, and if the current character is not a number, it uses the Delete function to remove it from the string

  function RemoveNonNumericChars(const s: string): string;
  begin
    Result := s;
    for var i := Length(Result) downto 1 do
    begin
      if not(Result[i] in ['0'..'9']) then
        Delete(Result, i, 1);
    end;
  end;

The third function uses a regular expression to replace all non-numeric characters with nothing, thus removing them. TRegEx is from the System.RegularExpressions unit.

  function RemoveNonNumericChars(const s: string): string;
  begin
    var RegEx := TRegEx.Create('[^0-9]');
    Result := RegEx.Replace(s, '');
  end;

All three of them do what I need, but I want to know if there is maybe a built-in function in Delphi for this... Or maybe even a better way to do it than the way I'm doing it. What's the best and/or fastest way to remove non-numeric characters from a string in Delphi?


Solution

  • Both your approaches are slow because you constantly change the length of the string. Also, they only recognise Arabic digits.

    To solve the performance issue, preallocate the maximum result length:

    function RemoveNonDigits(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var LActualLength := 0;
      for var i := 1 to S.Length do
        if CharInSet(S[i],  ['0'..'9']) then
        begin
          Inc(LActualLength);
          Result[LActualLength] := S[i];
        end;
      SetLength(Result, LActualLength);
    end;
    

    To support non-Arabic digits, use the TCharacter.IsDigit function:

    function RemoveNonDigits(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var LActualLength := 0;
      for var i := 1 to S.Length do
        if S[i].IsDigit then
        begin
          Inc(LActualLength);
          Result[LActualLength] := S[i];
        end;
      SetLength(Result, LActualLength);
    end;
    

    To optimise even further, as suggested by Stefan Glienke, you can bypass the RTL's string handling machinery and write each character directly with some loss of code readability:

    function RemoveNonDigits(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var ResChr := PChar(Result);
      var LActualLength := 0;
      for var i := 1 to S.Length do
        if CharInSet(S[i],  ['0'..'9']) then
        begin
          Inc(LActualLength);
          ResChr^ := S[i];
          Inc(ResChr);
        end;
      SetLength(Result, LActualLength);
    end;
    

    Benchmark

    Just for fun I did a very primitive benchmark on random input strings with length < 100 and about 24% chance of a char being a digit:

    program Benchmark;
    
    {$APPTYPE CONSOLE}
    
    {$R *.res}
    
    uses
      System.SysUtils, System.RegularExpressions, Windows;
    
    function OP1(const s: string): string;
    begin
      Result := '';
      for var i := 1 to Length(s) do
      begin
        if s[i] in ['0'..'9'] then
          Result := Result + s[i];
      end;
    end;
    
    function OP2(const s: string): string;
    begin
      Result := s;
      for var i := Length(Result) downto 1 do
      begin
        if not(Result[i] in ['0'..'9']) then
          Delete(Result, i, 1);
      end;
    end;
    
    function OP3(const s: string): string;
    begin
      var RegEx := TRegEx.Create('[^0-9]');
      Result := RegEx.Replace(s, '');
    end;
    
    function AR1(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var LActualLength := 0;
      for var i := 1 to S.Length do
        if CharInSet(S[i],  ['0'..'9']) then
        begin
          Inc(LActualLength);
          Result[LActualLength] := S[i];
        end;
      SetLength(Result, LActualLength);
    end;
    
    function AR2(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var ResChr := PChar(Result);
      var LActualLength := 0;
      for var i := 1 to S.Length do
        if CharInSet(S[i],  ['0'..'9']) then
        begin
          Inc(LActualLength);
          ResChr^ := S[i];
          Inc(ResChr);
        end;
      SetLength(Result, LActualLength);
    end;
    
    function AR3(const S: string): string;
    begin
      SetLength(Result, S.Length);
      var ResChr := PChar(Result);
      for var i := 1 to S.Length do
        if CharInSet(S[i],  ['0'..'9']) then
        begin
          ResChr^ := S[i];
          Inc(ResChr);
        end;
      SetLength(Result, ResChr - PChar(Result));
    end;
    
    function RandomInputString: string;
    begin
      SetLength(Result, Random(100));
      for var i := 1 to Result.Length do
        Result[i] := Chr(Ord('0') + Random(42));
    end;
    
    begin
    
      Randomize;
    
      const N = 1000000;
    
      var Data := TArray<string>(nil);
      SetLength(Data, N);
      for var i := 0 to N - 1 do
        Data[i] := RandomInputString;
    
      var f, c0, cOP1, cOP2, cOP3, cAR1, cAR2, cAR3: Int64;
    
      QueryPerformanceFrequency(f);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        OP1(Data[i]);
      QueryPerformanceCounter(cOP1);
      Dec(cOP1, c0);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        OP2(Data[i]);
      QueryPerformanceCounter(cOP2);
      Dec(cOP2, c0);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        OP3(Data[i]);
      QueryPerformanceCounter(cOP3);
      Dec(cOP3, c0);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        AR1(Data[i]);
      QueryPerformanceCounter(cAR1);
      Dec(cAR1, c0);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        AR2(Data[i]);
      QueryPerformanceCounter(cAR2);
      Dec(cAR2, c0);
    
      QueryPerformanceCounter(c0);
      for var i := 0 to High(Data) do
        AR3(Data[i]);
      QueryPerformanceCounter(cAR3);
      Dec(cAR3, c0);
    
      Writeln('Computations per second:');
      Writeln('OP1: ', Round(N / (cOP1 / f)));
      Writeln('OP2: ', Round(N / (cOP2 / f)));
      Writeln('OP3: ', Round(N / (cOP3 / f)));
      Writeln('AR1: ', Round(N / (cAR1 / f)));
      Writeln('AR2: ', Round(N / (cAR2 / f)));
      Writeln('AR3: ', Round(N / (cAR3 / f)));
    
      Readln;
    
    end.
    

    Result:

    Computations per second:
    OP1: 1398134
    OP2: 875116
    OP3: 39162
    AR1: 3406172
    AR2: 4063260
    AR3: 4032343
    

    Bar chart with the results.

    As you can see, in this test at least, regular expressions are by far the slowest approach. And preallocating makes a major difference, while avoiding the _UniqueStringU issue appears to make only a relatively minor improvement.

    But even with the very slow RegEx approach, you can do 40 000 calls per second. On my 13-year-old computer.