Search code examples
c#soundex

Output of Soundex algorithm implementation is wrong for cases - "Tymczak" and "Pfister"


When I tested the algorithm Soundex according to the Wikipedia article on Soundex, I found that Tymczak returned T520, not T522 and Pfister returned P123, not P236.

I have no idea why the output is not correct.

My code:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace ConsoleApplication4
{
   class Program
   {
     static void Main(string[] args)
     {
        string s = "Tymczak";
        string result = SoundexByWord(s);
        Console.WriteLine(result);
    }

    private static string Soundex(string data)
    {
        string first = "pv";
        string second = "xz";
        string third = "dt";
        string forth = "mn";
        string fifth = "bf";
        string sixth = "cgj";
        string seventh = "kqs";
        //ana ast5dtmt string builder 34an 3aml zy al list fy apend 
        // 34an a apend mn al a5r al string 

        StringBuilder result = new StringBuilder();
        if (data != null && data.Length > 0)
        {
            string previousCode = "", currentCode = "", currentLetter = "";

            // append on the string from the last 
            // get the first characheter of the string data 
            // append it on the result
            //according to algorithm first charachter stays the same 
            result.Append(data.Substring(0, 1));
            RemoveUnwantedChar(data);
            for (int i = 1; i < data.Length; i++)
            {
                // nb2d al algorithm first take the second characheter in data 
                //n7wlo la lower  
                currentLetter = data.Substring(i, 1).ToLower();
                currentCode = "";
                // No string for zero because we will remove it 
                if (first.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "1";

                else if (fifth.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "1";

                else if (sixth.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "2";
                else if (seventh.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "2";
                else if (second.IndexOf(currentLetter) > -1)
                    currentCode = "2";
                else if (third.IndexOf(currentLetter) > -1)
                    currentCode = "3";
                else if (currentLetter == "l")
                    currentCode = "4";
                else if (forth.IndexOf(currentLetter) > -1)
                    currentCode = "5";
                else if (currentLetter == "r")
                    currentCode = "6";
                if (currentCode != previousCode)
                    result.Append(currentCode);

                if (result.Length == 4) break;

                if (currentCode != "")
                    previousCode = currentCode;
            }
        }

        if (result.Length < 4)
            result.Append(new String('0', 4 - result.Length));

        return result.ToString().ToUpper();
    }

    public static string RemoveUnwantedChar(string input)
    {
        return Regex.Replace(input, "aeiouyhw", "");
    }
    private static string SoundexByWord(string data)
    {
        var soundexes = new List<string>();
        foreach (var str in data.Split(' '))
        {
            soundexes.Add(Soundex(str));
        }

        #if Net35OrLower
       // string.Join in .Net 3.5 and 
        //before require the second parameter to be an array.
        return string.Join(" ", soundexes.ToArray());
        #endif
        // string.Join in .Net 4 has an overload 
        //that takes IEnumerable<string>
        return string.Join(" ", soundexes);
    }
  }
}

Solution

  • This doesn't tell you where your code goes wrong and probably isn't even the fastest solution, but it seems to get the examples right and is only a few lines of code..

    It implements the six steps of the second version of the algorithm.

    string Soundex(string input)
    {
        // character groups: the 1st one are vowels to remove
        // the other groups are characters to replace by the group index
        List<string> groups = new List<string>() 
                             { "aeiouy", "bfpv", "cgjkqsxz", "dt", "l", "mn", "r" };
    
        // save the 1st character (1)
        string first = input.Substring(0, 1);
        string s = input.ToLower();
    
        // remove unconditionally (1)
        s = s.Replace("h", "").Replace("w", "");
    
        // replace characters in all replacement groups (2)
        for (int g = 1; g < groups.Count; g++)
            for (int i = 0; i < groups[g].Length; i++)
                s = s.Replace(groups[g][i], ((char)(g + (byte)'0')));
    
        // replace repeating digits (3)
        // NOTE: this step actually should be repeated until the length no longer changes!!!
        for (int i = 1; i < 10; i++) s = s.Replace(i + "" + i, i + "");
    
        // now remove characters from group 0: (4)
        for (int i = 0; i < groups[0].Length; i++)  s = s.Replace(groups[0][i].ToString(), "");
    
        // remove the first if it is a digit (5)
        if ( (s[0] >= '0') && (s[0] <= '9') ) s = s.Substring(1);
    
        // add saved first to max 3 digits and pad if needed (6)
        return (first + s.Substring(0, Math.Min(3, s.Length))).PadRight(4, '0');
    }