Search code examples
c++character-encoding

C++ doesn't convert the uppercase "I" character to lowercase correctly


I have this simple C++ code that converts uppercase characters to lowercase:

#include <iostream>
#include <fstream>
#include <cwctype>
#include <locale>
#include <string>

int main()
{
    // The Turkish locale is attached to the output stream only.
    const std::locale turkish("tr_TR.UTF-8");
    std::wofstream out("lowercase_turkish.txt");
    out.imbue(turkish);

    // Lower-case the sample in place, one wide character at a time.
    // NOTE(review): std::towlower takes no locale argument, so `turkish` is
    // never passed to the conversion; it follows the C locale, which this
    // program never sets via std::setlocale.
    std::wstring text = L"İiIı";
    for (std::wstring::size_type i = 0; i < text.size(); ++i) {
        text[i] = std::towlower(text[i]);
    }

    out << text << std::endl;
    out.close();

    return 0;
}

Given the input İiIı, I expect the output iiıı, but instead I get the incorrect output İiiı.

Why is this happening, and how can I solve the problem with minimal changes to the code? I use this code to convert uppercase letters to lowercase in more than 10 languages, and it works well for all of them except Turkish.

I would rather not use a solution that is very specific to Turkish.


Solution

  • References from Wikipedia

    Dotless I = I, ı / U+0049, U+0131 / LATIN CAPITAL LETTER I, LATIN SMALL LETTER DOTLESS I

    Dotted İ = İ, i / U+0130, U+0069 / LATIN CAPITAL LETTER I WITH DOT ABOVE, LATIN SMALL LETTER I

    Latin I = I, i / U+0049(LATIN CAPITAL LETTER I), U+0069(LATIN SMALL LETTER I)

    The Latin alphabet, largely unaltered with the exception of extensions (such as diacritics), is used to write English and other modern European languages.

    Check Dotted and dotless I in computing

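    Lowercasing "İiIı" with a Latin (non-Turkish) locale gives "iiiı": both uppercase I and İ are lowered to i. (In the ICU, Java and Python outputs below, İ is actually lowered to i followed by U+0307 COMBINING DOT ABOVE, which renders as i̇.)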

    Lowercasing "İiIı" with a Turkish locale gives "iiıı": İ is lowered to i and I is lowered to ı.


    Test Code - C++ 👍

    using ICU on Windows

    The code was compiled with Microsoft Visual C++ compiler.

    To use this code

    1. Install PowerShell 7.x
    2. Run the script "Compile.ps1". It downloads the ICU library and compiles the code.
    3. Run the script "Run.ps1". It runs the generated program in the dist folder.

    You can clone/test/run the full source code from https://github.com/JomaStackOverflowAnswers/ToLowerTurkish

    #include <iostream>
    #include <string>
    #include <unicode/unistr.h>
    #include <unicode/ustream.h>
    #include <unicode/locid.h>
    #ifdef _WIN32
    #include <Windows.h>
    #endif
    
    
    using namespace std::string_literals;
    int main()
    {
        using namespace icu;
        #ifdef _WIN32
        SetConsoleOutputCP(CP_UTF8);
        #endif
    
        std::u16string data = u"İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131 - Default locale = "s;
        std::string data2 =  u8"İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131 - Custom Locale  = "s;
        
        UnicodeString localeName;
    
        UnicodeString uni_str(data.c_str(), data.length());
        uni_str.toLower();
        uni_str += Locale::getDefault().getDisplayName(localeName);
    
        UnicodeString uni_str2 = UnicodeString::fromUTF8(StringPiece(data2));
        Locale turkishLocale("tr", "TR");
        uni_str2.toLower(turkishLocale);
        uni_str2 += turkishLocale.getDisplayName(localeName);
        
    
        std::string str;
        uni_str.toUTF8String(str);
    
        std::string str2;
        uni_str2.toUTF8String(str2);
    
        std::cout << str << std::endl;
        std::cout << str2 << std::endl;
        
        return EXIT_SUCCESS;//0
    }
    

    Output

    i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı - default locale = English (United States)
    istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı - custom locale  = Turkish (Turkey)
    

    Screenshots C++ CODE

    Visual Studio Code

    vscode

    Windows Terminal

    wt


    Test Code - C# 👍

    You can test/check from https://replit.com/@JomaCorpFX/ToLowerTurkish#main.cs

    using System;
    using System.Globalization;
                        
    public class Program
    {
        // Prints the sample text lower-cased under the current culture and
        // then under the tr-TR culture, each preceded by a heading line that
        // names the culture.
        public static void Main()
        {
            const string sample = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131";

            CultureInfo current = CultureInfo.CurrentCulture;
            Report($"System Culture {current.Name}", current, sample);

            CultureInfo turkish = new CultureInfo("tr-TR");
            Report($"Custom Culture {turkish.Name}", turkish, sample);
        }

        // Writes the heading, then the text lower-cased with the given culture.
        private static void Report(string heading, CultureInfo culture, string text)
        {
            Console.WriteLine(heading);
            Console.WriteLine(text.ToLower(culture));
        }
    }
    

    Output

    System Culture en-US
    istanbul, diyarbakır, diyarbakir, türkiye 🌍 iiiı 🌍 iiiı
    Custom Culture tr-TR
    istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı
    

    Test Code - Java 👍

    You can test/check from https://replit.com/@JomaCorpFX/ToLowerTurkish-1#Main.java

    import java.util.Locale;
    
    public class Main {
      /**
       * Prints the sample text lower-cased with the JVM default locale and
       * then with the tr_TR locale, each preceded by a line naming the locale.
       */
      public static void main(String[] args) {
        final String sample = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131";

        printLowered("Current Locale: ", Locale.getDefault(), sample);
        printLowered("Custom Locale: ", new Locale("tr", "TR"), sample);
      }

      /** Prints the label followed by the locale, then the text lower-cased with that locale. */
      private static void printLowered(String label, Locale locale, String text) {
        System.out.println(label + locale);
        System.out.println(text.toLowerCase(locale));
      }
    }
    

    Output

    Current Locale: en_US
    i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
    Custom Locale: tr_TR
    istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı
    

    Test Code - Python ❌

    WARNING: On Windows, calling locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8") has no effect on lowercasing; the result of str.lower() remains the same.

    import locale
    
    # Sample text mixing dotted and dotless I forms; the trailing \u escapes
    # spell the same four characters as "İiIı".
    data = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131"
    # Report the default locale, then lower-case the text with str.lower().
    defaultLocale = locale.getdefaultlocale()
    print("Default Locale: " + str(defaultLocale))
    print(data.lower())
    
    # Set every locale category to Turkish and lower-case again; setlocale()
    # returns the name of the locale that is now in effect.
    turkishlocale = locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8")
    print("Custom Locale: " + str(turkishlocale))
    print(data.lower())
    

    Output

    PS C:\Users\Megam\Downloads\icu4c-72_1-data-bin-b\TestIcu\ToLowerTurkish> python
    Python 3.9.13 (tags/v3.9.13:6de2ca5, May 17 2022, 16:36:42) [MSC v.1929 64 bit (AMD64)] on win32
    Type "help", "copyright", "credits" or "license" for more information.
    >>> import locale
    >>>
    >>> data = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131"
    >>> defaultLocale = locale.getdefaultlocale()
    >>> print("Default Locale: " + str(defaultLocale))
    Default Locale: ('en_US', 'cp1252')
    >>> print(data.lower())
    i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
    >>>
    >>> turkishlocale = locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8")
    >>> print("Custom Locale: " + str(turkishlocale))
    Custom Locale: tr_TR.UTF-8
    >>> print(data.lower())
    i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
    >>>