I have this simple C++ code that converts uppercase characters to lowercase:
#include <iostream>
#include <fstream>
#include <cwctype>
#include <locale>
#include <string>
/// Lowercases a sample string with the case-mapping rules of the tr_TR locale
/// and writes the result to "lowercase_turkish.txt".
///
/// Why not std::towlower: the single-argument std::towlower consults the
/// global C locale (LC_CTYPE), which stays "C" unless std::setlocale is
/// called. Imbuing a locale into the output stream only controls how that
/// stream encodes characters; it has no effect on std::towlower. Under the
/// "C" locale 'I' maps to 'i' and U+0130 is left untouched, which is exactly
/// the incorrect "İiiı" result.
///
/// The fix is locale-generic: the std::ctype<wchar_t> facet is taken from the
/// same std::locale object that is imbued into the stream, so changing the
/// locale name is the only edit needed for any other language.
int main()
{
    std::wstring input_str = L"İiIı";

    // std::locale's constructor throws std::runtime_error when the named
    // locale is not installed on the system.
    const std::locale loc("tr_TR.UTF-8");

    std::wofstream output_file("lowercase_turkish.txt");
    output_file.imbue(loc);

    // Fetch the facet once and reuse it for every character. The mapping
    // itself comes from the platform's locale data for `loc`
    // (NOTE(review): confirm the target platform's tr_TR data maps
    // I -> U+0131 and U+0130 -> i, as glibc's does).
    const auto& case_mapper = std::use_facet<std::ctype<wchar_t>>(loc);
    for (wchar_t& c : input_str) {
        c = case_mapper.tolower(c);
    }

    output_file << input_str << std::endl;
    output_file.close();
    return 0;
}
When giving the input İiIı
I expect the output to be iiıı
but rather I get the incorrect output İiiı
Why is that happening, and how can I solve the problem with minimal changes to the code? I use this code to convert uppercase letters to lowercase in more than 10 languages, and it works well for all of them except Turkish.
I would prefer a solution that is not specific to Turkish.
References from Wikipedia
Dotless I = I, ı / U+0049, U+0131 / LATIN CAPITAL LETTER I, LATIN SMALL LETTER DOTLESS I
Dotted İ = İ, i / U+0130, U+0069 / LATIN CAPITAL LETTER I WITH DOT ABOVE, LATIN SMALL LETTER I
Latin I = I, i / U+0049(LATIN CAPITAL LETTER I), U+0069(LATIN SMALL LETTER I)
The Latin alphabet, largely unaltered except for extensions (such as diacritics), is used to write English and other modern European languages.
Check Dotted and dotless I in computing
Lowercasing "İiIı" with a Latin (non-Turkish) locale gives "iiiı": both uppercase forms, I and İ, are lowered to i.
Lowercasing "İiIı" with the Turkish locale gives "iiıı": İ is lowered to i, and I is lowered to ı.
Test Code - C++ 👍
using ICU on Windows
The code was compiled with Microsoft Visual C++ compiler.
To use this code
You can clone/test/run the full source code from https://github.com/JomaStackOverflowAnswers/ToLowerTurkish
#include <iostream>
#include <string>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <unicode/locid.h>
#ifdef _WIN32
#include <Windows.h>
#endif
using namespace std::string_literals;
int main()
{
using namespace icu;
#ifdef _WIN32
SetConsoleOutputCP(CP_UTF8);
#endif
std::u16string data = u"İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131 - Default locale = "s;
std::string data2 = u8"İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131 - Custom Locale = "s;
UnicodeString localeName;
UnicodeString uni_str(data.c_str(), data.length());
uni_str.toLower();
uni_str += Locale::getDefault().getDisplayName(localeName);
UnicodeString uni_str2 = UnicodeString::fromUTF8(StringPiece(data2));
Locale turkishLocale("tr", "TR");
uni_str2.toLower(turkishLocale);
uni_str2 += turkishLocale.getDisplayName(localeName);
std::string str;
uni_str.toUTF8String(str);
std::string str2;
uni_str2.toUTF8String(str2);
std::cout << str << std::endl;
std::cout << str2 << std::endl;
return EXIT_SUCCESS;//0
}
Output
i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı - default locale = English (United States)
istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı - custom locale = Turkish (Turkey)
Screenshots C++ CODE
Visual Studio Code
Windows Terminal
Test Code - C# 👍
You can test/check from https://replit.com/@JomaCorpFX/ToLowerTurkish#main.cs
using System;
using System.Globalization;
public class Program
{
    // Sample text mixing Turkish dotted/dotless I forms, both typed literally
    // and spelled as \u escapes.
    private const string Sample = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131";

    /// <summary>
    /// Prints the sample lowercased with the current culture, then with an
    /// explicitly constructed tr-TR culture.
    /// </summary>
    public static void Main()
    {
        PrintLowered("System Culture", CultureInfo.CurrentCulture);
        PrintLowered("Custom Culture", new CultureInfo("tr-TR"));
    }

    /// <summary>
    /// Writes a header line with the label and culture name, followed by the
    /// sample lowercased using that culture.
    /// </summary>
    private static void PrintLowered(string label, CultureInfo culture)
    {
        Console.WriteLine($"{label} {culture.Name}");
        Console.WriteLine(Sample.ToLower(culture));
    }
}
Output
System Culture en-US
istanbul, diyarbakır, diyarbakir, türkiye 🌍 iiiı 🌍 iiiı
Custom Culture tr-TR
istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı
Test Code - Java 👍
You can test/check from https://replit.com/@JomaCorpFX/ToLowerTurkish-1#Main.java
import java.util.Locale;
public class Main {
public static void main(String[] args) {
String data = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131";
Locale current = Locale.getDefault();
System.out.println("Current Locale: " + current);
System.out.println(data.toLowerCase(current));
Locale turkishLocale = new Locale("tr", "TR");
System.out.println("Custom Locale: " + turkishLocale);
System.out.println(data.toLowerCase(turkishLocale));
}
}
Output
Current Locale: en_US
i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
Custom Locale: tr_TR
istanbul, diyarbakır, diyarbakır, türkiye 🌍 iiıı 🌍 iiıı
Test Code - Python ❌
WARNING: On Windows, calling locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8") has no effect on the result: the output of str.lower() remains the same as with the default locale.
import locale
# Sample text mixing Turkish dotted/dotless I forms, both typed literally and
# spelled as \u escapes.
sample_text = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131"

# Report the default locale, then lowercase the sample.
initial_locale = locale.getdefaultlocale()
print(f"Default Locale: {initial_locale}")
print(sample_text.lower())

# Request the Turkish locale for all categories (setlocale returns the new
# locale string), then lowercase the same sample again for comparison.
active_locale = locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8")
print(f"Custom Locale: {active_locale}")
print(sample_text.lower())
Output
PS C:\Users\Megam\Downloads\icu4c-72_1-data-bin-b\TestIcu\ToLowerTurkish> python
Python 3.9.13 (tags/v3.9.13:6de2ca5, May 17 2022, 16:36:42) [MSC v.1929 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import locale
>>>
>>> data = "İstanbul, Diyarbakır, DİYARBAKIR, Türkiye 🌍 İiIı 🌍 \u0130\u0069\u0049\u0131"
>>> defaultLocale = locale.getdefaultlocale()
>>> print("Default Locale: " + str(defaultLocale))
Default Locale: ('en_US', 'cp1252')
>>> print(data.lower())
i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
>>>
>>> turkishlocale = locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8")
>>> print("Custom Locale: " + str(turkishlocale))
Custom Locale: tr_TR.UTF-8
>>> print(data.lower())
i̇stanbul, diyarbakır, di̇yarbakir, türkiye 🌍 i̇iiı 🌍 i̇iiı
>>>