Tags: c++, string, unicode-string, non-ascii-characters, wchar-t

Declaring a std::string after Unicode to ASCII conversion is giving Segmentation fault


I am trying to read a wchar_t string from stdin and then convert it from Unicode to ASCII with a function.

The function is somehow not allowing me to use std::string later in the program.

#include <iostream>
#include <string>
#include <locale>
#include <cstring>
#include <cwchar>
using namespace std;
bool UnicodeToAscii(wchar_t* szUnicode, char* szAscii);
int main()
{
    wchar_t w[100];
    wcin>>w;
    char* c;
    bool x=UnicodeToAscii(w,c);
    cout<<c<<"\n";
    string s="hi";
    return 0;
}
bool UnicodeToAscii(wchar_t* szUnicode, char* szAscii)
{
    int len, i;
    if((szUnicode == NULL) || (szAscii == NULL))
        return false;
    len = wcslen(szUnicode);
    for(i=0;i<len+1;i++)
        *szAscii++ = static_cast<char>(*szUnicode++);
    return true;
}

Solution

  • You are not allocating any memory for c, so you are writing character data to random memory and corrupting your program.

    You should stop using character arrays and raw pointers, and start using std::string and std::wstring instead. Let them manage memory for you.

    Try this:

    #include <iostream>
    #include <string>
    
    void UnicodeToAscii(const std::wstring &szUnicode, std::string &szAscii);
    
    int main()
    {
        std::wstring w;
        std::wcin >> w; // or std::getline(std::wcin, w);
    
        std::string c;
        UnicodeToAscii(w, c);
        std::cout << c << "\n";
    
        std::string s = "hi";
        return 0;
    }
    
    void UnicodeToAscii(const std::wstring &szUnicode, std::string &szAscii)
    {
        szAscii.clear();
    
        int len = szUnicode.length();
        char c;
    
        szAscii.reserve(len);
    
        for(int i = 0; i < len; ++i)
        {
            wchar_t w = szUnicode[i];
    
            if ((w >= 0) && (w <= 0x7F))
            {
                // ASCII character
                c = static_cast<char>(w);
            }
            else
            {
                // non-ASCII character
                c = '?';
    
                // wchar_t is 2 bytes (UTF-16) on some systems,
                // but is 4 bytes (UTF-32) on other systems.
                // sizeof() cannot be evaluated by the preprocessor,
                // so check it with an ordinary runtime condition...
                if ((sizeof(wchar_t) == 2) && (w >= 0xD800) && (w <= 0xDFFF))
                {
                    // this is the first unit of a UTF-16 surrogate pair;
                    // skipping ahead makes the loop pass over the second
                    // unit, so the whole pair becomes a single '?'...
                    ++i;
                }
            }
    
            szAscii.push_back(c);
        }
    
    }
    

    Of course, this is very rudimentary, and it only handles true ASCII characters (0x00 - 0x7F). Handling Unicode correctly is much more complex than this. But this answers your immediate question about why you cannot use std::string after calling your function - because you are trashing memory.
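
    If you later need more than a plain ASCII filter, one option is to let the C runtime convert according to the current locale instead of writing the loop yourself. Below is a minimal sketch, not part of the code above: the helper name WideToNarrow is just illustrative, it uses std::wcstombs from <cstdlib>, it assumes the program activates the environment's locale with std::setlocale first, and it simply returns an empty string when the wide string contains a character the locale cannot represent:

    #include <cstdlib>   // std::wcstombs
    #include <clocale>   // std::setlocale
    #include <iostream>
    #include <string>

    // Convert a wide string to a narrow string using the current locale.
    // Returns an empty string if the conversion fails.
    std::string WideToNarrow(const std::wstring &wide)
    {
        // worst case: each wchar_t expands to a few narrow bytes (e.g. UTF-8)
        std::string narrow(wide.size() * 4 + 1, '\0');

        std::size_t written = std::wcstombs(&narrow[0], wide.c_str(), narrow.size());
        if (written == static_cast<std::size_t>(-1))
            return std::string(); // a character was not representable in this locale

        narrow.resize(written);
        return narrow;
    }

    int main()
    {
        std::setlocale(LC_ALL, ""); // use the environment's locale, e.g. a UTF-8 one

        std::wstring w = L"caf\u00E9";
        std::cout << WideToNarrow(w) << "\n";
        return 0;
    }

    In a UTF-8 locale this prints "café" rather than "caf?", and it never writes through an unallocated pointer. For full control over encodings you would normally reach for a dedicated library such as ICU or iconv.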