Search code examples
javascript unicode turkish pascalcasing title-case

Javascript convert unicode string to "Title Case"


I have a javascript case conversion problem which I cannot solve due to non-English letters. My main concern is the Turkish alphabet.

What I need to do is this:

  • hello world => Hello World
  • HELLO WORLD => Hello World
  • hELLO wOrLd => Hello World

Here is what I've accomplished so far:

// Upper-cases the string using Turkish casing rules.
// The Turkish-specific lower-case letters are swapped for their upper-case
// counterparts first (notably 'i' -> 'İ' and 'ı' -> 'I'); the built-in
// toUpperCase() then handles every remaining character.
String.prototype.turkishToUpper = function(){
    var upperByLower = { 'i': 'İ', 'ş': 'Ş', 'ğ': 'Ğ', 'ü': 'Ü', 'ö': 'Ö', 'ç': 'Ç', 'ı': 'I' };
    var swapped = this.replace(/(([iışğüçö]))/g, function(ch){
        // the first callback argument is the matched character itself
        return upperByLower[ch];
    });
    return swapped.toUpperCase();
};

// Lower-cases the string using Turkish casing rules.
// The Turkish-specific upper-case letters are swapped for their lower-case
// counterparts first (notably 'İ' -> 'i' and 'I' -> 'ı'); the built-in
// toLowerCase() then handles every remaining character.
String.prototype.turkishToLower = function(){
    var lowerByUpper = { 'İ': 'i', 'I': 'ı', 'Ş': 'ş', 'Ğ': 'ğ', 'Ü': 'ü', 'Ö': 'ö', 'Ç': 'ç' };
    var swapped = this.replace(/(([İIŞĞÜÇÖ]))/g, function(ch){
        // the first callback argument is the matched character itself
        return lowerByUpper[ch];
    });
    return swapped.toLowerCase();
};

// Capitalises the first character of every match of /\w\S*/g and lower-cases
// the rest of that match, using turkishToUpper()/turkishToLower().
// NOTE: `\w` only matches ASCII word characters ([A-Za-z0-9_]), so a match
// begins at the first ASCII letter/digit/underscore of a word; a leading
// non-ASCII letter such as 'ş' or 'ç' is left outside the match.
String.prototype.toProperCase = function () {
    var capitalise = function(word){
        var head = word.charAt(0);
        var tail = word.substr(1);
        return head.turkishToUpper() + tail.turkishToLower();
    };
    return this.replace(/\w\S*/g, capitalise);
};

But this does not give me the correct results, and I suspect that the regex replace only works on ASCII characters, not on Unicode.

When I test with Turkish characters, I get wrong results.

  • şeker becomes şEker instead of Şeker
  • çoban ırmak becomes çOban ıRmak instead of Çoban Irmak

Also, once this is resolved, I need the icing on the cake: words should be separated not only by spaces but also by other delimiter characters such as : - = / etc., so that

  • hello-world becomes Hello-World
  • hello:world becomes Hello:World

I've read through many similar questions here on SO, but no luck so far.

Thanks

Note: I think this is called Title Case but some have argued that it is Pascal Case. To be frank, I am interested in resolving the unicode issue (which I believe is the root cause) rather than semantics, so please forgive me if I've used wrong terminology :)


Solution

  • Standalone function:

    /**
     * Converts a string to "Proper Case": the first character of every word is
     * upper-cased and the remaining characters of that word are lower-cased.
     * A word is a maximal run of characters that are not whitespace, ':' or '-'.
     *
     * @param {string} s - text to convert
     * @returns {string} the converted text
     */
    function toProperCase(s){
        var wordPattern=/([^\s:\-])([^\s:\-]*)/g;
        function capitalise(match,first,rest){
            return first.toUpperCase()+rest.toLowerCase();
        }
        return s.replace(wordPattern,capitalise);
    }
    

    Or, to extend String.prototype instead:

    // Proper-cases the string: the first character of every word is upper-cased
    // and the remaining characters of that word are lower-cased.
    // A word is a maximal run of characters that are not whitespace, ':' or '-'.
    String.prototype.toProperCase=function() {
        var wordPattern=/([^\s:\-])([^\s:\-]*)/g;
        var capitalise=function(match,first,rest){
            return first.toUpperCase()+rest.toLowerCase();
        };
        return this.replace(wordPattern,capitalise);
    };
    
    // Example call on a mixed-case sentence that contains Turkish letters and a
    // hyphenated word; the comment below the call shows the returned string.
    "çoban ırmak becomes çOban ıRmak intead of Çoban Irmak Hello-wOrld".toProperCase();
    // "Çoban Irmak Becomes Çoban Irmak Intead Of Çoban Irmak Hello-World"
    

    Update:

    The following code uses custom mapping tables to convert locale-specific characters (it has only been partially tested). It adds three functions to String.prototype: toLocaleProperCase2, toLocaleLowerCase2 and toLocaleUpperCase2.

    /**
     * Adds three locale-aware case helpers to String.prototype:
     *
     *   toLocaleProperCase2(locale) - upper-cases the first character of every
     *                                 word and lower-cases the rest of the word
     *   toLocaleLowerCase2(locale)  - lower-cases the whole string
     *   toLocaleUpperCase2(locale)  - upper-cases the whole string
     *
     * `locale` is optional and matched case-insensitively against the keys of
     * `localeInfos` (e.g. "tr-TR"). When it is omitted, null/undefined, or not
     * a known locale, the helpers fall back to the plain built-in
     * toUpperCase()/toLowerCase() behaviour instead of throwing.
     */
    (function(){
        // locale specific chars
        // IMPORTANT: keys of this table must always be lower case
        // (for the "tr-TR" locale - "tr-tr"); the caller's locale argument is
        // lower-cased before the lookup.
        // `lower` maps lower-case letters to upper case, `upper` maps upper-case
        // letters to lower case.
        var localeInfos={
                "tr-tr": { lower: { i:"İ", ı:"I", ş:"Ş", ğ:"Ğ", ü:"Ü", ç:"Ç", ö:"Ö" },
                           upper: { İ:"i", I:"ı", Ş:"ş", Ğ:"ğ", Ü:"ü", Ç:"ç", Ö:"ö" } }
            };
        // helper vars
        var mask="\\s:\\-", // add additional delimiter chars to the mask if needed
            rg=new RegExp("([^"+mask+"])([^"+mask+"]*)","g");
        var hasOwn=Object.prototype.hasOwnProperty;

        // helper calculations: one character-class regex per direction and locale
        Object.keys(localeInfos).forEach(function(name){
            var info=localeInfos[name];
            info.lowerSearchRegExp=new RegExp("["+Object.keys(info.lower).join("")+"]","g");
            info.upperSearchRegExp=new RegExp("["+Object.keys(info.upper).join("")+"]","g");
        });

        // Returns the table entry for `locale`, or null when no locale was given
        // or it is unknown. Only own properties count, so names inherited from
        // Object.prototype ("constructor", "toString", ...) are not mistaken
        // for locales.
        function getLocaleInfo(locale){
            if(locale==null)return null;
            var key=String(locale).toLowerCase();
            return hasOwn.call(localeInfos,key)?localeInfos[key]:null;
        }
        // Lower-cases `s`, applying the locale's upper->lower table first.
        function lowerWith(info,s){
            if(info)s=s.replace(info.upperSearchRegExp,function(ch){ return info.upper[ch]; });
            return s.toLowerCase();
        }
        // Upper-cases `s`, applying the locale's lower->upper table first.
        function upperWith(info,s){
            if(info)s=s.replace(info.lowerSearchRegExp,function(ch){ return info.lower[ch]; });
            return s.toUpperCase();
        }

        // extending String.prototype
        String.prototype.toLocaleProperCase2=function toLocaleProperCase2(locale){
            var info=getLocaleInfo(locale);
            // $1 is the first character of a word, $2 the remainder of that word
            return String(this).replace(rg,function($0,$1,$2){
                return upperWith(info,$1)+lowerWith(info,$2);
            });
        };
        String.prototype.toLocaleLowerCase2=function toLocaleLowerCase2(locale){
            return lowerWith(getLocaleInfo(locale),String(this));
        };
        String.prototype.toLocaleUpperCase2=function toLocaleUpperCase2(locale){
            return upperWith(getLocaleInfo(locale),String(this));
        };
    })();
    

    // testing: print the sample sentence, then the result of each helper with
    // the "tr-TR" locale and without any locale argument
    var sss="çoban ırmak ibecıoimes çOban ıRmak intead of Çoban IrImaİk Hello-wOrld";
    console.log("Origin:    ", sss);
    [
        ["Proper TR: ", "toLocaleProperCase2", ["tr-TR"]],
        ["Proper:    ", "toLocaleProperCase2", []],
        ["Lower TR:  ", "toLocaleLowerCase2", ["tr-TR"]],
        ["Lower:     ", "toLocaleLowerCase2", []],
        ["Upper TR:  ", "toLocaleUpperCase2", ["tr-TR"]],
        ["Upper:     ", "toLocaleUpperCase2", []]
    ].forEach(function(row){
        // row = [label, method name, argument list]; an empty list calls the
        // method with no arguments at all
        console.log(row[0], sss[row[1]].apply(sss, row[2]));
    });
    
    // Origin:    çoban ırmak ibecıoimes çOban ıRmak intead of Çoban IrImaİk Hello-wOrld
    // Proper TR: Çoban Irmak İbecıoimes Çoban Irmak İntead Of Çoban Irımaik Hello-World
    // Proper:    Çoban Irmak Ibecıoimes Çoban Irmak Intead Of Çoban Irimaik Hello-World
    // Lower TR:  çoban ırmak ibecıoimes çoban ırmak intead of çoban ırımaik hello-world
    // Lower:     çoban ırmak ibecıoimes çoban ırmak intead of çoban irimaik hello-world
    // Upper TR:  ÇOBAN IRMAK İBECIOİMES ÇOBAN IRMAK İNTEAD OF ÇOBAN IRIMAİK HELLO-WORLD
    // Upper:     ÇOBAN IRMAK IBECIOIMES ÇOBAN IRMAK INTEAD OF ÇOBAN IRIMAİK HELLO-WORLD