sql sql-server t-sql non-ascii-characters

SQL Server: How to find position of the first uppercase non-ascii character in the string?

Generally speaking I need to search varchar(100) column for the pattern '%bA%', where

A - uppercase non-ascii character and
b - lowercase non-ascii character.

From the high level perspective I need to find all strings where [space] symbol is missed before the uppercase character, for example as a result of firstname and lastname columns concatenation without a space between them.

SQLFiddle environment for reproducing

-- WORKING (ASCII) - thanx to @Serpiton
create table #tmp (value varchar(100));
insert into #tmp (value) values ('JohnnyBravo'); -- expected output
insert into #tmp (value) values ('Johnny Bravo');
insert into #tmp (value) values ('Johnnybravo');
insert into #tmp (value) values ('johnnybravo');
select * from #tmp WHERE value collate Latin1_General_CS_AI like '%[a-z][^ abcdefghijklmnopqrstuvwxyz]%'
drop table #tmp;

-- NOT WORKING (NON-ASCII, Cyrillic, win-1251)
create table #tmp (value varchar(100));
insert into #tmp (value) values ('АндрейМорозов'); -- expected output
insert into #tmp (value) values ('Андрей Морозов');
insert into #tmp (value) values ('Андрейморозов');
insert into #tmp (value) values ('андрейморозов');
select * from #tmp WHERE value collate Cyrillic_General_CS_AI like '%[а-я][^ абвгдежзиклмнопрстуфхцчшщъыьэюя]%'
drop table #tmp;

-- UNICODE - NOT WORKING TOO
create table #tmp (value nvarchar(100) collate Cyrillic_General_CS_AI);
insert into #tmp (value) values (N'АндрейМорозов'); -- expected output
insert into #tmp (value) values (N'Андрей Морозов');
insert into #tmp (value) values (N'Андрейморозов');
insert into #tmp (value) values (N'андрейморозов');
select * from #tmp WHERE value like '%[а-я][^ абвгдежзиклмнопрстуфхцчшщъыьэюя]%'
drop table #tmp;

Solution

If it really is just "firstname and lastname columns concatenation without a space between" then this may suffice:

select
    *
from mytable
where LEN(ConcatenatedName) <> LEN(REPLACE(ConcatenatedName, ' ', ''));

You can substitute whatever passes for a space in your local environment, of course. If there are, say, middle name which should be space-delimited this will pick them up too. However, you may get false positives if there are intended spaces e.g. two part surnames.

Second go

Fair point. I hadn't accounted for case. Here's some magic code which works with your test data:

with digits as
(
    SELECT
        *
    FROM
        (
        VALUES (0),(1), (2), (3), (4), (5), (6), (7), (8), (9)
        ) AS MyTable(i)
)
, Number as
(
    select (a.i * 10) + b.i as number
    from digits as a
    cross join digits as b
)
, LetterCase as
(
    select
        n.number
        ,t.value
        ,SUBSTRING(t.value, n.number, 1) as Letter
        ,ASCII(SUBSTRING(t.value, n.number, 1)) LetterASCII
        ,CASE
            when ASCII(SUBSTRING(t.value, n.number, 1)) between 65 and 90
                then 'True'
            else 'False'
        end as IsUpper
    from Number as n
    cross join #tmp as t
    where n.number between 1 and LEN(t.value)
)
select
    lc.value
from LetterCase as lc
where lc.IsUpper = 'True'
and lc.number > 1
and SUBSTRING(lc.value, lc.number - 1, 1) <> ' '

drop table #tmp;

It draws on my answer to this other question - Split words with a capital letter in sql.

Third go

Heres some magic code which works with your revised test data. You have to take care if your instance's (or database's or columns's) default collation is not the same as the one with which you want to work.

;with digits as
(
    SELECT
        *
    FROM
        (
        VALUES (0),(1), (2), (3), (4), (5), (6), (7), (8), (9)
        ) AS MyTable(i)
)
, Number as
(
    select (a.i * 10) + b.i as number
    from digits as a
    cross join digits as b
)
, UpperCaseCharacters as
(
    select NCHAR(1040) collate Cyrillic_General_CS_AI as CodePoint --А
    UNION ALL
    select NCHAR(1052) --М
    -- Extend this list with all the upper case character in your chosen glyph list.
)
, LetterCase as
(
    select
        n.number
        ,t.value
        ,CASE
            when SUBSTRING(t.value, n.number, 1) IN (select Codepoint from UpperCaseCharacters)
                then 'True'
            else ''
        end as IsUpper
    from Number as n
    cross join #tmp as t
    where n.number between 1 and LEN(t.value)
)
select
    lc.value
from LetterCase as lc
where lc.IsUpper = 'True'
and lc.number > 1
and SUBSTRING(lc.value, lc.number - 1, 1) <> ' ';