Search code examples
sql-servert-sqlvbafrequency-analysis

SQL - Determine the most frequently occuring words within a column


Is there an easy way of determining the most frequently occuring word in a column/field using T-SQL or VBA?

I am working on a fuzzy matching system for two given recordsets and would like to produce a matching string where the most frequently occuring words are removed. As the data is from a customer relations management database terms like "limited", "ltd", "plc" and "CORPORATION" would be removed.


Solution

  • Written for sql-server 2005+

    Function to split:

    create function f_split
    (
      @a varchar(max), 
      @delimiter varchar(20)
    )
    RETURNS @t TABLE(substr varchar(200))
    as
    begin
    set @a = @a + @delimiter
    ;with a as
    (
      select cast(1 as bigint) f1, charindex(@delimiter, @a) f2
      where len(@a) > 0
      union all
      select f2 + (len(@delimiter)) + 1, charindex(@delimiter, @a, f2+1)
      from a
      where f2 > 0
    )
    insert @t
    select substring(@a, f1, f2 - f1) from a
    where f1 < f2
    return
    end
    go
    

    Query:

    --testdata
    declare @table table(name varchar(50))
    
    insert @table values('bla bla bla ltd')
    insert @table values('bla plc ltd')
    insert @table values('more text CORPORATION')
    
    
    declare @matchlist table(name varchar(50), replacement varchar(50))
    insert @matchlist values('ltd', 'limited')
    insert @matchlist values('plc', 'limited')
    insert @matchlist values('CORPORATION', 'limited')
    
    --query
    select coalesce(m.replacement, a.substr) name, count(*) count from @table p
    cross apply
    (
      select substr from 
      dbo.f_split(p.name, ' ')
    ) a
    left join
    @matchlist m
    on a.substr = m.name
    group by coalesce(m.replacement, a.substr)
    order by 2 desc
    

    Result:

    name  count
    ----  -----
    bla       4
    limited   4
    more      1
    text      1