Search code examples
sqlmysql

Count of repeating character patterns in a string using MySQL


Given a row in an example table as:

Column A
abcccaaaaddeeaabbaa

How can we get the following output using MYSQL?

sequence occurrences
aaaa 1
ccc 1
dd 1
ee 1
aa 2
bb 1
a 1
b 1

Tried to compute this using MySQL. Cannot get the right logic for the same.

Edit: My latest attempt with BigQuery:

WITH Input AS (
  SELECT 'abcccaaaaddeeaabbaa' AS str
),

Exploded AS (
  SELECT 
    SUBSTR(str, pos, 1) AS char,
    pos,
    LAG(SUBSTR(str, pos, 1)) OVER (ORDER BY pos) AS prev_char
  FROM Input, 
  UNNEST(GENERATE_ARRAY(1, LENGTH(str))) AS pos
),

Grouped AS (
  SELECT
    char,
    pos,
    IF(char = prev_char, 0, 1) AS new_group
  FROM Exploded
),

Sequenced AS (
  SELECT
    char,
    pos,
    SUM(new_group) OVER (ORDER BY pos) AS sequence_id
  FROM Grouped
),

Aggregated AS (
  SELECT
    sequence_id,
    STRING_AGG(char, '' ORDER BY pos) AS sequence
  FROM Sequenced
  GROUP BY sequence_id
),

FinalCounts AS (
  SELECT
    sequence,
    COUNT(*) AS occurrences
  FROM Aggregated
  GROUP BY sequence
)
select * from FinalCounts

Solution

  • Split it up into one row per character, then identify runs of characters and summarize them:

    with recursive chars as (
      select bar,
        char_length(bar) idx,
        substr(bar, -1) ch
      from foo
    
      union all
    
      select bar, idx-1, substr(bar,idx-1,1)
      from chars
      where idx > 1
    ),
    charruns as (
      select ch, idx,
        count(1) over (order by idx)
          - count(1) over (partition by ch order by idx)
          run_id
      from chars
    ),
    runs as (
      select
        repeat(ch, count(1)) sequence,
        min(idx) idx
      from charruns
      group by ch, run_id
    ),
    occurrences as (
      select
        sequence,
        count(1) occurrences,
        min(idx) idx
      from runs
      group by sequence
    )
    select sequence, occurrences
    from occurrences
    order by char_length(sequence) desc, idx
    

    fiddle