Given a row in an example table as:
Column A |
---|
abcccaaaaddeeaabbaa |
How can we get the following output using MYSQL?
sequence | occurrences |
---|---|
aaaa | 1 |
ccc | 1 |
dd | 1 |
ee | 1 |
aa | 2 |
bb | 1 |
a | 1 |
b | 1 |
Tried to compute this using MySQL. Cannot get the right logic for the same.
Edit: My latest attempt with BigQuery:
WITH Input AS (
SELECT 'abcccaaaaddeeaabbaa' AS str
),
Exploded AS (
SELECT
SUBSTR(str, pos, 1) AS char,
pos,
LAG(SUBSTR(str, pos, 1)) OVER (ORDER BY pos) AS prev_char
FROM Input,
UNNEST(GENERATE_ARRAY(1, LENGTH(str))) AS pos
),
Grouped AS (
SELECT
char,
pos,
IF(char = prev_char, 0, 1) AS new_group
FROM Exploded
),
Sequenced AS (
SELECT
char,
pos,
SUM(new_group) OVER (ORDER BY pos) AS sequence_id
FROM Grouped
),
Aggregated AS (
SELECT
sequence_id,
STRING_AGG(char, '' ORDER BY pos) AS sequence
FROM Sequenced
GROUP BY sequence_id
),
FinalCounts AS (
SELECT
sequence,
COUNT(*) AS occurrences
FROM Aggregated
GROUP BY sequence
)
select * from FinalCounts
Split it up into one row per character, then identify runs of characters and summarize them:
with recursive chars as (
select bar,
char_length(bar) idx,
substr(bar, -1) ch
from foo
union all
select bar, idx-1, substr(bar,idx-1,1)
from chars
where idx > 1
),
charruns as (
select ch, idx,
count(1) over (order by idx)
- count(1) over (partition by ch order by idx)
run_id
from chars
),
runs as (
select
repeat(ch, count(1)) sequence,
min(idx) idx
from charruns
group by ch, run_id
),
occurrences as (
select
sequence,
count(1) occurrences,
min(idx) idx
from runs
group by sequence
)
select sequence, occurrences
from occurrences
order by char_length(sequence) desc, idx