I am having A dataset of Three columns ID
, 'sort_seqand
level. basically i want to identify id wise level sequence sort by sort_seq. please suggest any optimal code other then
for loop` because it is taking longer time with dictionary and appending in list.
Input Dataset
import pandas as pd
import numpy as np
data = {'id': [1, 1, 1, 1,2, 2, 3, 3, 3, 3, 4, 5, 5, 6],
'sort_seq': [89, 24, 56, 8, 5, 64, 93, 88, 61, 31, 50, 75, 1, 81],
'level':['a', 'a', 'b', 'c', 'x', 'x', 'g', 'a', 'b', 'b', 'b', 'c', 'c','b']}
df = pd.DataFrame(data)
Expected Output
Tried Code
collect = []
for ij in df.id.unique():
idict = {}
x = df[df['id'] == ij]
x = x.sort_values(by='sort_seq',ascending=True)
x = x.reset_index()
idict[ij] = x['level'].tolist()
collect.append(idict)
collect
First sort by both columns by DataFrame.sort_values
, then grouping by consecutive values in level
with counter by Series.value_counts
, join number of counts by values of level
, so possible aggregate by join
:
np.random.seed(123)
data = {'id': [1, 1, 1, 1,2, 2, 3, 3, 3, 3, 4, 5, 5, 6],
'sort_seq': np.random.randint(0, 100, size=14),
'level':['a', 'a', 'b', 'c', 'x', 'x', 'g', 'a', 'b', 'b', 'b', 'c', 'c','b']}
df = pd.DataFrame(data)
df1 = df.sort_values(['id','sort_seq'])
df1 = (df1.groupby(['id', df1['level'].ne(df1['level'].shift()).cumsum()])['level']
.value_counts()
.droplevel(1)
.rename('col')
.reset_index()
.assign(level=lambda x: x['col'].astype(str) + x['level'])
.groupby('id')['level'].agg(','.join)
.reset_index(name='PATTERN')
)
print(df1)
id PATTERN
0 1 1c,2a,1b
1 2 2x
2 3 1b,1g,1b,1a
3 4 1b
4 5 2c
5 6 1b
Or use itertools.groupby
with Counter
in comprehension:
import itertools
from collections import Counter
f = lambda x: ','.join(f'{b}{a}'
for _, g in itertools.groupby(x)
for a, b in Counter(g).items())
df1 = (df.sort_values(['id','sort_seq'])
.groupby('id')['level']
.agg(f)
.reset_index(name='PATTERN'))
print(df1)
id PATTERN
0 1 1c,2a,1b
1 2 2x
2 3 1b,1g,1b,1a
3 4 1b
4 5 2c
5 6 1b