# Sample input: three rows, each naming an (A, B) pair together with the
# *_len and *_prot_len values recorded for that row.
df = pd.DataFrame(
    {
        "A": ["Bac 3"] * 3,
        "B": ["Bac 6", "Bac 6", "Bac 8"],
        "A_len": [39577] * 3,
        "B_len": [40449, 40449, 38091],
        "A_prot_len": [500, 550, 800],
        "B_prot_len": [450, 300, 600],
    }
)
A B A_len B_len A_prot_len B_prot_len
0 Bac 3 Bac 6 39577 40449 500 450
1 Bac 3 Bac 6 39577 40449 550 300
2 Bac 3 Bac 8 39577 38091 800 600
The dataframe has been sorted by A and B whereby their order of occurrence is linked. I've made a function to calculate distances between unique pairs of A and B but would like to present the result in a matrix.
def dist_species(df):
    """Return one row per unique (A, B) pair with its distance in ``Dist``.

    For every distinct value of ``B`` (in order of first appearance) the
    matching rows are grouped by ``A`` and ``B``: ``A_len`` and ``B_len`` take
    the first value of each group, ``A_prot_len`` and ``B_prot_len`` are
    summed. The distance is then
    ``1 - (A_prot_len + B_prot_len) / (A_len + B_len)``.

    Args:
        df: DataFrame with the columns ``A``, ``B``, ``A_len``, ``B_len``,
            ``A_prot_len`` and ``B_prot_len``. It is not modified.

    Returns:
        DataFrame with the six input columns plus ``Dist``. The index restarts
        at 0 for every ``B`` value, so it can contain duplicates. When ``df``
        has no rows, an empty frame with those seven columns is returned.
    """
    columns = ['A', 'B', 'A_len', 'B_len', 'A_prot_len', 'B_prot_len', 'Dist']
    aggregations = {
        'A_len': 'first',
        'B_len': 'first',
        'A_prot_len': 'sum',
        'B_prot_len': 'sum',
    }

    frames = []
    # Series.unique() keeps first-appearance order; iterating a set of strings
    # would let the row order of the result change from run to run.
    for species in df['B'].unique():
        pairs = df[df['B'] == species]
        # as_index=False keeps A and B as ordinary columns and gives each
        # per-species summary a fresh 0..k-1 index.
        summary = pairs.groupby(['A', 'B'], as_index=False).agg(aggregations)
        summary['Dist'] = 1 - (
            (summary['A_prot_len'] + summary['B_prot_len'])
            / (summary['A_len'] + summary['B_len'])
        )
        frames.append(summary)

    # pd.concat raises on an empty list, so handle the no-rows case explicitly.
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)
# Run the function on the sample frame; the returned frame is shown below.
dist_species(df)
A B A_len B_len A_prot_len B_prot_len Dist
0 Bac 3 Bac 6 39577 40449 1050 750 0.977507
0 Bac 3 Bac 8 39577 38091 800 600 0.981975
How can I restructure my output into a matrix like this:
| | Bac 3 | Bac 6 | Bac 8 |
| --------| --------|----------|----------|
| Bac 3 | 1 | 0.977507 | 0.981975 |
| Bac 6 | 0.977507| 1 | 0 |
| Bac 8 | 0.981975| 0 | 1 |
Really appreciate any guidance, thank you!
I first compute the distance for each (A, B) pair:
# One distance per (A, B) pair: the *_len columns take the first value of each
# group and the *_prot_len columns are summed. The formula is applied with
# whole-column arithmetic rather than a row-by-row apply, so the values and
# the (A, B) MultiIndex are the same without a Python-level loop over rows.
computed_distances = (
    df.groupby(["A", "B"])
    .agg({"A_len": "first", "B_len": "first", "A_prot_len": "sum", "B_prot_len": "sum"})
    .pipe(lambda t: 1 - (t["A_prot_len"] + t["B_prot_len"]) / (t["A_len"] + t["B_len"]))
)
The resulting `computed_distances` is a multi-index Series:
A B
Bac 3 Bac 6 0.977507
Bac 8 0.981975
dtype: float64
I also define a variable `bac` that contains all the bacteria IDs present in `df`:
# Sorted, de-duplicated array of every ID that appears in column A or column B.
bac = np.unique(list(df["A"]) + list(df["B"]))
After that I build the distance dataframe in the following way:
# First create a square DataFrame with the bacteria IDs as both index and
# columns, then unstack it into a multi-index Series (the same kind of key as
# the computed_distances Series). dtype=float keeps the data numeric from the
# start instead of relying on fillna to downcast an object-dtype Series.
dist_df = pd.DataFrame(index=bac, columns=bac, dtype=float).unstack()
# Fill the Series with the computed distances:
# dist_df(i, j) = computed_distances(i, j)
dist_df.loc[computed_distances.index] = computed_distances.values
# ...and mirror them: dist_df(j, i) = computed_distances(i, j)
dist_df.loc[computed_distances.swaplevel(0, 1).index] = computed_distances.values
# Set the distance between a bacterium and itself to 1.
dist_df.loc[
    dist_df.index.get_level_values(0) == dist_df.index.get_level_values(1)
] = 1
# The remaining NaN entries are the distances that were not computed.
dist_df = dist_df.fillna(0)
# Finally, return to the square DataFrame shape.
dist_df = dist_df.unstack()
The resulting `dist_df` is:
Bac 3 Bac 6 Bac 8
Bac 3 1.000000 0.977507 0.981975
Bac 6 0.977507 1.000000 0.000000
Bac 8 0.981975 0.000000 1.000000