
Pandas, for each groupby group, enumerate over column of strings and convert to counter dictionary


I'm trying to automate building a networkx graph for any input pandas dataframe.

The dataframe looks like this:

  FeatureID       BC         chrom       pos        ftm_call
  1_1_1           GCTATT     12          25398138   NRAS_3
  1_1_1           GCCTAT     12          25398160   NRAS_3
  1_1_1           GCCTAT     12          25398073   NRAS_3
  1_1_1           GATCCT     12          25398128   NRAS_3
  1_1_1           GATCCT     12          25398107   NRAS_3

Here's the algorithm I need to sort out:

  • Group by FeatureID
  • For each FeatureID, select the graph whose "name" attribute matches the group's ftm_call
  • For each row in the group, enumerate over the BC string, starting from the position given in the pos column
  • For every letter in BC, check whether that letter is already in the graph at that position; if not, add it with a weight of 1, otherwise add 1 to its weight (see the toy example below)
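
Just to illustrate the per-letter counting I'm after, here's a toy example for a single barcode (not my real code, only the behaviour I want):

from collections import Counter

# one barcode and its starting position, taken from the first row above
bc, start_pos = "GCTATT", 25398138

# count each nucleotide at its genomic position
counts = Counter()
for offset, letter in enumerate(bc):
    counts[(start_pos + offset, letter)] += 1

# counts -> {(25398138, 'G'): 1, (25398139, 'C'): 1, (25398140, 'T'): 1, ...}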

So far, here is what I have:

import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict

# read in test basecalls
hamming_df = pd.read_csv("./test_data.txt", sep="\t")
hamming_df = hamming_df[["FeatureID", "BC", "chrom", "pos"]]

# initiate graphs 
G = nx.DiGraph(name="G")
KRAS = nx.DiGraph(name="KRAS")
NRAS_3 = nx.DiGraph(name="NRAS_3")

# list of reference graphs
ref_graph_list = [G, KRAS, NRAS_3]

def add_basecalls(row):
    basecall = row.BC.astype(str)
    target = row.name[1]
    pos = row["pos"]
    chrom = row["chrom"]

    # initialize counter dictionary
    d = defaultdict()

    # select graph that matches ftm call
    graph = [f for f in ref_graph_list if f.graph["name"] == target]

stuff = hamming_df.groupby(["FeatureID", "ftm_call"])  
stuff.apply(add_basecalls)

But this isn't pulling the barcodes out as strings that I can simply enumerate over; it's pulling them out as a Series, and I'm stuck.

The desired output is a graph containing the following information; an example is shown for the first BC, "GCTATT", with fictitious counts:

FeatureID    chrom    pos         Nucleotide    Weight
1_1_1        12       25398138       G            10
1_1_1        12       25398139       C            22
1_1_1        12       25398140       T            12
1_1_1        12       25398141       A            15
1_1_1        12       25398142       T            18
1_1_1        12       25398143       T            22

Thanks in advance!


Solution

  • You probably need an additional apply with axis=1 to parse the rows for each group:

    import pandas as pd
    import numpy as np
    import networkx as nx
    from collections import defaultdict
    
    # initiate graphs
    GRAPHS = {"G": nx.DiGraph(name="G"),
              "KRAS": nx.DiGraph(name="KRAS"),
              "NRAS_3": nx.DiGraph(name="NRAS_3"), # notice that test_data.txt has "NRAS_3" not "KRAS_3"
         }
    
    WEIGHT_DICT = defaultdict(int)  # plain counter dict, if you also want counts outside the graphs
    
    def update_weight_for_row(row, target_graph):
        pos = row["pos"]
        chrom = row["chrom"]
        for letter in row.BC:
            print(letter)
            # now you have access to letters in BC per row
            # and can update graph weights as desired
    
    def add_basecalls(grp):
        # select graph that matches ftm_call
        target = grp.name[1]
        target_graph = GRAPHS[target]
        grp.apply(lambda row: update_weight_for_row(row, target_graph), axis=1)
    
    # read in test basecalls
    hamming_df = pd.read_csv("./test_data.txt", sep="\t")
    hamming_df2 = hamming_df[["FeatureID", "BC", "chrom", "pos"]]  # Why is this line needed?
    stuff = hamming_df.groupby(["FeatureID", "ftm_call"])  
    stuff.apply(lambda grp: add_basecalls(grp))
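
  • To actually update the weights, one way to flesh out update_weight_for_row is to keep one node per (chrom, position, nucleotide) and bump a "weight" attribute each time that letter shows up again. This is only a sketch; the tuple node key and the "weight" attribute name are choices made here, not anything prescribed by networkx:

    def update_weight_for_row(row, target_graph):
        pos = row["pos"]
        chrom = row["chrom"]
        for offset, letter in enumerate(row.BC):
            node = (chrom, pos + offset, letter)          # one node per chrom/position/nucleotide
            if target_graph.has_node(node):
                target_graph.nodes[node]["weight"] += 1   # already seen at this position: bump the count
            else:
                target_graph.add_node(node, weight=1)     # first sighting: start with a weight of 1

    # Afterwards the counts can be read back out of each graph, e.g.:
    # rows = [(chrom, pos, nt, data["weight"])
    #         for (chrom, pos, nt), data in GRAPHS["NRAS_3"].nodes(data=True)]
    # weights_df = pd.DataFrame(rows, columns=["chrom", "pos", "Nucleotide", "Weight"])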