python refactoring dry python-2.5 control-flow

Python trying to Refactor (DRY out) a long Control Flow

I am grabbing a lot of data from and SQL query that takes a long time to run. Since the SQL query takes so long to run, I am grabbing the data from the database in its most granular form. I then cycle through this data once and aggregate it in the forms that are useful to me.

My problem is that I am repeating myself over and over again. However, I am not sure of the best way to refactor this control flow. Thanks in advance!

def processClickOutData(cls, raw_data):
    singles = {}
    total={}
    absolute_total = 0
    channels = {}

    singles_true = {}
    total_true={}
    channels_true = {}
    absolute_total_true = 0

    list_channels = set([])
    list_tids = set([])


    total_position = {}
    total_position_true = {}
    tid_position = {}
    channel_position = {}
    channel_position_true = {}
    tid_position_true = {}

    for row in raw_data:
        gap=row[0]
        count=row[1]
        tid=row[2]
        prefered=row[3]
        channel=row[4]
        position=row[5]

        list_channels.add(channel)
        list_tids.add(tid)


        absolute_total += int(count)

        if total.has_key(gap):
            total[gap] += count
        else:
            total[gap] = count

        if singles.has_key(gap) and singles[gap].has_key(tid):
            singles[gap][tid] += count
        elif singles.has_key(gap):
            singles[gap][tid] = count
        else:
            singles[gap] = {}
            singles[gap][tid] = count

        if channels.has_key(gap) and channels[gap].has_key(channel):
            channels[gap][channel] += count
        elif channels.has_key(gap):
            channels[gap][channel] = count
        else:
            channels[gap] = {}
            channels[gap][channel] = count
        if total_position.has_key(position):
            total_position[position] += count
        else:
            total_position[position] = count
        if tid_position.has_key(position) and tid_position[position].has_key(tid):
            tid_position[position][tid] += count     
        elif tid_position.has_key(position):
            tid_position[position][tid] = count
        else:
            tid_position[position] = {}
            tid_position[position][tid] = count

        if channel_position.has_key(position) and channel_position[position].has_key(channel):
            channel_position[position][channel] += count     
        elif channel_position.has_key(position):
            channel_position[position][channel] = count
        else:
            channel_position[position] = {}
            channel_position[position][channel] = count

        if prefered == 0:
            absolute_total_true += count
            if total_true.has_key(gap):
                total_true[gap] += count
            else:
                total_true[gap] = count

            if singles_true.has_key(gap) and singles_true[gap].has_key(tid):
                singles_true[gap][tid] += count
            elif singles_true.has_key(gap):
                singles_true[gap][tid] = count
            else:
                singles_true[gap] = {}
                singles_true[gap][tid] = count

            if channels_true.has_key(gap) and channels_true[gap].has_key(channel):
               channels_true[gap][channel] += count
            elif channels_true.has_key(gap):
               channels_true[gap][channel] = count
            else:
               channels_true[gap] = {}
               channels_true[gap][channel] = count

            if total_position_true.has_key(position):
               total_position_true[position] += count
            else:
               total_position_true[position] = count 

            if tid_position_true.has_key(position) and tid_position_true[position].has_key(tid):
               tid_position_true[position][tid] += count     
            elif tid_position_true.has_key(position):
               tid_position_true[position][tid] = count
            else:
               tid_position_true[position] = {}
               tid_position_true[position][tid] = count

            if channel_position_true.has_key(position) and channel_position_true[position].has_key(channel):
               channel_position_true[position][channel] += count     
            elif channel_position_true.has_key(position):
               channel_position_true[position][channel] = count
            else:
               channel_position_true[position] = {}
               channel_position_true[position][channel] = count




    final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
                     "total_position" :  total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
    return final_values

Solution

The entire structure you're using to store the data is probably wrong, but since I don't know how you're using it, I can't help you with that.

You can get rid of all of those has_key() calls by using collections.defaultdict. Note thedict.has_key(key) is deprecated anyway, you should just use key in thedict instead.

Look at how I change the for loop too -- you can assign to names right in the for statement, no need to do it separately.

from collections import defaultdict

def processClickOutData(cls, raw_data):
    absolute_total = 0
    absolute_total_true = 0

    list_channels = set()
    list_tids = set()

    total = defaultdict(int)
    total_true = defaultdict(int)
    total_position = defaultdict(int)
    total_position_true = defaultdict(int)

    def defaultdict_int():
        return defaultdict(int)

    singles = defaultdict(defaultdict_int)
    singles_true = defaultdict(defaultdict_int)
    channels = defaultdict(defaultdict_int)
    channels_true = defaultdict(defaultdict_int)
    tid_position = defaultdict(defaultdict_int)
    tid_position_true = defaultdict(defaultdict_int)
    channel_position = defaultdict(defaultdict_int)
    channel_position_true = defaultdict(defaultdict_int)    

    for gap, count, prefered, channel, position in raw_data:
        list_channels.add(channel)
        list_tids.add(tid)

        absolute_total += count
        total[gap] += count
        singles[gap][tid] += count
        channels[gap][channel] += count
        total_position[position] += count
        tid_position[position][tid] += count
        channel_position[position][channel] += count

        if prefered == 0:
            absolute_total_true += count
            total_true[gap] += count
            singles_true[gap][tid] += count
            channels_true[gap][channel] += count
            total_position_true[position] += count
            tid_position_true[position][tid] += count
            channel_position_true[position][channel] += count




    final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
                     "total_position" :  total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
    return final_values

What this does is automatically fill in the correct default values if the keys don't exist. You've got two kinds here. Where you're adding ints, you want to start with 0 if it doesn't exist -- that's what int returns, hence defaultdict(int). Where you're adding a dictionary that adds ints, you need to use a function that returns a defaultdict(int) which is what defaultdict_int does.

Edit: Suggested alternate dictionary structure:

position = defaultdict(lambda: defaultdict(defaultdict_int))
gap = defaultdict(lambda: defaultdict(defaultdict_int))
absolute_total = 0

for gap, count, prefered, channel, position in raw_data:
    absolute_total += count

    posd = position[position]
    posd.setdefault('total', 0)
    posd['total'] += count
    posd['tid'][tid] += count
    posd['channel'][channel] += count

    gapd = gap[gap]
    gapd.setdefault('total', 0)
    gapd['total'] += count
    gapd['tid'][tid] += count
    gapd['channel'][channel] += count

Do the same with the _true versions as well, and you've gone from 12 dicts to 4.