Search code examples
pythonrefactoringdrypython-2.5control-flow

Python trying to Refactor (DRY out) a long Control Flow


I am grabbing a lot of data from and SQL query that takes a long time to run. Since the SQL query takes so long to run, I am grabbing the data from the database in its most granular form. I then cycle through this data once and aggregate it in the forms that are useful to me.

My problem is that I am repeating myself over and over again. However, I am not sure of the best way to refactor this control flow. Thanks in advance!

def processClickOutData(cls, raw_data):
    singles = {}
    total={}
    absolute_total = 0
    channels = {}

    singles_true = {}
    total_true={}
    channels_true = {}
    absolute_total_true = 0

    list_channels = set([])
    list_tids = set([])


    total_position = {}
    total_position_true = {}
    tid_position = {}
    channel_position = {}
    channel_position_true = {}
    tid_position_true = {}

    for row in raw_data:
        gap=row[0]
        count=row[1]
        tid=row[2]
        prefered=row[3]
        channel=row[4]
        position=row[5]

        list_channels.add(channel)
        list_tids.add(tid)


        absolute_total += int(count)

        if total.has_key(gap):
            total[gap] += count
        else:
            total[gap] = count

        if singles.has_key(gap) and singles[gap].has_key(tid):
            singles[gap][tid] += count
        elif singles.has_key(gap):
            singles[gap][tid] = count
        else:
            singles[gap] = {}
            singles[gap][tid] = count

        if channels.has_key(gap) and channels[gap].has_key(channel):
            channels[gap][channel] += count
        elif channels.has_key(gap):
            channels[gap][channel] = count
        else:
            channels[gap] = {}
            channels[gap][channel] = count
        if total_position.has_key(position):
            total_position[position] += count
        else:
            total_position[position] = count
        if tid_position.has_key(position) and tid_position[position].has_key(tid):
            tid_position[position][tid] += count     
        elif tid_position.has_key(position):
            tid_position[position][tid] = count
        else:
            tid_position[position] = {}
            tid_position[position][tid] = count

        if channel_position.has_key(position) and channel_position[position].has_key(channel):
            channel_position[position][channel] += count     
        elif channel_position.has_key(position):
            channel_position[position][channel] = count
        else:
            channel_position[position] = {}
            channel_position[position][channel] = count

        if prefered == 0:
            absolute_total_true += count
            if total_true.has_key(gap):
                total_true[gap] += count
            else:
                total_true[gap] = count

            if singles_true.has_key(gap) and singles_true[gap].has_key(tid):
                singles_true[gap][tid] += count
            elif singles_true.has_key(gap):
                singles_true[gap][tid] = count
            else:
                singles_true[gap] = {}
                singles_true[gap][tid] = count

            if channels_true.has_key(gap) and channels_true[gap].has_key(channel):
               channels_true[gap][channel] += count
            elif channels_true.has_key(gap):
               channels_true[gap][channel] = count
            else:
               channels_true[gap] = {}
               channels_true[gap][channel] = count

            if total_position_true.has_key(position):
               total_position_true[position] += count
            else:
               total_position_true[position] = count 

            if tid_position_true.has_key(position) and tid_position_true[position].has_key(tid):
               tid_position_true[position][tid] += count     
            elif tid_position_true.has_key(position):
               tid_position_true[position][tid] = count
            else:
               tid_position_true[position] = {}
               tid_position_true[position][tid] = count

            if channel_position_true.has_key(position) and channel_position_true[position].has_key(channel):
               channel_position_true[position][channel] += count     
            elif channel_position_true.has_key(position):
               channel_position_true[position][channel] = count
            else:
               channel_position_true[position] = {}
               channel_position_true[position][channel] = count




    final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
                     "total_position" :  total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
    return final_values

Solution

  • The entire structure you're using to store the data is probably wrong, but since I don't know how you're using it, I can't help you with that.

    You can get rid of all of those has_key() calls by using collections.defaultdict. Note thedict.has_key(key) is deprecated anyway, you should just use key in thedict instead.

    Look at how I change the for loop too -- you can assign to names right in the for statement, no need to do it separately.

    from collections import defaultdict
    
    def processClickOutData(cls, raw_data):
        absolute_total = 0
        absolute_total_true = 0
    
        list_channels = set()
        list_tids = set()
    
        total = defaultdict(int)
        total_true = defaultdict(int)
        total_position = defaultdict(int)
        total_position_true = defaultdict(int)
    
        def defaultdict_int():
            return defaultdict(int)
    
        singles = defaultdict(defaultdict_int)
        singles_true = defaultdict(defaultdict_int)
        channels = defaultdict(defaultdict_int)
        channels_true = defaultdict(defaultdict_int)
        tid_position = defaultdict(defaultdict_int)
        tid_position_true = defaultdict(defaultdict_int)
        channel_position = defaultdict(defaultdict_int)
        channel_position_true = defaultdict(defaultdict_int)    
    
        for gap, count, prefered, channel, position in raw_data:
            list_channels.add(channel)
            list_tids.add(tid)
    
            absolute_total += count
            total[gap] += count
            singles[gap][tid] += count
            channels[gap][channel] += count
            total_position[position] += count
            tid_position[position][tid] += count
            channel_position[position][channel] += count
    
            if prefered == 0:
                absolute_total_true += count
                total_true[gap] += count
                singles_true[gap][tid] += count
                channels_true[gap][channel] += count
                total_position_true[position] += count
                tid_position_true[position][tid] += count
                channel_position_true[position][channel] += count
    
    
    
    
        final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
                         "total_position" :  total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
        return final_values
    

    What this does is automatically fill in the correct default values if the keys don't exist. You've got two kinds here. Where you're adding ints, you want to start with 0 if it doesn't exist -- that's what int returns, hence defaultdict(int). Where you're adding a dictionary that adds ints, you need to use a function that returns a defaultdict(int) which is what defaultdict_int does.

    Edit: Suggested alternate dictionary structure:

    position = defaultdict(lambda: defaultdict(defaultdict_int))
    gap = defaultdict(lambda: defaultdict(defaultdict_int))
    absolute_total = 0
    
    for gap, count, prefered, channel, position in raw_data:
        absolute_total += count
    
        posd = position[position]
        posd.setdefault('total', 0)
        posd['total'] += count
        posd['tid'][tid] += count
        posd['channel'][channel] += count
    
        gapd = gap[gap]
        gapd.setdefault('total', 0)
        gapd['total'] += count
        gapd['tid'][tid] += count
        gapd['channel'][channel] += count
    

    Do the same with the _true versions as well, and you've gone from 12 dicts to 4.