I am grabbing a lot of data from and SQL query that takes a long time to run. Since the SQL query takes so long to run, I am grabbing the data from the database in its most granular form. I then cycle through this data once and aggregate it in the forms that are useful to me.
My problem is that I am repeating myself over and over again. However, I am not sure of the best way to refactor this control flow. Thanks in advance!
def processClickOutData(cls, raw_data):
singles = {}
total={}
absolute_total = 0
channels = {}
singles_true = {}
total_true={}
channels_true = {}
absolute_total_true = 0
list_channels = set([])
list_tids = set([])
total_position = {}
total_position_true = {}
tid_position = {}
channel_position = {}
channel_position_true = {}
tid_position_true = {}
for row in raw_data:
gap=row[0]
count=row[1]
tid=row[2]
prefered=row[3]
channel=row[4]
position=row[5]
list_channels.add(channel)
list_tids.add(tid)
absolute_total += int(count)
if total.has_key(gap):
total[gap] += count
else:
total[gap] = count
if singles.has_key(gap) and singles[gap].has_key(tid):
singles[gap][tid] += count
elif singles.has_key(gap):
singles[gap][tid] = count
else:
singles[gap] = {}
singles[gap][tid] = count
if channels.has_key(gap) and channels[gap].has_key(channel):
channels[gap][channel] += count
elif channels.has_key(gap):
channels[gap][channel] = count
else:
channels[gap] = {}
channels[gap][channel] = count
if total_position.has_key(position):
total_position[position] += count
else:
total_position[position] = count
if tid_position.has_key(position) and tid_position[position].has_key(tid):
tid_position[position][tid] += count
elif tid_position.has_key(position):
tid_position[position][tid] = count
else:
tid_position[position] = {}
tid_position[position][tid] = count
if channel_position.has_key(position) and channel_position[position].has_key(channel):
channel_position[position][channel] += count
elif channel_position.has_key(position):
channel_position[position][channel] = count
else:
channel_position[position] = {}
channel_position[position][channel] = count
if prefered == 0:
absolute_total_true += count
if total_true.has_key(gap):
total_true[gap] += count
else:
total_true[gap] = count
if singles_true.has_key(gap) and singles_true[gap].has_key(tid):
singles_true[gap][tid] += count
elif singles_true.has_key(gap):
singles_true[gap][tid] = count
else:
singles_true[gap] = {}
singles_true[gap][tid] = count
if channels_true.has_key(gap) and channels_true[gap].has_key(channel):
channels_true[gap][channel] += count
elif channels_true.has_key(gap):
channels_true[gap][channel] = count
else:
channels_true[gap] = {}
channels_true[gap][channel] = count
if total_position_true.has_key(position):
total_position_true[position] += count
else:
total_position_true[position] = count
if tid_position_true.has_key(position) and tid_position_true[position].has_key(tid):
tid_position_true[position][tid] += count
elif tid_position_true.has_key(position):
tid_position_true[position][tid] = count
else:
tid_position_true[position] = {}
tid_position_true[position][tid] = count
if channel_position_true.has_key(position) and channel_position_true[position].has_key(channel):
channel_position_true[position][channel] += count
elif channel_position_true.has_key(position):
channel_position_true[position][channel] = count
else:
channel_position_true[position] = {}
channel_position_true[position][channel] = count
final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
"total_position" : total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
return final_values
The entire structure you're using to store the data is probably wrong, but since I don't know how you're using it, I can't help you with that.
You can get rid of all of those has_key()
calls by using collections.defaultdict
. Note thedict.has_key(key)
is deprecated anyway, you should just use key in thedict
instead.
Look at how I change the for
loop too -- you can assign to names right in the for
statement, no need to do it separately.
from collections import defaultdict
def processClickOutData(cls, raw_data):
absolute_total = 0
absolute_total_true = 0
list_channels = set()
list_tids = set()
total = defaultdict(int)
total_true = defaultdict(int)
total_position = defaultdict(int)
total_position_true = defaultdict(int)
def defaultdict_int():
return defaultdict(int)
singles = defaultdict(defaultdict_int)
singles_true = defaultdict(defaultdict_int)
channels = defaultdict(defaultdict_int)
channels_true = defaultdict(defaultdict_int)
tid_position = defaultdict(defaultdict_int)
tid_position_true = defaultdict(defaultdict_int)
channel_position = defaultdict(defaultdict_int)
channel_position_true = defaultdict(defaultdict_int)
for gap, count, prefered, channel, position in raw_data:
list_channels.add(channel)
list_tids.add(tid)
absolute_total += count
total[gap] += count
singles[gap][tid] += count
channels[gap][channel] += count
total_position[position] += count
tid_position[position][tid] += count
channel_position[position][channel] += count
if prefered == 0:
absolute_total_true += count
total_true[gap] += count
singles_true[gap][tid] += count
channels_true[gap][channel] += count
total_position_true[position] += count
tid_position_true[position][tid] += count
channel_position_true[position][channel] += count
final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
"total_position" : total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
return final_values
What this does is automatically fill in the correct default values if the keys don't exist. You've got two kinds here. Where you're adding int
s, you want to start with 0
if it doesn't exist -- that's what int
returns, hence defaultdict(int)
. Where you're adding a dictionary that adds int
s, you need to use a function that returns a defaultdict(int)
which is what defaultdict_int
does.
Edit: Suggested alternate dictionary structure:
position = defaultdict(lambda: defaultdict(defaultdict_int))
gap = defaultdict(lambda: defaultdict(defaultdict_int))
absolute_total = 0
for gap, count, prefered, channel, position in raw_data:
absolute_total += count
posd = position[position]
posd.setdefault('total', 0)
posd['total'] += count
posd['tid'][tid] += count
posd['channel'][channel] += count
gapd = gap[gap]
gapd.setdefault('total', 0)
gapd['total'] += count
gapd['tid'][tid] += count
gapd['channel'][channel] += count
Do the same with the _true
versions as well, and you've gone from 12 dict
s to 4.