Search code examples
Tags: python · pandas · vectorization

Why isn't vectorization making this faster?


Below I have two versions of code to get to the same output - I am running through a list of possible number combinations, finding which filters would make each combination true, and then keeping only unique filters (e.g., filter (5,6) (5,6) (5,6) is not unique if it 'passes' all the same combinations as filter (5,7) (5,7) (5,7)).

I was reading that vectorization can speed up processes, so I've tried that below, after my simple for loop. The vectorization is much slower. So much slower, I must be doing something wrong (but still getting the same correct result).

Any help would be appreciated. (Also, not sure why finding unique values after the vectorization took so long in the output below)

#########################################
### Compare for loop to vectorization ###
#########################################
import pandas as pd
import itertools
import numpy as np
import time

# Number of candidate filter rows to generate.
size_value = 8

# One lower/upper limit pair per variable; lower limits are drawn from
# [0, 5) and upper limits from [5, 10), so every row is a valid filter
# (lower <= upper).  The randint calls run in the same column order as
# before, so the random stream is consumed identically.
_limit_columns = {}
for _var in ('var_1', 'var_2', 'var_3'):
    _limit_columns[f'{_var}_lower_limit'] = np.random.randint(0, 5, size=size_value)
    _limit_columns[f'{_var}_upper_limit'] = np.random.randint(5, 10, size=size_value)
df = pd.DataFrame(_limit_columns)

# Sorting up front makes the later drop_duplicates(keep='first') pick a
# deterministic "first" filter row.
df.sort_values(list(_limit_columns), ascending=True, inplace=True)
display(df)


########## for loop
checkpoint_01 = time.perf_counter()
print('Beginning for loop run...')
default_possibilities = [0,1,2,3,4,5,6,7,8,9]
# All 10**6 ordered 6-tuples of single digits; repeat=6 replaces passing the
# same iterable six times.
possibilities = list(itertools.product(default_possibilities, repeat=6)) #Creates all combinations of possibilities
print('length of possibilities',len(possibilities))
# Drop tuples where any (lower, upper) pair is out of order, e.g. first value 5 and second value 3.
possibilities = [item for item in possibilities if item[1] >= item[0] and item[3] >= item[2] and item[5] >= item[4]]
print('length of revised possibilities',len(possibilities))

filtering_output = []
checkpoint_02 = time.perf_counter()
for i in possibilities:
  # Tuple unpacking replaces six separate index assignments.
  a, b, c, d, e, f = i
  for z in df.itertuples():
    # A filter row "passes" a combination when each limit pair brackets it.
    # (The original `else: continue` branch was dead code and is removed.)
    if (z.var_1_lower_limit <= a and z.var_1_upper_limit >= b and
        z.var_2_lower_limit <= c and z.var_2_upper_limit >= d and
        z.var_3_lower_limit <= e and z.var_3_upper_limit >= f):
      filtering_output.append({'combination':i,'var_1_lower_limit':z.var_1_lower_limit,'var_1_upper_limit':z.var_1_upper_limit, 'var_2_lower_limit':z.var_2_lower_limit,'var_2_upper_limit':z.var_2_upper_limit,'var_3_lower_limit':z.var_3_lower_limit,'var_3_upper_limit':z.var_3_upper_limit,'pass_or_fail':'YES'})

filtering_output_df = pd.DataFrame(filtering_output)
checkpoint_03 = time.perf_counter()
print('Iteration loop time was',(checkpoint_03-checkpoint_02)/60,'minutes')
# keep='first' keeps, per combination, the first filter row that passed it.
unique_combinations = filtering_output_df.drop_duplicates(subset=['combination'], keep='first')
final_filters = unique_combinations.drop_duplicates(subset=['var_1_lower_limit','var_1_upper_limit','var_2_lower_limit','var_2_upper_limit','var_3_lower_limit','var_3_upper_limit'], keep='first') # since df was sorted, keeping only filters that had a passing combination
checkpoint_04 = time.perf_counter()
print('Done finding unique',(checkpoint_04-checkpoint_03)/60,'minutes')
display(final_filters)
checkpoint_05 = time.perf_counter()
print('Total time for loop',(checkpoint_05-checkpoint_01)/60,'minutes')

######### vectorization
checkpoint_06 = time.perf_counter()
print('Starting vectoriation...')
default_possibilities_new = np.array(list(itertools.product(range(10), repeat=6)))
print('Length of default possibilities',len(default_possibilities_new))

# Validity mask over ALL combinations: each (lower, upper) pair must be ordered.
mask_new = (default_possibilities_new[:, 1] >= default_possibilities_new[:, 0]) & \
       (default_possibilities_new[:, 3] >= default_possibilities_new[:, 2]) & \
       (default_possibilities_new[:, 5] >= default_possibilities_new[:, 4])

possibilities_new = default_possibilities_new[mask_new]
print('length of revised possibilties',len(possibilities_new))

filtering_output_new = []
checkpoint_07 = time.perf_counter()
for i in possibilities_new:
    # BUG FIX: the original reused the name `mask_new` here, clobbering the
    # possibilities mask above (which code further down still references).
    # A distinct name keeps `mask_new` intact.
    row_mask = (
            (df['var_1_lower_limit'] <= i[0]) & (df['var_1_upper_limit'] >= i[1]) &
            (df['var_2_lower_limit'] <= i[2]) & (df['var_2_upper_limit'] >= i[3]) &
            (df['var_3_lower_limit'] <= i[4]) & (df['var_3_upper_limit'] >= i[5])
    )

    if row_mask.any():
        # df is sorted, so iloc[0] is the deterministic "first" passing filter.
        row = df[row_mask].iloc[0]
        filtering_output_new.append({
            'combination': i,
            'var_1_lower_limit': row['var_1_lower_limit'],
            'var_1_upper_limit': row['var_1_upper_limit'],
            'var_2_lower_limit': row['var_2_lower_limit'],
            'var_2_upper_limit': row['var_2_upper_limit'],
            'var_3_lower_limit': row['var_3_lower_limit'],
            'var_3_upper_limit': row['var_3_upper_limit'],
            'pass_or_fail': 'YES'
        })

filtering_output_df_new = pd.DataFrame(filtering_output_new)
checkpoint_08 = time.perf_counter()
print('Done with vectorization',(checkpoint_08-checkpoint_06)/60,'minutes')
unique_combinations_new = filtering_output_df_new.drop_duplicates(subset=['combination'], keep='first')
final_filters_new = unique_combinations_new.drop_duplicates(subset=['var_1_lower_limit', 'var_1_upper_limit', 'var_2_lower_limit', 'var_2_upper_limit', 'var_3_lower_limit','var_3_upper_limit'], keep='first')
checkpoint_09 = time.perf_counter()
print('Done finding unique',(checkpoint_09-checkpoint_08)/60,'minutes')
display(final_filters_new)

checkpoint_10 = time.perf_counter()
print("Vectorization done in", (checkpoint_10 - checkpoint_06)/60,'minutes')

Output: (screenshot of the timing output — image not reproduced here)


Solution

  • Improved the 'Done with vectorization' time by removing the Python loop. The frame data is converted to a NumPy array, and mask_faster is built as a 2-D boolean array in which each column is a mask for selecting rows from the dataframe — the same test your loop performed one combination at a time.

    bl marks the columns where at least one entry is True; index records their positions, which are used to retrieve the matching combination rows. mask_faster is then narrowed to those columns (mask_faster[:, bl]). Finally, new_df fills in the required values. The result is 60–70 times faster.

    import numpy as np
    import pandas as pd
    import time

    # BUG FIX: recompute the validity mask locally instead of reusing
    # `mask_new` — by this point the question's loop has overwritten
    # `mask_new` with a per-row boolean Series over df (length 8), so
    # `default_possibilities_new[mask_new]` would fail or select wrong rows.
    ordered_mask = (
        (default_possibilities_new[:, 1] >= default_possibilities_new[:, 0]) &
        (default_possibilities_new[:, 3] >= default_possibilities_new[:, 2]) &
        (default_possibilities_new[:, 5] >= default_possibilities_new[:, 4])
    )
    dpn = default_possibilities_new[ordered_mask]

    checkpoint_mask_faster = time.perf_counter()

    # Broadcast df's limit columns (rows) against every combination (columns):
    # mask_faster[r, c] is True when filter row r passes combination c — the
    # same comparison the loop ran one combination at a time.
    mask_faster = (
                (df['var_1_lower_limit'].values[:, np.newaxis] <= dpn[:, 0]) &
                (df['var_1_upper_limit'].values[:, np.newaxis] >= dpn[:, 1]) &
                (df['var_2_lower_limit'].values[:, np.newaxis] <= dpn[:, 2]) &
                (df['var_2_upper_limit'].values[:, np.newaxis] >= dpn[:, 3]) &
                (df['var_3_lower_limit'].values[:, np.newaxis] <= dpn[:, 4]) &
                (df['var_3_upper_limit'].values[:, np.newaxis] >= dpn[:, 5])
        )

    bl = np.any(mask_faster, axis=0)   # combinations passed by >= 1 filter row
    index = np.where(bl)[0]            # their positions within dpn
    arr = df.values
    mask_faster = mask_faster[:, bl]   # keep only passing columns

    # For each surviving combination take the FIRST matching df row (df is
    # sorted, mirroring drop_duplicates keep='first').  The masked lookup is
    # hoisted so arr is filtered once per column instead of six times.
    records = []
    for i in range(mask_faster.shape[1]):
        first_row = arr[mask_faster[:, i]][0]
        records.append({'combination': dpn[index[i]],
                        'var_1_lower_limit': first_row[0],
                        'var_1_upper_limit': first_row[1],
                        'var_2_lower_limit': first_row[2],
                        'var_2_upper_limit': first_row[3],
                        'var_3_lower_limit': first_row[4],
                        'var_3_upper_limit': first_row[5],
                        'pass_or_fail': 'YES'})
    new_df = pd.DataFrame(records)

    print('Done with mask_faster',(time.perf_counter()-checkpoint_mask_faster)/60,'minutes')
    

    You can also check your dataframe result with mine:

    print('comparison of dataframes', new_df.equals(filtering_output_df_new))
    

    The slowest part is unique_combinations_new, I think because there is a list in each line. I tried it using np.unique and I’m not exactly sure if it works the same as drop_duplicates, but it works very quickly. You need to check this:

    # return_index=True yields the first-occurrence position of each distinct
    # combination row.  NOTE(review): np.unique returns rows in sorted-unique
    # order, whereas drop_duplicates(keep='first') preserves original order —
    # the resulting row ORDER may differ even if the row SET matches; verify.
    ind_uniq = np.unique(dpn[index], axis=0, return_index=True)[1]
    
    # Positional labels line up because new_df was built with a default
    # RangeIndex in the same order as the mask columns.
    unique_combinations_new = new_df.loc[ind_uniq, :]