Search code examples
python pandas method-chaining

copy a dataframe to new variable with method chaining


Is it possible to copy a dataframe in the middle of a method chain to a new variable? Something like:

import pandas as pd

# Build a small frame, transform it, and (hypothetically) capture the intermediate result mid-chain.
df = (pd.DataFrame([[2, 4, 6],
                    [8, 10, 12],
                    [14, 16, 18],
                    ])
      .assign(something_else=100)
      .div(2)
      .copy_to_new_variable(df_imag)  # Imaginary method that would copy the frame at this point to df_imag.
      .div(10)
      )

print(df_imag) would then print:

    0   1   2   something_else
0   1.0 2.0 3.0 50.0
1   4.0 5.0 6.0 50.0
2   7.0 8.0 9.0 50.0

.copy_to_new_variable(df_imag) could be replaced by a separate statement, df_imag = df.copy(), but that would break the method chain.


Solution

  • Creating variables dynamically is not a good idea, but you can easily store the intermediate result in a mutable object such as a dictionary.

    Adding a new DataFrame method to do this seamlessly:

    from pandas.core.base import PandasObject
    
    ### this only needs to be done once per session
    def to_name(df, dic, name, copy=False):
        """Store ``df`` in ``dic`` under ``name`` and return ``df`` for chaining.

        Parameters
        ----------
        df : object being passed along the chain (a DataFrame in the examples).
        dic : mutable mapping that receives the stored object.
        name : key under which the object is stored.
        copy : when truthy, ``df.copy()`` is stored instead of ``df`` itself.

        Returns
        -------
        The same ``df`` that was passed in, so the chain can continue.
        """
        if copy:
            dic[name] = df.copy()
        else:
            dic[name] = df
        return df
        
    # Attach the helper as a method so it can be called as .to_name(...) inside a method chain.
    PandasObject.to_name = to_name
    ###
    
    # Plain dict that receives intermediate frames captured in the middle of a chain.
    tmp = {}
    
    df = (pd.DataFrame([[2, 4, 6],
                        [8, 10, 12],
                        [14, 16, 18],
                        ])
          .assign(something_else=100)
          .div(2)
          # Store a copy of the halved frame under tmp['after_div2']; the chain then continues.
          .to_name(tmp, 'after_div2', copy=True)
          .div(10)
          )
    
    # Intermediate frame captured right after .div(2).
    print(tmp['after_div2'])
    
    # Final result of the whole chain.
    print(df)
    

    Output:

    # tmp['after_div2']
         0    1    2  something_else
    0  1.0  2.0  3.0            50.0
    1  4.0  5.0  6.0            50.0
    2  7.0  8.0  9.0            50.0
    
    # df
         0    1    2  something_else
    0  0.1  0.2  0.3             5.0
    1  0.4  0.5  0.6             5.0
    2  0.7  0.8  0.9             5.0
    

    If you don't want to monkey patch the DataFrame objects, use pipe:

    def to_name(df, dic, name, copy=False):
        """Store ``df`` in ``dic[name]`` and hand ``df`` back unchanged.

        Intended to be passed to ``.pipe``: the chained object arrives as
        ``df`` and the remaining arguments follow it.

        Parameters
        ----------
        df : object flowing through the chain.
        dic : mutable mapping used as the storage target.
        name : key to store the object under.
        copy : when truthy, store ``df.copy()`` rather than ``df`` itself.

        Returns
        -------
        The ``df`` argument itself.
        """
        if copy:
            dic[name] = df.copy()
        else:
            dic[name] = df
        return df
    
    # Plain dict that receives intermediate frames captured in the middle of a chain.
    tmp = {}
    
    df = (pd.DataFrame([[2, 4, 6],
                        [8, 10, 12],
                        [14, 16, 18],
                        ])
          .assign(something_else=100)
          .div(2)
          # pipe passes the frame as to_name's first argument; copy keeps its default (False).
          .pipe(to_name, tmp, 'after_div2')
          .div(10)
          # print() returns None, so `or df` hands the frame back to the chain.
          .pipe(lambda df: print('\nQuick alternative:', df, sep='\n') or df)
          )
    
    # Intermediate frame captured right after .div(2).
    print(tmp['after_div2'])
    

    printing

    Along the same lines, you can also add a chainable print method, or again use a lambda in pipe:

    from pandas.core.base import PandasObject
    
    ### this only needs to be done once per session
    def df_print(df, *args):
        """Print ``df`` (preceded by an optional header) and return it for chaining.

        Parameters
        ----------
        df : object to print and pass along.
        *args : optional values printed on their own line before ``df``.
            When none are given, no header line is emitted.

        Returns
        -------
        The same ``df`` that was passed in.
        """
        if not args:
            # No header requested: print the object only.
            print(df)
            return df
        print(*args)
        print(df)
        return df
        
    # Attach the helper as a method so it can be called as .print(...) inside a method chain.
    PandasObject.print = df_print
    ###
    
    df = (pd.DataFrame([[2, 4, 6],
                        [8, 10, 12],
                        [14, 16, 18],
                        ])
          # Show the initial frame, with no header line.
          .print()
          .assign(something_else=100)
          .div(2)
          # Show the halved frame, preceded by a header.
          .print('\nAfter 2:')
          .div(10)
          # print() returns None, so `or df` hands the frame back to the chain.
          .pipe(lambda df: print('\nQuick alternative:', df, sep='\n') or df)
          )
    

    Output:

        0   1   2
    0   2   4   6
    1   8  10  12
    2  14  16  18
    
    After 2:
         0    1    2  something_else
    0  1.0  2.0  3.0            50.0
    1  4.0  5.0  6.0            50.0
    2  7.0  8.0  9.0            50.0
    
    Quick alternative:
         0    1    2  something_else
    0  0.1  0.2  0.3             5.0
    1  0.4  0.5  0.6             5.0
    2  0.7  0.8  0.9             5.0
    

    As a module

    You could also create a module:

    pandas_debug.py

    from pandas.core.base import PandasObject
    
    def df_print(df, *args):
        """Print an optional header followed by ``df``; return ``df`` unchanged.

        Parameters
        ----------
        df : object to print and pass along the chain.
        *args : optional values printed together on one line before ``df``.
            Skipped entirely when empty.

        Returns
        -------
        The ``df`` argument itself.
        """
        if not args:
            # No header requested: print the object only.
            print(df)
            return df
        print(*args)
        print(df)
        return df
        
    # Runs at import time: attaches df_print as a .print(...) method.
    PandasObject.print = df_print
    
    def to_name(df, dic, name, copy=False):
        """Record ``df`` in ``dic`` under ``name`` and return ``df`` for chaining.

        Parameters
        ----------
        df : object flowing through the chain.
        dic : mutable mapping that receives the stored object.
        name : key to store the object under.
        copy : when truthy, store ``df.copy()`` instead of ``df`` itself.

        Returns
        -------
        The same ``df`` that was passed in.
        """
        if copy:
            dic[name] = df.copy()
        else:
            dic[name] = df
        return df
    
    # Runs at import time: attaches to_name as a .to_name(...) method.
    PandasObject.to_name = to_name
    

    Then in your code:

    import pandas as pd
    import pandas_debug
    
    # Plain dict that receives intermediate frames captured in the middle of a chain.
    tmp = {}
    df = (pd.DataFrame([[2, 4, 6],
                        [8, 10, 12],
                        [14, 16, 18],
                        ])
          .assign(something_else=100)
          .div(2)
          # Store the halved frame under tmp['after_div2'] (no copy requested); the chain continues.
          .to_name(tmp, 'after_div2')
          .div(10)
          # Print the final frame and keep it as the chain's result.
          .print()
          )