Search code examples
pythonpandasdatetimexlrd

How to apply python code for all the cells in the column?


Currently I'm running this below code for single record but I want run this code for all the cells in the column. Please help on this request.

from datetime import datetime
import xlrd

Date= 44774.45833333
New_date = datetime (*xlrd.xldate_as_tuple(excel_date,0))
print(New_date)

#rsult is 2022-08-01 11:00:00

Now I have a column called RETA where I want to apply this code to all the cells in this column and change the numeric format to datetime format.

sample data:

Theater |         RETA                 TPM SLACycletime
----------------------------------------------------------
US      |   2022-09-22 15:33:00    | Invalid Data
US      |   44774.45833            | 558:19:30
US      |   2022-09-2022 18:03:00  | 111:44:26
US      |                          | 15:44:26
US      |   1/8/2022  10:00:00 AM  | Invalid Data

Solution

  • As you want a datetime, use xlrd.xldate_as_datetime directly.

    The easy (but non-vectorized) answer is:

    df = pd.DataFrame({'RETA': [None, 0, 1, 10, 61, 10000, 44774.45833333, 44774.5]})
    
    df['date'] = df['RETA'].apply(lambda date: xlrd.xldate_as_datetime(date, 0)
                                               if pd.notna(date) else date)
    

    Output:

               RETA                date
    0           NaN                 NaT
    1      0.000000 1899-12-31 00:00:00
    2      1.000000 1900-01-01 00:00:00
    3     10.000000 1900-01-10 00:00:00
    4     60.000000 1900-02-28 00:00:00
    5     61.000000 1900-03-01 00:00:00
    6  10000.000000 1927-05-18 00:00:00
    7  44774.458333 2022-08-01 11:00:00
    8  44774.510000 2022-08-01 12:14:24
    

    For a vectorized version, let's rewite xlrd.xldate_as_datetime's code to work on a Series:

    def vectorized_xldate(date, mode=0):
        epoch = pd.Timestamp('1970-01-01')
        if mode:
            epoch -= pd.Timestamp('1904-01-01')
        else:
            epoch = (pd.Series(pd.Timestamp('1899-12-31'), index=date.index)
                       .where(date<60, pd.Timestamp('1899-12-30'))
                       .rsub(epoch)
                    )
        return (pd.to_datetime(date.mul(86400).round(3), unit='s')
                  .sub(epoch).round('us')
                )
    
    df['date_vectorized'] = vectorized_xldate(df['RETA'], mode=0)
    
    # or
    # df['date_vectorized'] = vectorized_xldate(df['RETA'])
    

    Example with the two modes:

    # mode=0
                RETA              date_mode0   date_mode0_vectorized
    0            NaN                     NaT                     NaT
    1       0.000000 1899-12-31 00:00:00.000 1899-12-31 00:00:00.000
    2       1.000000 1900-01-01 00:00:00.000 1900-01-01 00:00:00.000
    3      10.000000 1900-01-10 00:00:00.000 1900-01-10 00:00:00.000
    4      59.000000 1900-02-28 00:00:00.000 1900-02-28 00:00:00.000
    5      59.999999 1900-02-28 23:59:59.914 1900-02-28 23:59:59.914
    6      60.000000 1900-02-28 00:00:00.000 1900-02-28 00:00:00.000
    7      61.000000 1900-03-01 00:00:00.000 1900-03-01 00:00:00.000
    8   10000.000000 1927-05-18 00:00:00.000 1927-05-18 00:00:00.000
    9   44774.458333 2022-08-01 11:00:00.000 2022-08-01 11:00:00.000
    10  44774.123457 2022-08-01 02:57:46.667 2022-08-01 02:57:46.667
    
    # mode=1
                RETA              date_mode1   date_mode1_vectorized
    0            NaN                     NaT                     NaT
    1       0.000000 1904-01-01 00:00:00.000 1904-01-01 00:00:00.000
    2       1.000000 1904-01-02 00:00:00.000 1904-01-02 00:00:00.000
    3      10.000000 1904-01-11 00:00:00.000 1904-01-11 00:00:00.000
    4      59.000000 1904-02-29 00:00:00.000 1904-02-29 00:00:00.000
    5      59.999999 1904-02-29 23:59:59.914 1904-02-29 23:59:59.914
    6      60.000000 1904-03-01 00:00:00.000 1904-03-01 00:00:00.000
    7      61.000000 1904-03-02 00:00:00.000 1904-03-02 00:00:00.000
    8   10000.000000 1931-05-19 00:00:00.000 1931-05-19 00:00:00.000
    9   44774.458333 2026-08-02 11:00:00.000 2026-08-02 11:00:00.000
    10  44774.123457 2026-08-02 02:57:46.667 2026-08-02 02:57:46.667
    

    handling mixed formats

    def vectorized_xldate(date, mode=0, errors='raise'):
        epoch = pd.Timestamp('1970-01-01')
        if errors == 'coerce':
            dt = pd.to_datetime(date, format='mixed', errors='coerce')
            date = pd.to_numeric(date, errors='coerce')
        else:
            dt = float('nan')
    
        if mode:
            epoch -= pd.Timestamp('1904-01-01')
        else:
            epoch = (pd.Series(pd.Timestamp('1899-12-31'), index=date.index)
                       .where(date<60, pd.Timestamp('1899-12-30'))
                       .rsub(epoch)
                    )
        return (pd.to_datetime(date.mul(86400).round(3), unit='s')
                  .sub(epoch).round('us')
                  .fillna(dt)
                )
    
    df['date'] = vectorized_xldate(df['RETA'], errors='coerce').dt.round('S')
    

    Output:

      Theater                   RETA TPM SLACycletime                date
    0      US    2022-09-22 15:33:00     Invalid Data 2022-09-22 15:33:00
    1      US            44774.45833        558:19:30 2022-08-01 11:00:00
    2      US  2022-09-2022 18:03:00        111:44:26                 NaT
    3      US               15:44:26             None 2023-08-04 15:44:26
    4      US   1/8/2022 10:00:00 AM     Invalid Data 2022-01-08 10:00:00