How to implement dependent columns in Hypothesis dataframes

I am using Hypothesis dataframes to build a dataframe in which start_time and end_time are two columns. Here is a chunk:

import hypothesis.strategies as st
import logging
import datetime

from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes

# Midnight (local time) of the day on which this module is imported.
current_time = datetime.datetime.now().replace(
    hour=0, minute=0, second=0, microsecond=0
)

# Integer epoch timestamps between 04:00 and 20:00 of that day.
_window_opens = current_time + datetime.timedelta(hours=4)
_window_closes = current_time + datetime.timedelta(hours=20)
datetime_st = st.integers(
    min_value=_window_opens.timestamp(),
    max_value=_window_closes.timestamp(),
)

# Keyword arguments passed to column() for each column, keyed by column name.
df_columns = {
    # other fields omitted
    "start_time": dict(elements=datetime_st, unique=False),
    "end_time": dict(elements=datetime_st, unique=False),
}

# Data frames with 20-100 rows. Every column gets its own elements strategy,
# so nothing here relates start_time to end_time within a row.
test_dfs = data_frames(
    columns=[column(name, **spec) for name, spec in df_columns.items()],
    index=range_indexes(min_size=20, max_size=100),
)

@given(df=test_dfs)
def test_hyothesis(df):
    """Log each generated data frame; the assertion is only a placeholder."""
    logging.info(df)
    assert True

I am not able to find a way to enforce that each end_time is greater than its corresponding start_time by at least some delta. I have tried composite, but I am not sure how to apply it to each row of the dataframe.

Is there a way to enforce the delta as a rule when initialising start_time and end_time?

Solution

Here's a way to generate a dataframe with two timestamp columns, where the second one exceeds the first one by at least 3600 seconds (or some other amount of time). I'm using the strategy's .flatmap() method for that.

import datetime
import logging

import hypothesis.strategies as st
import pandas as pd
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes, columns

# Midnight (local time) of the current day as an epoch timestamp. timestamp()
# returns a float; converting it once here gives st.integers real integer
# bounds (midnight carries no fractional seconds, so nothing is lost).
current_time = int(
    datetime.datetime.now()
    .replace(hour=0, minute=0, second=0, microsecond=0)
    .timestamp()
)

# Minimum gap, in seconds, between start_time and end_time.
# Must not exceed the 10-hour cap on end_time used below.
MIN_DIFF_SECONDS = 3600

_SECONDS_PER_HOUR = 3600

# Strategy producing (start_time, end_time) tuples. start_time is drawn first;
# flatmap then builds the end_time strategy from that concrete value, which is
# what makes the second element depend on the first.
two_timestamps_with_diff = st.integers(
    min_value=current_time + 4 * _SECONDS_PER_HOUR,  # 04:00
    max_value=current_time + 20 * _SECONDS_PER_HOUR,  # 20:00
).flatmap(
    lambda start: st.tuples(
        # The already-drawn start value, passed through unchanged.
        st.just(start),
        # end_time lies between start + MIN_DIFF_SECONDS and start + 10 hours.
        st.integers(
            min_value=start + MIN_DIFF_SECONDS,
            max_value=start + 10 * _SECONDS_PER_HOUR,
        ),
    )
)

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y-x)
    
# Data frames with 20-100 rows. Each row is one (start_time, end_time) tuple
# drawn from two_timestamps_with_diff, so both values of a row are generated
# together rather than column by column.
test_dfs = data_frames(
    rows=two_timestamps_with_diff,
    columns=columns(["start_time", "end_time"], dtype=int),
    index=range_indexes(min_size=20, max_size=100),
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d = res.end_time - res.start_time)

# a test with an assertion that validates this constraint. 
@given(df=test_dfs)
def test_hyothesis(df):
    """Check that end_time - start_time >= MIN_DIFF_SECONDS in every row."""
    logging.info(df)
    gaps = df["end_time"] - df["start_time"]
    assert (gaps >= MIN_DIFF_SECONDS).all()
    
# Run the test directly: calling the @given-decorated function executes it
# against generated data frames. It passes.
test_hyothesis()

If you'd like to add additional columns to the autogenerated dataframe, do the following (the new columns are 'A' and 'B' in this example):

from hypothesis.strategies import composite

@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    """Draw a data frame from ``elements`` and append two extra columns.

    Args:
        draw: Supplied by ``@composite``; draws a value from a strategy.
        elements: Strategy producing the base data frame. Defaults to
            ``test_dfs``.

    Returns:
        The base data frame with an int column ``A`` and a float column
        ``B`` concatenated to its right, built on the base frame's index.
    """
    # Draw from the strategy the caller passed in, so `elements` is honoured
    # instead of always falling back to the module-level test_dfs.
    df = draw(elements)

    # st.just pins the extra columns to the index already drawn for df, so
    # the concatenation below lines up row for row.
    more_col_strategy = data_frames(
        [column("A", dtype=int), column("B", dtype=float)],
        index=st.just(df.index),
    )
    more_cols = draw(more_col_strategy)

    return pd.concat([df, more_cols], axis=1)

# Draw one sample data frame to inspect the strategy's output.
# NOTE(review): .example() is meant for interactive exploration, not for use
# inside tests — confirm against the Hypothesis docs.
test_df_with_additional_columns().example()