I want to load multiple CSV files and run the same set of tests with pytest for each of them individually. Can anyone show me how to do this?
Many thanks in advance!
I tried to use a for loop, but that didn't work as intended.
import pandas as pd
# CSV files that should each be run through the same set of checks.
file_names = [
    "file1.csv",
    "file2.csv",
]
class TestCSV:
    """Hold one loaded CSV table and run data-quality checks against it.

    NOTE: pytest does not collect test classes that define ``__init__``,
    so instances of this class have to be created and their checks
    invoked by hand.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        """Store the table to be checked.

        Args:
            data: The DataFrame whose columns will be validated.
        """
        # Keep the DataFrame that was passed in, not the DataFrame class.
        self.data = data

    def test_na(self) -> None:
        """Assert that no column of the stored table contains missing values.

        Raises:
            AssertionError: If any column has at least one NA entry.
        """
        for col in self.data.columns:
            assert self.data[col].isna().sum() == 0
# Wrap every listed CSV in its own TestCSV instance. Nothing in this loop
# calls test_na, so the loop itself performs no checks.
for csv_name in file_names:
    tests = TestCSV(pd.read_csv(csv_name))
It sounds like you want each column (data[col]) of each file
to be a separate test case. If so, here is one way to approach this test:
#!/usr/bin/env python3
import pandas as pd
import pytest
def generate_data(file_names):
    """Yield one ``pytest.param`` per column of every CSV in *file_names*.

    Each file is read with ``pd.read_csv`` when the generator reaches it.
    Every yielded parameter wraps a single column and carries an id that
    names the source file and the column's position in that file.
    """
    for csv_name in file_names:
        frame = pd.read_csv(csv_name)
        yield from (
            pytest.param(frame[label], id=f"file: {csv_name}, col: {position}")
            for position, label in enumerate(frame.columns)
        )
@pytest.mark.parametrize("data_col", generate_data(["file1.csv", "file2.csv"]))
def test_na(data_col):
    """Check that the given column contains no missing values."""
    missing_count = data_col.isna().sum()
    assert missing_count == 0
Output:
$ pytest -v
...
test_na.py::test_na[file: file1.csv, col: 0] PASSED
test_na.py::test_na[file: file1.csv, col: 1] PASSED
test_na.py::test_na[file: file2.csv, col: 0] PASSED
test_na.py::test_na[file: file2.csv, col: 1] PASSED
Notes
- The generate_data() function generates one test case for every column in every file.
- pytest.param() provides a test case ID, which is useful for identifying which file and which column failed.
- The enumerate() function supplies the column number used in that ID.