I have a dataframe containing some coded answers to a questionnaire.
Each row is a respondent and each column is a question. Not every person responded to the same questions because of skip logic in the questionnaire; therefore I have a sparse dataframe containing NaN values.
import pandas as pd
import numpy as np

# Create a reproducible example: 10 respondents (rows) x 3 questions (columns),
# answers coded as integers in [0, 5).
prng = np.random.RandomState(42)
df = pd.DataFrame(prng.randint(0, 5, size=(10, 3)), columns=['Q1', 'Q2', 'Q3'])
# Simulate skip logic by blanking out ~30% of the answers.
# NOTE: draw the mask from the seeded generator too — the original used the
# unseeded global np.random here, so the NaN pattern changed on every run
# despite seeding prng (the two answers below show different dataframes).
df = df.mask(prng.random(df.shape) < .3)
print(df)
df
Q1 Q2 Q3
0 3.0 4.0 2.0
1 4.0 4.0 NaN
2 2.0 NaN 2.0
3 4.0 NaN 2.0
4 4.0 NaN 3.0
5 NaN 3.0 4.0
6 0.0 NaN NaN
7 4.0 3.0 NaN
8 0.0 NaN 2.0
9 NaN 3.0 3.0
The dataframe is quite big — around 40K rows and 700 columns. Is there an efficient way to find the group(s) of at least n questions that were answered by at least x respondents?
Basic attempt:
import itertools  # needed for combinations(); the original snippet never imported it

# Brute force: for every subset of questions, collect the respondents who
# answered ALL questions in the subset.  This enumerates 2**n_columns subsets,
# fine for 3 questions but infeasible for 700.
list_answers = []
for size in range(len(df.columns) + 1):
    for subset in itertools.combinations(df.columns, size):
        # Rows where every question in the subset is non-NaN (answered).
        answered = df.index[df[list(subset)].notna().all(axis=1)].to_list()
        list_answers.append([subset, answered])
# Index 0 is the empty subset (matches every respondent); display the rest.
list_answers[1:]
[[('Q1',), [0, 1, 2, 3, 4, 6, 7, 8]],
[('Q2',), [0, 1, 5, 7, 9]],
[('Q3',), [0, 2, 3, 4, 5, 8, 9]],
[('Q1', 'Q2'), [0, 1, 7]],
[('Q1', 'Q3'), [0, 2, 3, 4, 8]],
[('Q2', 'Q3'), [0, 5, 9]],
[('Q1', 'Q2', 'Q3'), [0]]]
min_num_questions = 2
min_num_respondents = 3
# Keep only the subsets (skipping the empty one at index 0) that contain
# enough questions AND were fully answered by enough respondents.
group_questions = [
    [questions, respondents]
    for questions, respondents in list_answers[1:]
    if len(questions) >= min_num_questions and len(respondents) >= min_num_respondents
]
group_questions
[[('Q1', 'Q2'), [0, 1, 7]],
[('Q1', 'Q3'), [0, 2, 3, 4, 8]],
[('Q2', 'Q3'), [0, 5, 9]]]
If you would like to keep the information of which questions were answered by each person, one possible approach is to first drop all columns (= questions) that do not meet the criteria. In a second step you can store the answered questions in a separate result column and group by it.
import pandas as pd
import numpy as np

# Recreate the example dataframe: 10 respondents x 3 coded questions.
prng = np.random.RandomState(42)
df = pd.DataFrame(prng.randint(0, 5, size=(10, 3)), columns=['Q1', 'Q2', 'Q3'])
# Add NaN; use the seeded generator so the example is reproducible
# (the original drew the mask from the unseeded global np.random).
df = df.mask(prng.random(df.shape) < .3)

min_num_questions = 2
min_num_respondents = 3

# Drop questions that were NOT answered by at least x respondents
# (count(axis=0) counts the non-NaN cells per column).
dfResult = df.drop(df.columns[df.count(axis=0) < min_num_respondents], axis=1)

# For each respondent who answered at least n of the remaining questions,
# store the tuple of answered question names; otherwise NaN.
# Fixed: the original used a bare `nan`, which is an undefined name — np.nan.
dfResult['ans'] = dfResult.apply(
    lambda row: tuple(dfResult.columns[row.notnull()].values)
    if row.count() >= min_num_questions else np.nan,
    axis=1,
)
### Illustrative result (one row per respondent), e.g.:
###      Q1   Q2   Q3            ans
### 0   3.0  4.0  2.0   (Q1, Q2, Q3)
### 3   4.0  NaN  2.0       (Q1, Q3)
### 8   0.0  NaN  NaN            NaN
# Group respondents by the exact combination of questions they answered;
# .indices maps each 'ans' tuple to the positional indices of its rows.
grouped = dfResult.groupby('ans')
dictResult = grouped.indices
### e.g.
### {
### ('Q1', 'Q2'):       array([4], dtype=int64),
### ('Q1', 'Q2', 'Q3'): array([0, 5, 6, 7, 9], dtype=int64),
### ('Q1', 'Q3'):       array([3], dtype=int64),
### ('Q2', 'Q3'):       array([1, 2], dtype=int64)
### }
In order to include supersets, you could step through the resulting dict and append the list with values, where applicable.
# A respondent who answered a superset of questions also answered every
# question of the subset, so append each superset group to the subset group.
# Fixed: the original did `output = dictResult`, a mere alias — appending to
# output[k] mutated dictResult while the loop still reads dictResult[subk],
# so for subset chains (e.g. ('A',) < ('A','B') < ('A','B','C')) later keys
# read already-extended entries and duplicated respondents.  Copy the dict
# first and only ever read from the untouched dictResult.
output = dict(dictResult)
for k in dictResult:
    for superk, superv in dictResult.items():
        if superk != k and set(superk).issuperset(k):
            output[k] = np.append(output[k], superv)
### For the example above:
### {
### ('Q1', 'Q2'):       array([4, 0, 5, 6, 7, 9], dtype=int64),
### ('Q1', 'Q2', 'Q3'): array([0, 5, 6, 7, 9], dtype=int64),
### ('Q1', 'Q3'):       array([3, 0, 5, 6, 7, 9], dtype=int64),
### ('Q2', 'Q3'):       array([1, 2, 0, 5, 6, 7, 9], dtype=int64)
### }