I have the below DataFrame. I am looking for a way to automatically mask and choose values from specific row and columns without considering the nan values.
data=pd.DataFrame([[ np.nan, 0. , np.nan, 3. , 77. ],
[ 5.6, 40. , 12. , 9. , np.nan],
[ 5.9, np.nan, 5. , 5. , 59. ],
[ 4.8, 30. , np.nan, 11. , 30. ],
[ 2.2, 6. , 15. , np.nan, 5. ]])
For example select rows 0 and 3 and columns 1,3, and 4 as below:
data_selected=pd.DataFrame([[ 0, 3, 77 ],[ 30, 11, 30 ]], index=[0,3],columns=[1,3,4])
Here is my solution. If you have better solution please give me your answer
import pandas as pd
from scipy.stats import pearsonr
from sklearn import metrics
import numpy as np
data=pd.DataFrame([[ np.nan, 0. , np.nan, 3. , 77. ],
[ 5.6, 40. , 12. , 9. , np.nan],
[ 5.9, np.nan, 5. , 5. , 59. ],
[ 4.8, 30. , np.nan, 11. , 30. ],
[ 2.2, 6. , 15. , np.nan, 5. ]])
# interate over the data frame to find best combination of row and column for extraction
size_list=[]
index_list=[]
dfs=[]
for i in data.index:
print(i)
# interate over the dataframe and remove columns based on nan values in each row
boolean_ind=data.loc[i, :].isnull()
ex7=data[data.columns[boolean_ind==False]]
# after removing the columns with nan value in row, use dropna command to romove other nan values from the selected dataframe (ex7)
ex7_drop=ex7.dropna()
# append each dataframe into the list to choose it after finding the best dataframe based on the size
dfs.append(ex7_drop)
# compute shape of the selected dataframe to choose the beset dataframe based on the size
size=ex7_drop.shape[0]*ex7_drop.shape[1]
size_list.append(size)
# put row index of each dataframe into the index_list to easily select the selected rows from the base dataframe
index_list.append(ex7_drop.index)
# select best index based on the max size
max_size_index=index_list[size_list.index(max(size_list))]
selected_df=dfs[size_list.index(max(size_list))]