I have a big dataframe df1 that I need to clean of bad data using four different conditions:
- A is a number
- A is not nan
- B belongs to list1
- C belongs to list2
I would like to check these conditions without a loop because it runs very slowly. This is the code I wrote:
df1=pd.read_csv(data1)
list1=pd.read_csv(data2)
list2=pd.read_csv(data3)
col_df= #list of target column names (it's not important to specify what they are)
df2=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df)
row_df3=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df) #I use it to append new zero row to previous
k1=0 #index to add new row to df2
for j in range(len(df1)):
try:
float(df1['A'][j])
check=1
except:
check=0
if check==1: #if the element is a number
if df1['B'][j] in list1['B'].unique(): #if the element is in the first list
if str(df1['A'][j])!='nan': #if the element value is different from nan
if da['C'][j] in list2['C'].unique(): #if the element is in the second list
iii=list1.index[list1['B']==df1['B'][j]] #I take the index in list1
iii=iii[0]
df2.loc[k1]=row_df2.loc[0] #new zero row
df2.loc[k1]=df1.iloc[j] #assign elements to new df2 with same column name of df1
df2['B'][k1]=list1['B'][iii] #assign element from list1
k1=k1+1 #new row for df2
It works, but slowly. Is there a way to write all these conditions without using a loop? If so, I also thought of using the drop function instead of creating a new dataframe df2.
Thank you in advance!
I tried to do this:
def is_float(element: any) -> bool:
#If you expect None to be passed:
if element is None:
return False
try:
float(element)
return True
except ValueError:
return False
df1['Bool']=df1['A'].apply(is_float) #to create a new column that check if it's a number
df2=df1[df1['Bool']==True and df1['B'] in list1['B'].unique() and da['C'] in list2['C'].unique()] #to check all other conditions
without defining df2 beforehand this time, but it doesn't work — I get two errors:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()
and if I erase the first condition, so
df2=df1[df1['B'] in list1['B'].unique() and da['C'] in list2['C'].unique()]
I get:
ValueError: ('Lengths must match to compare')
Using apply and functions like the ones below, you should be able to build a single boolean column grouping all the conditions together:
def is_null(value):
# Null values are considered as float but nan is not equal to nan
if pd.isna(value):
return True
else:
return False
def is_float(value):
if type(value) == float:
return True
else:
return False
def in_list1(value):
if value in list1:
return True
else:
return False
def in_list2(value):
if value in list2:
return True
else:
return False