How can I refactor the following code to make it easier to read, ideally by moving the logic into a function? The code and data frames can be reproduced using the CSV files posted on my GitHub.
import numpy as np
import pandas as pd
# Load the two CSV files from the working directory.
train_df = pd.read_csv("train_df.csv")
# The path previously had a doubled extension ("test_df.csv.csv").
test_df = pd.read_csv("test_df.csv")
# Keep both frames in one list so the same clean-up can be applied to each.
combine = [train_df, test_df]
def impute_missing_ages(df):
    """Fill missing ``Age`` values in *df* in place and cast the column to int.

    For each combination of ``Sex`` in {0, 1} and ``Pclass`` in {1, 2, 3},
    the median of the known ages in that group is rounded to the nearest 0.5
    and written into the rows of the group whose ``Age`` is null. Rows whose
    ``Sex``/``Pclass`` fall outside those values are not touched.

    Args:
        df: DataFrame with ``Sex``, ``Pclass`` and ``Age`` columns; modified
            in place.

    Returns:
        A 2x3 array of the guessed ages, indexed ``[sex, pclass - 1]``.
        Groups without any known age keep the initial value 0.

    Note:
        The final ``astype(int)`` raises if any ``Age`` is still missing
        after the fill.
    """
    guesses = np.zeros((2, 3))
    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (df['Sex'] == sex) & (df['Pclass'] == pclass)
            # median() skips NaN, so no explicit dropna() is needed.
            median_age = df.loc[in_group, 'Age'].median()
            if pd.isna(median_age):
                # No known ages in this group: int(NaN) would raise, and
                # there is nothing meaningful to fill with, so skip it.
                continue
            # Round the median to the nearest 0.5.
            guess = int(median_age / 0.5 + 0.5) * 0.5
            guesses[sex, pclass - 1] = guess
            # Groups are disjoint, so filling here cannot change the median
            # computed for any other group.
            df.loc[in_group & df['Age'].isnull(), 'Age'] = guess
    df['Age'] = df['Age'].astype(int)
    return guesses


guess_ages = np.zeros((2, 3))
for df in combine:
    guess_ages = impute_missing_ages(df)
If I understand correctly, you want to replace `Age`, wherever it is null, with a value computed by a formula for each (`Sex`, `Pclass`) group:
import numpy as np
import pandas as pd
# Read both CSV files, using the first column of each as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_df.csv', 'test_df.csv')
)
def guess_age(x):
    """Return the median of *x* rounded to the nearest 0.5.

    Args:
        x: A pandas Series of ages; missing values are ignored by
            ``median()``.

    Returns:
        The rounded median as a float, or NaN when *x* has no known values
        (empty or all missing), so that a following ``fillna`` leaves those
        rows unchanged instead of failing on ``int(NaN)``.
    """
    median = x.median()
    if pd.isna(median):
        return float("nan")
    # Dividing by 0.5, adding 0.5 and truncating rounds to the nearest
    # half-step for non-negative values.
    return int(median / 0.5 + 0.5) * 0.5
# Fill each missing Age with the value guess_age computes for the row's
# (Sex, Pclass) group. transform() returns a Series aligned to the original
# index, so fillna() can pick the per-row replacement directly.
train_df['Age'] = train_df['Age'].fillna(
    train_df.groupby(['Sex', 'Pclass'])['Age'].transform(guess_age)
)
test_df['Age'] = test_df['Age'].fillna(
    test_df.groupby(['Sex', 'Pclass'])['Age'].transform(guess_age)
)
>>> train_df['Age'].isna().sum()
>>> test_df['Age'].isna().sum()
>>> train_df['Age'].isna().sum()
>>> test_df['Age'].isna().sum()