python, pandas, algorithm, scikit-learn

How can I make my sklearn prediction model better?


So basically, I have this model in sklearn that predicts the survival of Titanic passengers. Its accuracy is around 0.77.

How can I make it better and more accurate?

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
le = LabelEncoder()
sc = StandardScaler()

# Fill missing values and label-encode the categorical columns
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
train_df["Embarked"].fillna("N", inplace=True)
train_df['Cabin'] = train_df['Cabin'].str[:1]
train_df['Cabin'].fillna('N', inplace=True)
for col in ["Sex", "Embarked", "Cabin"]:
    train_df[col] = LabelEncoder().fit_transform(train_df[col])

# Drop identifier-like columns, split the data, and train a decision tree
x = train_df.drop(["PassengerId", "Name", "Ticket", "Survived"], axis=1)
y = train_df["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
dt_clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1, min_samples_split=2)
dt_clf.fit(x_train, y_train)
pred = dt_clf.predict(x_test)
print(metrics.accuracy_score(y_test, pred))

I filled the NA values with the mean and tried changing the scaler and the algorithm, but the accuracy did not improve.


Solution

  • Handling Missing Values: You've filled the missing values in 'Age' with the mean. To take it a step further, consider other methods for filling in missing values, such as the median, the mode, or even a predictive model (see the imputation sketch below).

  • Feature Engineering: You can create some new features (see the feature-engineering sketch below). For example:

    • Family Size: Combine the 'sibsp' (number of siblings/spouses aboard) and 'parch' (number of parents/children aboard) columns to create a new feature that represents the size of the family.
    • Title: Extract titles from the passenger names (like Mr, Mrs, Miss) and use them as a feature.
    • IsAlone: Create a binary feature that indicates whether the passenger was traveling alone.

  • Scaling and Encoding: You're currently using LabelEncoder for the categorical variables. Consider OneHotEncoder or pd.get_dummies for better handling of categorical variables (see the encoding sketch below), and make sure to apply appropriate scaling to the numeric columns.

  • Feature Selection: Select the most important features. You can use methods like recursive feature elimination (RFE), feature importances from models, or correlation matrices to drop the less informative features (see the feature-selection sketch below).

  • Model Selection: You're using a single DecisionTreeClassifier. Try different algorithms (random forests, gradient boosting, SVMs, logistic regression) and compare them with cross-validation to find the model that fits the data best.

  • Cross-Validation and Hyperparameter Tuning: To find the best hyperparameters for each model, use GridSearchCV or RandomizedSearchCV (see the tuning sketch below).
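
    Here are a few small sketches for the individual points before the full reconstruction. First, imputation, assuming the same train.csv as in your code: fill 'Age' with the median, or let a KNNImputer estimate it from the other numeric columns.

    import pandas as pd
    from sklearn.impute import KNNImputer

    train_df = pd.read_csv("train.csv")

    # Option 1: the median is less sensitive to outliers than the mean
    train_df["AgeMedian"] = train_df["Age"].fillna(train_df["Age"].median())

    # Option 2: model-based imputation - estimate 'Age' from passengers
    # with similar Pclass/SibSp/Parch/Fare values
    num_cols = ["Pclass", "SibSp", "Parch", "Fare", "Age"]
    train_df[num_cols] = KNNImputer(n_neighbors=5).fit_transform(train_df[num_cols])

    print(train_df["Age"].isna().sum())  # 0 after imputation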
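
    For the feature engineering, a rough sketch on the Kaggle columns ('Name', 'SibSp', 'Parch'); the regex and the list of common titles are just one possible choice:

    import pandas as pd

    train_df = pd.read_csv("train.csv")

    # Family size and a flag for passengers travelling alone
    train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"]
    train_df["IsAlone"] = (train_df["FamilySize"] == 0).astype(int)

    # Title from the Name column, e.g. "Braund, Mr. Owen Harris" -> "Mr"
    train_df["Title"] = train_df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

    # Group the rare titles into a single bucket
    common = ["Mr", "Mrs", "Miss", "Master"]
    train_df["Title"] = train_df["Title"].where(train_df["Title"].isin(common), "Rare")

    print(train_df[["FamilySize", "IsAlone", "Title"]].head())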
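
    For the encoding, a small sketch that swaps LabelEncoder for pd.get_dummies, so 'Embarked' and the cabin letter do not get an artificial numeric ordering:

    import pandas as pd

    train_df = pd.read_csv("train.csv")
    train_df["Cabin"] = train_df["Cabin"].str[:1].fillna("N")
    train_df["Embarked"] = train_df["Embarked"].fillna("N")

    # One-hot encoding: each category becomes its own 0/1 column
    train_df = pd.get_dummies(train_df, columns=["Sex", "Embarked", "Cabin"], drop_first=True)
    print([c for c in train_df.columns if c.startswith(("Sex_", "Embarked_", "Cabin_"))])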

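    For feature selection, a sketch that fits a random forest on roughly the columns you already use and prints the importances (RFE from sklearn.feature_selection would be an alternative); the preprocessing here is deliberately minimal:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder

    train_df = pd.read_csv("train.csv")
    train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())
    train_df["Embarked"] = train_df["Embarked"].fillna("N")
    for col in ["Sex", "Embarked"]:
        train_df[col] = LabelEncoder().fit_transform(train_df[col])

    X = train_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
    y = train_df["Survived"]

    # Rank features by importance; near-zero features are candidates to drop
    rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X, y)
    print(pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False))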
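
    And for the tuning, a minimal sketch with cross_val_score and RandomizedSearchCV on your DecisionTreeClassifier; the parameter ranges are only examples:

    import pandas as pd
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV
    from sklearn.preprocessing import LabelEncoder
    from sklearn.tree import DecisionTreeClassifier

    train_df = pd.read_csv("train.csv")
    train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())
    train_df["Embarked"] = train_df["Embarked"].fillna("N")
    for col in ["Sex", "Embarked"]:
        train_df[col] = LabelEncoder().fit_transform(train_df[col])

    X = train_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
    y = train_df["Survived"]

    # 5-fold cross-validation gives a more stable estimate than a single split
    tree = DecisionTreeClassifier(random_state=42)
    print(cross_val_score(tree, X, y, cv=5, scoring="accuracy").mean())

    # Randomized search samples a fixed number of combinations from the grid
    param_dist = {
        "max_depth": [3, 5, 7, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }
    search = RandomizedSearchCV(tree, param_dist, n_iter=20, cv=5,
                                scoring="accuracy", random_state=42)
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
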
    Here is my reconstructed code:

    import seaborn as sns
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score
    
    # Load Titanic dataset from Seaborn
    titanic = sns.load_dataset('titanic')
    print("Dataset loaded.")
    
    # Feature engineering
    titanic['FamilySize'] = titanic['sibsp'] + titanic['parch']
    titanic['IsAlone'] = (titanic['FamilySize'] == 0).astype(int)
    titanic['deck'] = titanic['deck'].cat.add_categories('N').fillna('N')
    print("Feature engineering completed.")
    
    # Dropping features
    dropped_features = ['alive', 'adult_male', 'embark_town', 'alone']
    titanic.drop(dropped_features, axis=1, inplace=True)
    print(f"Dropped less useful features: {dropped_features}")
    
    # Preprocessing
    # IsAlone is listed here so the engineered flag is not silently dropped by the ColumnTransformer
    numeric_features = ['age', 'fare', 'FamilySize', 'IsAlone']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])
    
    categorical_features = ['sex', 'deck', 'embarked', 'who', 'class']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    print("Preprocessing setup completed.")
    
    # Splitting data
    X = titanic.drop('survived', axis=1)
    y = titanic['survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data split into training and testing sets.")
    
    # Define models and their hyperparameters
    models = {
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'classifier__n_estimators': [100, 200],
                'classifier__max_depth': [5, 10],
                'classifier__min_samples_split': [2, 5],
                'classifier__min_samples_leaf': [1, 2]
            }
        },
        'LogisticRegression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {
                'classifier__C': [0.01, 0.1, 1, 10],
                'classifier__solver': ['lbfgs', 'liblinear']
            }
        },
        'SVM': {
            'model': SVC(),
            'params': {
                'classifier__C': [0.1, 1, 10],
                'classifier__gamma': ['scale', 'auto'],
                'classifier__kernel': ['linear', 'rbf']
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'classifier__n_estimators': [100, 200],
                'classifier__learning_rate': [0.01, 0.1],
                'classifier__max_depth': [3, 5]
            }
        },
        'XGBoost': {
            'model': XGBClassifier(random_state=42),
            'params': {
                'classifier__n_estimators': [100, 200],
                'classifier__learning_rate': [0.01, 0.1],
                'classifier__max_depth': [3, 5]
            }
        }
    }
    
    # Function to perform grid search and return the best model
    def perform_grid_search(X_train, y_train, model, params):
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', model)])
        grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        return grid_search
    
    best_models = {}
    for model_name, model_info in models.items():
        print(f"Training {model_name}...")
        best_models[model_name] = perform_grid_search(X_train, y_train, model_info['model'], model_info['params'])
        print(f"{model_name} training completed.")
    
    # Evaluate models
    for model_name, model in best_models.items():
        best_model = model.best_estimator_
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{model_name} Best Hyperparameters: {model.best_params_}')
        print(f'{model_name} Accuracy: {accuracy}')
    
    # Select the best model based on accuracy
    best_model_name = max(best_models, key=lambda name: accuracy_score(y_test, best_models[name].best_estimator_.predict(X_test)))
    print(f'Best model: {best_model_name} with accuracy {accuracy_score(y_test, best_models[best_model_name].best_estimator_.predict(X_test))}')
    

    My accuracy is about 0.821.