Search code examples
python-3.x encoding scikit-learn decision-tree one-hot-encoding

Performing one hot encoding on two columns of string data


I am trying to predict 'Full_Time_Home_Goals'

My code is:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import os
import xlrd
import datetime
import numpy as np

# Script: load the historical match CSV, coerce the column dtypes, and fit a
# DecisionTreeRegressor that predicts Full_Time_Home_Goals.

# Widen pandas' display limits so the exploratory prints below are not truncated.
# If there are more rows, adjust the numbers accordingly.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# The columns named in date_col are handed to read_csv via parse_dates so they
# are parsed as dates while the file is loaded.
date_col = ['Date']
df = pd.read_csv(
    r'C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt1\Historical Data\Concat_Cleaned.csv'
    , parse_dates=date_col, skiprows=0, low_memory=False)

# Converting/defining the columns
# Before the column types are defined, every NaN is replaced with the sentinel
# value 101; the sentinel is converted back to NaN after the cast below.
# NOTE(review): a genuine value of 101 in any column would also be turned into
# NaN by the revert step - confirm the data never contains one.
df = df.fillna(101)
# Target dtype for each column
convert_dict = {'League_Division': str,
                'HomeTeam': str,
                'AwayTeam': str,
                'Full_Time_Home_Goals': int,
                'Full_Time_Away_Goals': int,
                'Full_Time_Result': str,
                'Half_Time_Home_Goals': int,
                'Half_Time_Away_Goals': int,
                'Half_Time_Result': str,
                'Attendance': int,
                'Referee': str,
                'Home_Team_Shots': int,
                'Away_Team_Shots': int,
                'Home_Team_Shots_on_Target': int,
                'Away_Team_Shots_on_Target': int,
                'Home_Team_Hit_Woodwork': int,
                'Away_Team_Hit_Woodwork': int,
                'Home_Team_Corners': int,
                'Away_Team_Corners': int,
                'Home_Team_Fouls': int,
                'Away_Team_Fouls': int,
                'Home_Offsides': int,
                'Away_Offsides': int,
                'Home_Team_Yellow_Cards': int,
                'Away_Team_Yellow_Cards': int,
                'Home_Team_Red_Cards': int,
                'Away_Team_Red_Cards': int,
                'Home_Team_Bookings_Points': float,
                'Away_Team_Bookings_Points': float,
                }

df = df.astype(convert_dict)

# Revert the sentinel back to NaN now that the column dtypes are set: the
# string form '101' for the str columns, the number 101 for the numeric ones.
# NOTE(review): with regex=True the pattern '101' presumably also matches
# strings that merely contain "101" - verify this is intended.
df = df.replace('101', np.NAN, regex=True)
df = df.replace(101, np.NAN, regex=True)

# Exploration
print(df.dtypes)
print(df)

# Clean dataset by dropping null rows
# NOTE(review): `data` is not used again in this script; X and y below are
# built from `df`, which still contains the rows with NaN.
data = df.dropna(axis=0)

# Column that you want to predict = y
y = df.Full_Time_Home_Goals

# Columns that are fed into the model to make predictions (the features); cannot include column y.
# HomeTeam, AwayTeam and Full_Time_Result were cast to str above, so X holds string columns.
features = ['HomeTeam', 'AwayTeam', 'Full_Time_Away_Goals', 'Full_Time_Result']
# Create X
X = df[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
soccer_model = DecisionTreeRegressor(random_state=1)

# Fit Model
# train_X is passed as-is, i.e. with its string columns not yet encoded as numbers.
soccer_model.fit(train_X, train_y)

I am getting an error when fitting the model:

# Fit Model
# train_X still contains the raw string columns (HomeTeam, AwayTeam, Full_Time_Result) at this point.
soccer_model.fit(train_X, train_y)

Throws me an error:

ValueError: could not convert string to float: "Nott'm Forest"

How can I solve this and run the model to get the output? I tried to follow a few examples but I am unable to progress.

You can find the example Concat_Cleaned file here.


Solution

  • You have to transform your categorical data into numerical data. For that, you could use the OneHotEncoder:

    import datetime
    import os

    import numpy as np
    import pandas as pd
    import xlrd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.tree import DecisionTreeRegressor
    
    # Script: load the example match CSV, coerce the column dtypes, one-hot
    # encode the feature columns and fit a DecisionTreeRegressor that predicts
    # Full_Time_Home_Goals.

    # Widen pandas' display limits so printed frames are not truncated.
    # If there are more rows, adjust the numbers accordingly.
    pd.set_option('display.max_rows', 5000)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    # The columns named in date_col are handed to read_csv via parse_dates so
    # they are parsed as dates while the file is loaded.
    date_col = ['Date']
    df = pd.read_csv(
        r'Concat_Cleaned_Example.csv',
        parse_dates=date_col, skiprows=0, low_memory=False)

    # Converting/defining the columns
    # Before the column types are defined, every NaN is replaced with the
    # sentinel value 101; the sentinel is converted back to NaN after the cast.
    # NOTE(review): a genuine value of 101 would also become NaN below.
    df = df.fillna(101)
    # Target dtype for each column
    convert_dict = {'League_Division': str,
                    'HomeTeam': str,
                    'AwayTeam': str,
                    'Full_Time_Home_Goals': int,
                    'Full_Time_Away_Goals': int,
                    'Full_Time_Result': str,
                    'Half_Time_Home_Goals': int,
                    'Half_Time_Away_Goals': int,
                    'Half_Time_Result': str,
                    'Attendance': int,
                    'Referee': str,
                    'Home_Team_Shots': int,
                    'Away_Team_Shots': int,
                    'Home_Team_Shots_on_Target': int,
                    'Away_Team_Shots_on_Target': int,
                    'Home_Team_Hit_Woodwork': int,
                    'Away_Team_Hit_Woodwork': int,
                    'Home_Team_Corners': int,
                    'Away_Team_Corners': int,
                    'Home_Team_Fouls': int,
                    'Away_Team_Fouls': int,
                    'Home_Offsides': int,
                    'Away_Offsides': int,
                    'Home_Team_Yellow_Cards': int,
                    'Away_Team_Yellow_Cards': int,
                    'Home_Team_Red_Cards': int,
                    'Away_Team_Red_Cards': int,
                    'Home_Team_Bookings_Points': float,
                    'Away_Team_Bookings_Points': float,
                    }

    df = df.astype(convert_dict)

    # Revert the sentinel back to NaN now that the column dtypes are set.
    # np.nan is used because the np.NAN alias was removed in NumPy 2.0.
    # NOTE(review): with regex=True the pattern '101' presumably also matches
    # strings that merely contain "101" - verify this is intended.
    df = df.replace('101', np.nan, regex=True)
    df = df.replace(101, np.nan, regex=True)

    # Column that you want to predict
    target = 'Full_Time_Home_Goals'

    # Columns that are fed into the model to make predictions; cannot include the target
    features = ['HomeTeam', 'AwayTeam', 'Full_Time_Away_Goals', 'Full_Time_Result']

    # Clean dataset by dropping null rows. Only the modelled columns are
    # checked, so a row is not discarded merely because an unrelated column
    # is empty, and the cleaned frame is what X and y are built from.
    data = df.dropna(subset=features + [target])

    # Create y and X from the cleaned data
    y = data[target]
    X = data[features]

    # Split into validation and training data
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

    # Specify Model
    soccer_model = DecisionTreeRegressor(random_state=1)

    # Define and train a OneHotEncoder that turns the feature columns into a
    # numeric indicator matrix. Every column in `features` is treated as
    # categorical, including Full_Time_Away_Goals. handle_unknown='ignore'
    # keeps transform() from failing on categories not seen during fit.
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(train_X)

    transformed_train_X = enc.transform(train_X)

    # Fit Model on the encoded training data
    soccer_model.fit(transformed_train_X, train_y)
    

    That way your data, for instance (Man United, Newcastle, 0, H), would be encoded as:

    (0, 14) 1.0
    (0, 35) 1.0
    (0, 43) 1.0
    (0, 50) 1.0
    

    You can have a look at it for any data point to verify that it is correctly encoded, by using:

    # Row of the encoded training matrix to inspect
    entry_id = 1

    # Print the sparse row
    print(transformed_train_X[entry_id])

    # Look the feature names up once instead of on every matching column.
    # get_feature_names() was removed in scikit-learn 1.2, so fall back to
    # get_feature_names_out() when it is not available.
    # NOTE(review): get_feature_names_out() is expected to prefix names with
    # the input column name (e.g. HomeTeam_...) rather than x0_... - confirm.
    if hasattr(enc, 'get_feature_names'):
        feature_names = enc.get_feature_names()
    else:
        feature_names = enc.get_feature_names_out()

    # Print the name of every encoded column that is set for this row
    for i in range(transformed_train_X.shape[1]):
        if transformed_train_X[entry_id, i] == 1.0:
            print(feature_names[i])
    

    Output:

      (0, 14)   1.0
      (0, 35)   1.0
      (0, 43)   1.0
      (0, 50)   1.0
    x0_Man United
    x1_Newcastle
    x2_0
    x3_H