This is the code needed to reproduce the decision tree classifier plot, which shows far too many values in each node for the graph to be interpretable. I would like to replace the overly long array of values with a simpler one, if possible. Most of this code is needed to preprocess the dataset before attempting to plot the tree.
from ast import literal_eval

import numpy as np
import pandas as pd
# NOTE: the CSV path is blank in this listing; point it at the dataset file.
df = pd.read_csv("")

# Identifier and address-detail columns that are left out of the analysis
unused_columns = [
    'CSR Number',
    'Address House Number',
    'Address Street Direction',
    'Address Street Name',
    'Address Street Suffix',
    'Parcel Identification Number (PIN)',
    'Address House Fraction Number',
    'Address Street Suffix Direction',
    'Case Number Related to CSR',
]
df.drop(columns=unused_columns, inplace=True)

# Discard every row that contains at least one NA value
df.dropna(axis=0, how='any', inplace=True)
# Observed date columns
date_columns = ['Date Received', 'Date Closed', 'Due Date']


def str_2_date(date):
    """Rebuild a date string so that its year field starts with the digit 2.

    The result is the first six characters of ``date``, then ``2``, then the
    third ``/``-separated field with its first character removed.
    Assumes values are laid out as ``MM/DD/YYYY...`` so that the first six
    characters are the ``MM/DD/`` prefix -- TODO confirm against the CSV.

    Raises:
        IndexError: if ``date`` contains fewer than two ``/`` separators.
    """
    return f"{date[:6]}2{date.split('/')[2][1:]}"


# Repair the date strings, then convert each date column to datetime and
# report the dtype before and after the conversion.
for column in date_columns:
    df[column] = df[column].apply(str_2_date)
    original_dtype = str(df[column].dtypes)
    df[column] = pd.to_datetime(df[column])
    new_dtype = str(df[column].dtypes)
    print("{:<20} {:<20} {:<20}".format(column, original_dtype, new_dtype))

# Derive calendar features from every date column
for column in date_columns:
    df[f"{column} Day of Week"] = df[column].dt.dayofweek  # Monday=0, Sunday=6.
    df[f"{column} Month"] = df[column].dt.month
    df[f"{column} Year"] = df[column].dt.year

# Remove original date columns
df.drop(date_columns, axis=1, inplace=True)
# Split the combined coordinate column into separate latitude/longitude
# columns. Each string is parsed once with literal_eval and then indexed.
# Assumes every value is a Python-literal pair such as "(lat, lon)" -- TODO confirm.
coordinates = [literal_eval(pair) for pair in df['Latitude/Longitude']]
df['Lat.'] = [pair[0] for pair in coordinates]
df['Lon.'] = [pair[1] for pair in coordinates]
df.drop('Latitude/Longitude', axis=1, inplace=True)
# Ordinal-encode every remaining column whose dtype is 'object': values are
# ranked by frequency, the least frequent value receiving rank 1.
object_columns = df.dtypes[(df.dtypes == "object")].index.tolist()
for column in object_columns:
    # value_counts(ascending=True) lists the least frequent value first
    values_list = df[column].value_counts(ascending=True).index.tolist()
    ordinal_map = {
        value: rank for rank, value in enumerate(values_list, start=1)
    }
    df[column] = df[column].map(ordinal_map)
def sincos(x, period):
    """Encode periodic values as (sine, cosine) pairs.

    Args:
        x: Scalar or one-dimensional sequence of values (list, NumPy array,
            pandas Series, ...) measured on a cycle of length ``period``.
        period: Length of one full cycle, e.g. 7 for days of the week.

    Returns:
        A 2-D NumPy array with one row per input value; column 0 holds
        ``sin(2*pi*x/period)`` and column 1 holds ``cos(2*pi*x/period)``.
    """
    # Coerce first so that plain Python lists work too (float * list fails).
    values = np.asarray(x, dtype=float)
    radians = (2 * np.pi * values) / period
    return np.column_stack((np.sin(radians), np.cos(radians)))
# Encode the day of week columns as sin/cos pairs on a 7-day cycle
day_of_week_columns = df.filter(like='Day of Week', axis=1).columns.tolist()
for column in day_of_week_columns:
    day_sc = sincos(df[column], 7)
    df[f"{column} Sin"] = day_sc[:, 0]
    df[f"{column} Cosine"] = day_sc[:, 1]

# Encode the month columns as sin/cos pairs on a 12-month cycle
month_columns = df.filter(like='Month', axis=1).columns.tolist()
for column in month_columns:
    month_sc = sincos(df[column], 12)
    # The month features must come from month_sc; reading day_sc here would
    # copy the last day-of-week encoding into every month column.
    df[f"{column} Sin"] = month_sc[:, 0]
    df[f"{column} Cosine"] = month_sc[:, 1]

# The raw day-of-week and month columns are replaced by their sin/cos pairs
date_info_columns = day_of_week_columns + month_columns
df.drop(date_info_columns, axis=1, inplace=True)

# Total number of NA cells and the dimensions of the preprocessed frame
num_na = df.isna().sum().sum()
num_rows, num_cols = df.shape
# Below is the decision tree plot that gives the unwanted array of values;
# is there a way to avoid this?
import graphviz  # needed for the graph
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Features to predict from
predictors = ['LADBS Inspection District', 'Address Street Zip', 'Date Received Year',
              'Date Closed Year', 'Due Date Year', 'Case Flag', 'CSR Priority',
              'Lat.', 'Lon.']

# The decision tree is fitted on NumPy arrays
X = df[predictors].values  # numpy array for predictor variables
y = df['Response Days'].values  # numpy array for target variable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# Fit a decision tree classifier (depth limited to 3) on the training data.
# NOTE: as a classifier, each node's "value" lists one sample count per
# distinct 'Response Days' class, which is what makes the plotted arrays long.
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

# Export the fitted tree as DOT source. out_file=None makes export_graphviz
# return the DOT text as a string; feature_names labels the split conditions
# with the predictor names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=predictors,
    precision=3,
    filled=False,
    rounded=True,
)

# Plot it
graph = graphviz.Source(dot_data)
Your target variable, `Response Days`, has lots of unique values, so using a classifier means each leaf keeps track of how many samples of each value it contains, hence the long lists. You would probably rather use a regression model; if you do, the reported value of each leaf is just the (single!) average target value among its samples.