Search code examples
pandasnumpydeep-learningpredictionsequential

Numpy.where not returning the expected output in the predicted dataset


I have trained this model on a dataset. Although the accuracy is low but that's not my concern yet. My question is that when I add a new column(in the end) by the name df['predict], why doesn't it return the predicted output in the dataset, yet when I run df['predict], I get the output.

import numpy as np
import pandas as pd 

df1 = pd.DataFrame(np.random.randint(1,33,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df2 = pd.DataFrame(np.random.randint(34,41,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df3 = pd.DataFrame(np.random.randint(42,53,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df4 = pd.DataFrame(np.random.randint(54,66,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df5 = pd.DataFrame(np.random.randint(67,88,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df6 = pd.DataFrame(np.random.randint(89,100,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])
df7 = pd.DataFrame(np.random.randint(90,100,  size =(10000,5)), columns = ['s1','s2','s3','s4','s5'])

df = pd.concat([df1,df2,df3,df4,df5,df6,df7])

df['marks obtained'] = df.sum(axis = 1)

df['Total'] = 500

df['percentage'] = (df['marks obtained']/df['Total'])*100

def grade(x):
if x >= 80:
    return 'A+'
if x >= 70:
    return 'A'
if x >= 60:
    return 'B'
if x >= 50:
    return 'C'
if x >= 40:
    return 'D'
if x >= 33:
    return 'E'
else:
    return 'fail'

df['grade'] = df['percentage'].apply(grade)

dic = {'A+': 1, 'A': 2, 'B': 3, 'C': 4, 'D':5, 'E': 6, 'fail': 7}

df['grade1'] = df['grade'].map(dic)

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD

x = df.loc[:,'s1':'s5'].to_numpy()
y = df['grade']

y = pd.get_dummies(y).to_numpy()

model = Sequential()
model.add(Dense(5, activation = 'relu', input_shape =(5,)))
model.add(Dense(7, activation = 'softmax'))

model.compile(optimizer = SGD(lr=0.8), loss=  'categorical_crossentropy', metrics = ['acc'])

model.fit(x,y,epochs = 30)

def true(q):
for i in q:
    if i == 1:
        print('A+')
    if i == 2:
        print('A')
    if i == 3:
        print('B')
    if i == 4:
        print('C')
    if i == 5:
        print('D')
    if i == 6:
        print('E')            
        
w = np.argmax(model.predict(x), axis = 1)

df['predict'] = np.where(np.argmax(model.predict(x), axis = 1), true(w), 'fail')

OUTPUT:

df['predict'] = np.where(np.argmax(model.predict(x), axis = 1), true(w), 'fail') returns the following:

A+
A+
A+
A+
A+
.
.
.
.

But when I print out the data set it returns:

s1      s2  s3  s4  s5  marks obtained  Total   percentage  grade   grade1  predict
0       2   23  9   23  2       59      500       11.8      fail      7      None
1       1   4   6   12  5       28      500        5.6      fail      7      None
2       17  20  26  24  13     100      500       20.0      fail      7      None
3       18  16  4   19  13      70      500       14.0      fail      7      None
4       22  30  21  19  9      101      500       20.2      fail      7      None
... ... ... ... ... ...    ...      ...       ...       ...     ...      ...
9995    90  94  97  91  91      463     500       92.6       A+       1      None
9996    90  94  96  90  96      466     500       93.2       A+       1      None
9997    93  92  99  93  92      469     500       93.8       A+       1      None
9998    98  98  99  93  92      480     500       96.0       A+       1      None
9999    93  95  97  93  97      475     500       95.0       A+       1      None
70000 rows × 11 columns

Solution

  • The definition of true should return a variable instead of printing it.

    def true(q):
        for i in q:
            if i == 1:
                return('A+')
            if i == 2:
                return('A')
            if i == 3:
                return('B')
            if i == 4:
                return('C')
            if i == 5:
                return('D')
            if i == 6:
                return('E')  
    
    df['predict'] = np.where(np.argmax(model.predict(x), axis = 1), true(w), 'fail')
    

    Also on a side note, you can use the softmax layer to convert the output into probability values.