Search code examples
pythontensorflowkerasprediction

Why Keras model always predict the same class?


My project is used for classifying the phishing link or not, so I extract the features of those links. I trained a CNN Keras model for a binary classification, and get 0.92 accuracy. But when I use that model to predict the class, the model always get the same class 0. I don't know what's wrong with my model. Please help me.

#importing basic packages
import pandas as pd
import numpy as np
#import xgboost
import sys

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from SingleURLFeatureExtraction import addfeature
from createcnn_model import create_model

import argparse
import math
import time
import copy

import keras
from keras.models import Sequential, Model
from keras import initializers
from keras import regularizers
from keras.layers.core import Dense
from keras.layers import Dropout, Embedding, Flatten, Activation, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils.np_utils import to_categorical
#XGBoost Classification model
#from xgboost import XGBClassifier


#Loading the data
#def detect_url():  #check url 

#url= 'http://zozo.jp/shop/bestpackingstore/?price=proper&p_ssy=2015&p_ssm=5&p_ssd=13&p_sey=2015&p_sem=5&p_sed=13&dstk=2'
#url1="http://www.google.com"

def build_model():
    data0 = pd.read_csv('urldata.csv')
    batch_size_v = 64;
    epochs_v = 4;
#Dropping the Domain column
    data = data0.drop(['Domain'], axis = 1).copy()
# shuffling the rows in the dataset so that when splitting the train and test set are equally distributed
    data = data.sample(frac=1).reset_index(drop=True)
#data.head()
    y = data['Label']
    X = data.drop('Label',axis=1)
    print(X.transpose())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)    
    model = create_model()
# instantiate the model
    model.fit(X_train, y_train,
          batch_size=batch_size_v,
          epochs=epochs_v,  # only 2 epochs, for demonstration purposes
          verbose=1,
          validation_data=(X_test, y_test))

    score = model.evaluate(X_test, y_test, verbose=1)    
    return model
    
    

feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']
data1 = pd.read_csv("phishing.csv")
data1 = pd.DataFrame(data1, columns= feature_names)

data1 = data1.drop(['Domain'], axis = 1).copy()
data1 = data1.drop(['Label'], axis = 1).copy()

for i in range(0, 5000):
    x =  data1.iloc[i]
    y = pd.DataFrame(x.values.reshape(1,16))     
    predict_x=model.predict(y)
    classes_x=np.argmax(predict_x,axis=1)
    print(classes_x)

  

Solution

  • Use np.where(predict_x > 0.5, 1, 0) instead of np.argmax.