Search code examples
python machine-learning data-science knn

Different output for same function


I have implemented a KNN algorithm in Python.

import math

            #height,width,depth,thickness,Label
data_set = [
    (2, 9, 8, 4, "Good"),
    (3, 7, 7, 9, "Bad"),
    (10, 3, 10, 3, "Good"),
    (2, 9, 6, 10, "Good"),
    (3, 3, 2, 5, "Bad"),
    (2, 8, 5, 6, "Bad"),
    (7, 2, 3, 10, "Good"),
    (1, 10, 8, 10, "Bad"),
    (2, 8, 1, 10, "Good"),
]


# Unlabelled query points, one value per feature column.
A = (3, 2, 1, 5)
B = (8, 3, 1, 2)
C = (6, 10, 8, 3)
D = (9, 6, 4, 1)


# Module-level lists that calc_distance and get_neibours append to.
distances = []
labels = []

def calc_distance(datas, test):
    """Append one ``(distance, label)`` pair per row of *datas* to ``distances``.

    The distance is the Euclidean distance between the first four values of
    the row and the first four values of *test*, rounded to three decimals;
    the label is the row's fifth value. The pairs go into the module-level
    ``distances`` list, which is not cleared first, and that same list is
    returned.
    """
    for row in datas:
        squared_sum = 0
        for axis in range(4):
            squared_sum += (row[axis] - test[axis]) ** 2
        distances.append((round(math.sqrt(squared_sum), 3), row[4]))
    return distances

def most_frequent(list1):
    """Return the value that occurs most often in *list1*.

    Each distinct value is scored with ``list1.count``; when several values
    share the top count, the one that ``max`` meets first while iterating
    the set is returned.
    """
    distinct_values = set(list1)
    return max(distinct_values, key=lambda value: list1.count(value))

def get_neibours(k):
    """Print the *k* nearest entries of ``distances`` and the label voted for.

    Sorts the module-level ``distances`` list in place, prints its first *k*
    entries, appends their labels to the module-level ``labels`` list (which
    is not cleared first), and prints whatever ``most_frequent`` picks from
    ``labels``.
    """
    distances.sort()
    nearest = distances[:k]
    print(nearest)
    labels.extend(entry[1] for entry in nearest)
    print("It can be classified as: ", end="")
    print(most_frequent(labels))



# Classify D using its 7 nearest neighbours.
calc_distance(data_set,D)
get_neibours(7)

# Run the identical query again; `distances` and `labels` are module-level
# lists, so this call appends to what the first call left behind.
calc_distance(data_set,D)
get_neibours(7)

It works well for the most part, and I get the correct label. For example, for D I do get the label "Good". However, I discovered a bug when I call it twice, for example:

 calc_distance(data_set,D)
get_neibours(7)

calc_distance(data_set,D)
get_neibours(7)

and run the program several times, I get different outputs from one run to the next: sometimes "Good" and sometimes "Bad". [Image: enter image description here]

There must be a bug somewhere that I am unable to find.


Solution

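  • The problem is that `distances` and `labels` are module-level lists that are never cleared, so every call appends to what the previous call left behind. On the second call, `distances` holds every entry twice, so sorting and taking the first k = 7 elements selects a different set of neighbours (3 "Good", 4 "Bad"), and `labels` now holds the 7 labels from the first call plus these 7: 7 "Good" and 7 "Bad" in total. `max(set(labels), key=labels.count)` resolves that tie by the set's iteration order, which for strings can change between runs, hence the alternating "Good"/"Bad". Create the lists inside the functions and return them. Check the modifications below.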

    import math
    
    # Labelled samples: four numeric features followed by the class label.
    data_set = [
        (2, 9, 8, 4, "Good"),
        (3, 7, 7, 9, "Bad"),
        (10, 3, 10, 3, "Good"),
        (2, 9, 6, 10, "Good"),
        (3, 3, 2, 5, "Bad"),
        (2, 8, 5, 6, "Bad"),
        (7, 2, 3, 10, "Good"),
        (1, 10, 8, 10, "Bad"),
        (2, 8, 1, 10, "Good"),
    ]

    # Unlabelled query points, one value per feature column.
    A = (3, 2, 1, 5)
    B = (8, 3, 1, 2)
    C = (6, 10, 8, 3)
    D = (9, 6, 4, 1)
    
    def calc_distance(datas, test):
        """Return a new list with one ``(distance, label)`` pair per row of *datas*.

        The distance is the Euclidean distance between the first four values
        of the row and the first four values of *test*, rounded to three
        decimals; the label is the row's fifth value.
        """
        def rounded_distance(row):
            squared_sum = sum((row[axis] - test[axis]) ** 2 for axis in range(4))
            return round(math.sqrt(squared_sum), 3)

        return [(rounded_distance(row), row[4]) for row in datas]
    
    def most_frequent(list1):
        """Return the value that occurs most often in *list1*.

        Ties are resolved in favour of the value that appears first in
        *list1*, so the result is the same on every run. (Choosing with
        ``max(set(list1), ...)`` instead leaves ties to the set's iteration
        order, which for strings can change between interpreter runs.)

        Raises:
            ValueError: if *list1* is empty.
        """
        counts = {}
        for value in list1:
            counts[value] = counts.get(value, 0) + 1
        # Dicts keep insertion order and max() returns the first maximal key,
        # so the earliest-seen value wins a tie.
        return max(counts, key=counts.get)
    
    def get_neibours(distances, k):
        """Print the *k* nearest neighbours and the label they vote for.

        Args:
            distances: iterable of ``(distance, label)`` pairs, e.g. the list
                returned by ``calc_distance``. It is not modified.
            k: number of nearest neighbours to consult; must be at least 1.

        Returns:
            The label chosen by ``most_frequent`` among the *k* nearest pairs.

        Raises:
            ValueError: if *k* is less than 1 or *distances* is empty.
        """
        if k < 1:
            # A negative k would make the slice below drop entries from the
            # far end instead of selecting the nearest ones.
            raise ValueError("k must be at least 1")
        # sorted() builds a new list, so the caller's list keeps its order.
        nearest = sorted(distances)[:k]
        if not nearest:
            raise ValueError("distances must not be empty")
        print(nearest)
        labels = [pair[1] for pair in nearest]
        prediction = most_frequent(labels)
        print("It can be classified as: ", end="")
        print(prediction)
        return prediction
    
    # Each query gets its own freshly built distance list, which is then
    # handed to get_neibours.
    distances = calc_distance(data_set,D)
    get_neibours(distances, 7)
    
    # Repeating the query starts again from a new list.
    distances = calc_distance(data_set,D)
    get_neibours(distances, 7)
    

    [(7.071, 'Good'), (8.062, 'Bad'), (8.888, 'Bad'), (9.11, 'Good'), (10.1, 'Good'), (10.488, 'Bad'), (11.958, 'Good')] It can be classified as: Good

    [(7.071, 'Good'), (8.062, 'Bad'), (8.888, 'Bad'), (9.11, 'Good'), (10.1, 'Good'), (10.488, 'Bad'), (11.958, 'Good')] It can be classified as: Good