Search code examples
python scikit-learn pca

scikit-learn PCA transform returns incorrect reduced feature length


I am trying to apply PCA in my code. I load my training data and reduce it using the following code:

def gather_train():
    """Load every training image listed in the ``training_info`` CSV file.

    Each line of the file named by the module-level ``training_info`` is
    read as ``<image file name>,<integer label>``.  The image path is the
    file name prefixed with ``train`` and a backslash; the file is read as
    raw unsigned bytes, viewed as rows of 784 values and divided by 255.0.

    Returns:
        A ``(train_data, train_labels)`` tuple.  ``train_data`` has shape
        ``(len(train_labels), 784)`` and ``train_labels`` is a 1-D float64
        array with one entry per CSV line.

    Raises:
        ValueError: if an image file's size is not a multiple of 784 bytes,
            or if the total pixel count does not equal
            ``len(train_labels) * 784``.
    """
    # Collect the pieces in plain lists and build the arrays once at the
    # end: np.append copies the whole array on every call, which makes a
    # per-line append loop quadratic in the number of images.
    pixel_chunks = []
    labels = []
    with open(training_info, "r") as traincsv:
        for line in traincsv:
            fields = line.strip().split(",")
            current_image = "train\\{}".format(fields[0])
            print("Reading data from: {}".format(current_image))
            labels.append(int(fields[1]))
            with open(current_image, "rb") as img:
                pixels = np.fromfile(img, dtype=np.uint8).reshape(-1, 784) / 255.0
            # Keep the chunk flat; the final reshape below restores the rows.
            pixel_chunks.append(pixels.ravel())
    # float64 matches what appending ints to an empty np.array([]) produced.
    train_labels = np.array(labels, dtype=np.float64)
    if pixel_chunks:
        train_data = np.concatenate(pixel_chunks)
    else:
        # np.concatenate rejects an empty list; an empty CSV yields (0, 784).
        train_data = np.array([])
    train_data = train_data.reshape(len(train_labels), 784)
    return train_data, train_labels

def get_PCA_train(data):
    """Fit a PCA on ``data`` and return ``data`` projected onto it.

    The number of components is taken from the module-level
    ``PCA_components``.  The fitted PCA object is local to this function
    and is not returned to the caller.

    Args:
        data: the samples to fit the PCA on and to reduce.

    Returns:
        The result of ``fit_transform`` applied to ``data``.
    """
    print("\nFitting PCA. Components: {} ...".format(PCA_components))
    pca = decomposition.PCA(n_components=PCA_components)
    print("\nReducing data to {} components ...".format(PCA_components))
    # fit_transform fits the model itself, so calling fit() beforehand
    # would only repeat the same decomposition and throw the result away.
    data_reduced = pca.fit_transform(data)
    return data_reduced

def get_PCA_test(data, fit_data=None):
    """Reduce ``data`` with a PCA of ``PCA_components`` components.

    Args:
        data: the samples to reduce.
        fit_data: optional samples to fit the PCA on, normally the training
            set.  When omitted, the PCA is fitted on ``data`` itself, which
            is what this function has always done.

    Returns:
        ``data`` transformed by the fitted PCA.

    Note:
        A PCA cannot keep more components than the number of samples it was
        fitted on.  Fitting on a single test row therefore gives a result
        of width 1 instead of ``PCA_components``; pass the training set as
        ``fit_data`` to get the same width as the reduced training data.
    """
    if fit_data is None:
        # Backward-compatible default: fit on the data being reduced.
        fit_data = data
    print("\nFitting PCA. Components: {} ...".format(PCA_components))
    pca = decomposition.PCA(n_components=PCA_components).fit(fit_data)
    print("\nReducing data to {} components ...".format(PCA_components))
    data_reduced = pca.transform(data)
    return data_reduced

def gather_test(imgfile):
    """Read one image file and return its pixel values divided by 255.0.

    Unlike gather_train, which gathers all images at once, this reads the
    single file given by ``imgfile``.  The raw bytes are viewed as rows of
    784 unsigned 8-bit values before scaling.
    """
    with open(imgfile, "rb") as handle:
        raw_bytes = np.fromfile(handle, dtype=np.uint8)
    rows = raw_bytes.reshape(-1, 784)
    return rows / 255.0

...

train_data = gather_train()
train_data_reduced = get_PCA_train(train_data)
print train_data.ndim, train_data.shape
print train_data_reduced.ndim, train_data_reduced.shape

It prints the following, which is expected:

2 (1000L, 784L)
2 (1000L, 300L)

But when I begin to reduce my test data:

# Load a single test image, then pass it through get_PCA_test.
test_data = gather_test(image_file)
# image_file is 784 bytes (28x28) of pixel values; 1 byte = 1 pixel value
# Called with one argument, get_PCA_test fits its PCA on the data it is
# given, which here is just this one image.
test_data_reduced = get_PCA_test(test_data)
# Show the rank and shape before and after the reduction.
print test_data.ndim, test_data.shape
print test_data_reduced.ndim, test_data_reduced.shape

The output is:

2 (1L, 784L)
2 (1L, 1L)

which results in the error later on:

ValueError: X.shape[1] = 1 should be equal to 300, the number of features at training time

Why is test_data_reduced of shape (1,1), not (1,300)? I have tried using fit_transform for the training data and only transform for the test data, but I still get the same error.


Solution

  • The call to PCA has to look roughly like this:

    # Fit the PCA once, on the training data only.
    pca = decomposition.PCA(n_components=PCA_components).fit(train_data)
    # Reuse that same fitted object to project the test data.
    data_reduced = pca.transform(test_data)
    

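    First you call fit on the training data, and then you call transform on the test data that you want to reduce. In the question, get_PCA_test fits a new PCA on the single test sample instead; a PCA cannot keep more components than the number of samples it was fitted on, which is why the result has shape (1, 1) rather than (1, 300).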