Tags: python, csv, pyspark, analysis, bigdata

PCA analysis with PySpark


I am working on a PCA analysis using PySpark, but I'm getting errors because of the type of the data read from the CSV file. What should I do? Could you please help me?

from __future__ import print_function
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import udf
import pandas as pd
import numpy as np
from numpy import array


conf = SparkConf().setAppName("building a warehouse")
sc = SparkContext(conf=conf)

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()



    data = sc.textFile('dataset.csv') \
        .map(lambda line: line.split(','))\
        .collect()
    # create a data frame from the data read from the CSV file
    df = spark.createDataFrame(data, ["features"])
    # convert data to VectorUDT

    df.show()

    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)

    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)

    spark.stop()

Here is the error I'm getting:

File "C:/spark/spark-2.1.0-bin-hadoop2.7/bin/pca_bigdata.py", line 38, in       <module>
model = pca.fit(df)
pyspark.sql.utils.IllegalArgumentException: u'requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually StringType.'

Solution

  • The error itself tells you what is wrong: the features column must be of type VectorUDT, not StringType. Two things matter here: the vector types must come from pyspark.ml.linalg rather than pyspark.mllib.linalg (the error names org.apache.spark.ml.linalg.VectorUDT, and you are using pyspark.ml.feature.PCA), and each parsed CSV line has to be converted into a vector of floats instead of being left as strings. So this should work for you:

    from pyspark.ml.linalg import Vectors, VectorUDT
    from pyspark.sql.types import StructField, StructType

    # parse each line into a dense vector of floats; each row must be a tuple
    data = sc.textFile('dataset.csv') \
        .map(lambda line: (Vectors.dense([float(x) for x in line.split(',')]),))
    df = spark.createDataFrame(data, StructType([
        StructField("features", VectorUDT(), True)
    ]))
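
  • Alternatively, a minimal sketch assuming dataset.csv contains only numeric columns and has no header row (an assumption about your file, not something stated in the question): read it with Spark's built-in CSV reader and let VectorAssembler build the features column, which avoids the hand-rolled parsing entirely:

    from pyspark.ml.feature import PCA, VectorAssembler

    # assumption: every column in dataset.csv is numeric and there is no header
    df = spark.read.csv('dataset.csv', inferSchema=True, header=False)

    # pack all input columns into a single vector column named "features"
    assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
    vector_df = assembler.transform(df).select("features")

    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(vector_df)
    model.transform(vector_df).select("pcaFeatures").show(truncate=False)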