I am working on a PCA analysis using PySpark, but I'm getting errors due to the compatibility of the data read from the CSV file. What should I do? Would you please help me?
from __future__ import print_function
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import udf
import pandas as pd
import numpy as np
from numpy import array

conf = SparkConf().setAppName("building a warehouse")
sc = SparkContext(conf=conf)

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    data = sc.textFile('dataset.csv') \
        .map(lambda line: line.split(','))\
        .collect()

    # create a data frame from the data read from the csv file
    df = spark.createDataFrame(data, ["features"])
    # convert data to vector udt
    df.show()

    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)
    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)

    spark.stop()
Here is the error I'm getting:
File "C:/spark/spark-2.1.0-bin-hadoop2.7/bin/pca_bigdata.py", line 38, in <module>
model = pca.fit(df)
pyspark.sql.utils.IllegalArgumentException: u'requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually StringType.'
The error itself specifies that the column needs to be of type VectorUDT instead of StringType. Note that passing a schema alone is not enough: data holds lists of strings, so each row also has to be converted into an actual vector first. So this should work for you (the VectorUDT must come from pyspark.ml.linalg, not pyspark.mllib.linalg, because you are using the pyspark.ml.feature.PCA):
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import StructField, StructType

# convert each row of strings into a one-element tuple holding a dense vector
vector_data = [(Vectors.dense([float(x) for x in row]),) for row in data]

df = spark.createDataFrame(vector_data, StructType([
    StructField("features", VectorUDT(), True)
]))
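
Alternatively, if your CSV contains only numeric columns, you can skip the textFile/collect round trip and let Spark parse the file directly, then combine the columns into a single vector column with VectorAssembler. A minimal sketch, assuming dataset.csv has no header row and purely numeric values (the input column names are whatever spark.read.csv assigns, e.g. _c0, _c1, ...):

from pyspark.ml.feature import PCA, VectorAssembler

# read the csv directly into a DataFrame; inferSchema parses numeric
# columns as doubles instead of strings
csv_df = spark.read.csv('dataset.csv', header=False, inferSchema=True)

# assemble all columns into a single 'features' vector column
assembler = VectorAssembler(inputCols=csv_df.columns, outputCol="features")
features_df = assembler.transform(csv_df).select("features")

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(features_df)
model.transform(features_df).select("pcaFeatures").show(truncate=False)

This also avoids the collect() call in your original code, which pulls the whole dataset onto the driver; with read.csv and VectorAssembler everything stays distributed.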