Search code examples
pyspark, apache-spark-sql, pyspark-schema

how to change a column type in array struct by pyspark


How can I change the type of a column nested inside an array of structs in PySpark? For example, I would like to change userid from int to long.

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- m1: long (nullable = true)
 |    |    |-- m2: long (nullable = true)
 |    |    |-- m3: struct (nullable = true)
 |    |    |    |-- userid: integer (nullable = true)
 

Solution

  • It would have been useful if you had provided a reproducible DataFrame as well.

    Following your comments below, see the following code.

      # Reproducible sample with the same nested layout as in the question.
      # NOTE(review): this snippet uses `spark` and the pyspark.sql.types
      # classes (StructType, StructField, ArrayType, ...) without defining
      # them here -- presumably they are already in scope; confirm before
      # running.

      # Build the schema inside-out: innermost struct first.
      m3_type = StructType([
          StructField('userid', IntegerType(), True),
      ])
      element_type = StructType([
          StructField('m1', LongType(), True),
          StructField('m2', LongType(), True),
          StructField('m3', m3_type, True),
      ])
      sch = StructType([
          StructField('id', StringType(), False),
          StructField('numbers', ArrayType(element_type), True),
      ])

      # Each row: (id, [ (m1, m2, (userid,)) ]).
      rows = [
          ('21', [(1234567, 9876543, (1,))]),
          ('34', [(63467892345, 19523789, (2,))]),
      ]
      df = spark.createDataFrame(rows, schema=sch)

      df.printSchema()
    
    root
     |-- id: string (nullable = false)
     |-- numbers: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- m1: long (nullable = true)
     |    |    |-- m2: long (nullable = true)
     |    |    |-- m3: struct (nullable = true)
     |    |    |    |-- userid: integer (nullable = true)
    

    Solution

    df1 = df.selectExpr(
      "id",
      
      "CAST(numbers AS array<struct<m1:long,m2:long, m3:struct<userid:double>>>) numbers"
    )
    
    df1.printSchema()
    
    root
     |-- id: string (nullable = false)
     |-- numbers: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- m1: long (nullable = true)
     |    |    |-- m2: long (nullable = true)
     |    |    |-- m3: struct (nullable = true)
     |    |    |    |-- userid: double (nullable = true)