My problem is that I am trying to use the SciPy API inside a PySpark pandas UDF.
# Sample input: two integer columns, N and P.
columns = ["N", "P"]
data = [(1, 3), (3, 3), (5, 3)]
df = spark.createDataFrame(schema=columns, data=data)
+---+---+
|N |P |
+---+---+
|1 |3 |
|3 |3 |
|5 |3 |
+---+---+
@pandas_udf("double")
def func(s1: pd.Series, s2: pd.Series) -> pd.Series:
    """Return ``s1 + s2 * pi`` element-wise as a series of doubles.

    Args:
        s1: Batch of values from the first input column.
        s2: Batch of values from the second input column.

    Returns:
        A series holding ``s1 + s2 * pi`` for each pair of input values.
    """
    # Kept as a function-local import, as in the original snippet.
    # ``scipy.constants.pi`` replaces the top-level ``scipy.pi`` alias, which
    # was deprecated and is not available in newer SciPy releases.
    from scipy import constants

    # The decorator's return type has to describe this single floating-point
    # series; a "col1 int, col2 int" struct schema does not match it.
    return s1 + s2 * constants.pi
The desired output after applying this function is a new column containing the transformed values. Thanks in advance.
# Apply the UDF to columns N and P and store the result as "transformed".
df2 = df.withColumn("transformed", func("N", "P"))
>>> from pyspark.sql.functions import pandas_udf
>>> from pyspark.sql.types import FloatType
>>> import pandas as pd
>>>
>>> @pandas_udf(FloatType())
... def p_udf(s1: pd.Series, s2: pd.Series) -> pd.Series:
... return s1 + (s2 * 3.14)
...
>>>
>>> df = spark.createDataFrame(data=[(1,3), (3,3), (5,3)], schema=['N','P'])
>>> df.withColumn('transformed', p_udf("N", "P")).show()
+---+---+-----------+
| N| P|transformed|
+---+---+-----------+
| 1| 3| 10.42|
| 3| 3| 12.42|
| 5| 3| 14.42|
+---+---+-----------+
>>>