There is a requirement in my project to encrypt some of PII columns data while writing data in a parquet file. To write the data in parquet file, Azure Synapse pyspark notebook is being used.
Not getting any references on the internet. Any leads would be very helpful.
We are using the below implementation. You can try anyone based on your scope
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
class DataFrameMasker:
def __init__(self, df: DataFrame):
self.df = df
def mask_columns(self, columns_to_mask: dict):
masked_df = self.df
for column_name, masking_strategy in columns_to_mask.items():
if masking_strategy == "hash":
masked_df = self._mask_hash(masked_df, column_name)
elif masking_strategy == "random":
masked_df = self._mask_random(masked_df, column_name)
elif masking_strategy=="column_masker":
masked_df = self.mask_rows(masked_df, column_name)
# Add more masking strategies here if needed
return masked_df
def _mask_hash(self, df: DataFrame, column_name: str) -> DataFrame:
masked_df = df.withColumn(column_name, F.sha2(F.col(column_name), 256))
return masked_df
def _mask_random(self, df: DataFrame, column_name: str) -> DataFrame:
masked_df = df.withColumn(column_name, F.expr("substring(md5(cast(rand() as string)), 0, 10)"))
return masked_df
def mask_rows(self, df: DataFrame, columns_name: str) -> DataFrame:
masked_df = df.withColumn(
columns_name,
expr(f"CASE WHEN {columns_name} IS NOT NULL THEN 'xxxxxxx' ELSE NULL END")
)
return masked_df
# Create a sample DataFrame
data = [("Subash", "K", "subashk@example.com"),
("Test", "Message", "testmessage@example.com")]
columns = ["first_name", "last_name", "email"]
df = spark.createDataFrame(data, columns)
# Initialize the DataFrameMasker with the DataFrame
masker = DataFrameMasker(df)
# Define columns to mask and their masking strategy
columns_to_mask = {
"first_name": "hash",
"last_name": "random",
"email": "column_masker"
}
# Mask specified columns
masked_df = masker.mask_columns(columns_to_mask)
# Show the masked DataFrame
masked_df.show()
You can check other masking methods as hashing is a one-way process