I'm using Google Cloud DLP to inspect sensitive data in BigQuery. I wonder is it possible to inspect all tables within a dataset with one dlpJob? If so, how should I set the configs?
I tried to omit the BQ tableId field in config. But it will return http 400 error "table_id
must be set". Does it mean that with one dlpJob, only one table can be inspected, and to scan multiple tables we need multiple dlpJobs? Or is there a way to scan multiple tables within the same dataset with some regex tricks?
At the moment, one job just scans one table. The team is working on that feature - in the meantime you can manually create jobs with a rough shell script like what I've put below which combines gcloud and the rest calls to the dlp api. You could probably do something a lot smoother with cloud functions.
Prerequisites:
1. Install gcloud. https://cloud.google.com/sdk/install
2. Run this script with the following arguments:
3.
1. The project_id to scan bigquery tables of.
2. The dataset id for the output table to store findings to.
3. The table id for the output table to store findings to.
4. A number that represents the percentage of rows to scan.
# Example: # ./inspect_all_bq_tables.sh dlapi-test findings_daataset
# Reports a status of execution message to the log file and serial port
function report() {
local tag="${1}"
local message="${2}"
local timestamp="$(date +%s)000"
echo "${timestamp} - ${message}"
}
readonly -f report
# report_status_update
#
# Reports a status of execution message to the log file and serial port
function report_status_update() {
report "${MSGTAG_STATUS_UPDATE}" "STATUS=${1}"
}
readonly -f report_status_update
# create_job
#
# Creates a single dlp job for a given bigquery table.
function create_dlp_job {
local dataset_id="$1"
local table_id="$2"
local create_job_response=$(curl -s -H \
"Authorization: Bearer $(gcloud auth print-access-token)" \
-H "X-Goog-User-Project: $PROJECT_ID" \
-H "Content-Type: application/json" \
"$API_PATH/v2/projects/$PROJECT_ID/dlpJobs" \
--data '
{
"inspectJob":{
"storageConfig":{
"bigQueryOptions":{
"tableReference":{
"projectId":"'$PROJECT_ID'",
"datasetId":"'$dataset_id'",
"tableId":"'$table_id'"
},
"rowsLimitPercent": "'$PERCENTAGE'"
},
},
"inspectConfig":{
"infoTypes":[
{
"name":"ALL_BASIC"
}
],
"includeQuote":true,
"minLikelihood":"LIKELY"
},
"actions":[
{
"saveFindings":{
"outputConfig":{
"table":{
"projectId":"'$PROJECT_ID'",
"datasetId":"'$FINDINGS_DATASET_ID'",
"tableId":"'$FINDINGS_TABLE_ID'"
},
"outputSchema": "BASIC_COLUMNS"
}
}
},
{
"publishFindingsToCloudDataCatalog": {}
}
]
}
}')
if [[ $create_job_response != *"dlpJobs"* ]]; then
report_status_update "Error creating dlp job: $create_job_response"
exit 1
fi
local new_dlpjob_name=$(echo "$create_job_response" \
head -5 | grep -Po '"name": *\K"[^"]*"' | tr -d '"' | head -1)
report_status_update "DLP New Job: $new_dlpjob_name"
}
readonly -f create_dlp_job
# List the datasets for a given project. Once we have these we can list the
# tables within each one.
function create_jobs() {
# The grep pulls the dataset id. The td removes the quotation marks.
local list_datasets_response=$(curl -s -H \
"Authorization: Bearer $(gcloud auth print-access-token)" -H \
"Content-Type: application/json" \
"$BIGQUERY_PATH/projects/$PROJECT_ID/datasets")
if [[ $list_datasets_response != *"kind"* ]]; then
report_status_update "Error listing bigquery datasets: $list_datasets_response"
exit 1
fi
local dataset_ids=$(echo $list_datasets_response \
| grep -Po '"datasetId": *\K"[^"]*"' | tr -d '"')
# Each row will look like "datasetId", with the quotation marks
for dataset_id in ${dataset_ids}; do
report_status_update "Looking up tables for dataset $dataset_id"
local list_tables_response=$(curl -s -H \
"Authorization: Bearer $(gcloud auth print-access-token)" -H \
"Content-Type: application/json" \
"$BIGQUERY_PATH/projects/$PROJECT_ID/datasets/$dataset_id/tables")
if [[ $list_tables_response != *"kind"* ]]; then
report_status_update "Error listing bigquery tables: $list_tables_response"
exit 1
fi
local table_ids=$(echo "$list_tables_response" \
| grep -Po '"tableId": *\K"[^"]*"' | tr -d '"')
for table_id in ${table_ids}; do
report_status_update "Creating DLP job to inspect table $table_id"
create_dlp_job "$dataset_id" "$table_id"
done
done
}
readonly -f create_jobs
PROJECT_ID=$1
FINDINGS_DATASET_ID=$2
FINDINGS_TABLE_ID=$3
PERCENTAGE=$4
API_PATH="https://dlp.googleapis.com"
BIGQUERY_PATH="https://www.googleapis.com/bigquery/v2"
# Main
create_jobs