I am currently developing a Python script to retrieve a comprehensive list of all the jobs that were executed yesterday. However, I'm encountering an issue with the script's pagination mechanism using tokens. Despite my attempts to loop through the pagination process, the resulting output remains unchanged.
import requests
import pandas as pd
import math
import datetime
import json
def fetch_and_process_job_runs(base_uri, api_token, params):
endpoint = '/api/2.1/jobs/runs/list'
headers = {'Authorization': f'Bearer {api_token}'}
all_data = [] # To store all the data from multiple pages
while True:
# print(params)
response = requests.get(base_uri + endpoint, headers=headers, params=params)
response_json = response.json()
data = []
for run in response_json["runs"]:
start_time_ms = run["start_time"]
start_time_seconds = start_time_ms / 1000
start_time_readable = datetime.datetime.fromtimestamp(start_time_seconds).strftime('%Y-%m-%d %H:%M:%S')
data.append({
"job_id": run["job_id"],
"creator_user_name": run["creator_user_name"],
"run_name": run["run_name"],
"run_page_url": run["run_page_url"],
"run_id": run["run_id"],
"execution_duration_in_mins": math.ceil(int(run.get('execution_duration')) / (1000 * 60)),
"result_state": run["state"].get("result_state"),
"start_time": start_time_readable
})
all_data.extend(data)
df = pd.DataFrame(all_data)
print(df)
if response_json.get("has_more") == True:
next_page_token = response_json.get("next_page_token")
params['next_page_token'] = next_page_token
else:
break
df = pd.DataFrame(all_data)
return df
# Replace with your actual values
now = datetime.datetime.utcnow()
yesterday = now - datetime.timedelta(days=1)
start_time_from = int(yesterday.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()) * 1000
start_time_to = int(yesterday.replace(hour=23, minute=59, second=59, microsecond=999999).timestamp()) * 1000
params = {
# "start_time_from": start_time_from,
# "start_time_to": start_time_to,
"expand_tasks": True
}
baseURI = 'https://adb-xxxxxxxxxxxxxx.azuredatabricks.net'
apiToken = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
result_df = fetch_and_process_job_runs(baseURI, apiToken, params)
print(result_df)
Please help me.
I noticed that the value of next_token wasn't changing in API response and then figured that you have a very small error in your code. The parameter to be passed in request is page_token
and not next_page_token
.
As per documentation at https://docs.databricks.com/api/workspace/jobs/list,
page_token string
Use next_page_token
or prev_page_token
returned from the previous request to list the next or previous page of jobs respectively.
So params['next_page_token']
needs to change to params['page_token']