I'm trying to connect BigQuery Dataset to Databrick and run Script using Pyspark.
Procedures I've done:
I patched the BigQuery Json API to databrick in dbfs for connection access.
Then I added spark-bigquery-latest.jar in the cluster library and I ran my Script.
When I run this script, I didn't face any error.
from pyspark.sql import SparkSession
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("table","dataset.schema.tablename") \
.load()
df.show()
But Instead of calling a single table in that schema I tried to call all the tables under it using query like:
from pyspark.sql import SparkSession
from google.cloud import bigquery
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
client = bigquery.Client()
table_list = 'dataset.schema'
tables = client.list_tables(table_list)
for table in tables:
tlist = tlist.append(table)
for i in tlist:
sql_query = """select * from `dataset.schema.' + i +'`"""
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("query", sql_query).load()
df.show()
OR
This Script:
from pyspark.sql import SparkSession
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
sql_query = """select * from `dataset.schema.tablename`"""
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("query", sql_query).load()
df.show()
I get this unusual Error:
IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment. Please set a project ID using the builder.
---------------------------------------------------------------------------
IllegalArgumentException Traceback (most recent call last)
<command-131090852> in <module>
35 .option("parentProject", "google-project-ID") \
36 .option("project", "Dataset-Name") \
---> 37 .option("query", sql_query).load()
38 #df.show()
39
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
182 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
183 else:
--> 184 return self._df(self._jreader.load())
185
186 @since(1.4)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
131 # Hide where the exception came from that shows a non-Pythonic
132 # JVM exception message.
--> 133 raise_from(converted)
134 else:
135 raise
/databricks/spark/python/pyspark/sql/utils.py in raise_from(e)
IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment. Please set a project ID using the builder.
It do recognize my project ID when I call it as table, but when I run it as query I get this error.
I tried to figure it out and went through many sites for an answer but couldn't get a clear answer for it.
Help is much appreciated... Thanks in Advance...
Can you avoid using queries and just use the table option?
from pyspark.sql import SparkSession
from google.cloud import bigquery
spark = (
SparkSession.builder
.appName('bq')
.master('local[4]')
.config('parentProject', 'google-project-ID')
.config('spark.jars', 'dbfs:/FileStore/jars/jarlocation.jar') \
.getOrCreate()
)
client = bigquery.Client()
table_list = 'dataset.schema'
tables = client.list_tables(table_list)
for table in tables:
tlist = tlist.append(table)
for i in tlist:
df = spark.read.format("bigquery").option("credentialsFile", "/dbfs/FileStore/tables/bigqueryapi.json") \
.option("parentProject", "google-project-ID") \
.option("project", "Dataset-Name") \
.option("table","dataset.schema." + str(i)) \
.load()
df.show()