I'm running Dask in JupyterLab. I'm trying to save some files in my home directory, where my Python file is stored. The code runs without errors, but I can't find out where the files are being saved. So I created a folder named Output in my home directory to save the files in, but when I save a file there I get the following error:
PermissionError: [Errno 13] Permission denied: b'/home/jovyan/Output/20190101_0200'
Here is my attempt:
from dask_gateway import Gateway
# Connect to the Dask Gateway server, authenticating through JupyterHub.
gateway = Gateway(
    address="http://traefik-pangeo-dask-gateway/services/dask-gateway",
    public_address="https://pangeo.aer-gitlab.com/services/dask-gateway",
    auth="jupyterhub",
)

# Fetch the cluster options offered by the gateway. The bare name below is an
# expression statement (presumably there so a notebook cell displays it).
options = gateway.cluster_options()
options

# Start a new cluster with those options and let it scale adaptively
# between 10 and 50 workers.
cluster = gateway.new_cluster(cluster_options=options)
cluster.adapt(minimum=10, maximum=50)

# Client bound to the new cluster; the bare names are again expression
# statements (presumably for notebook display).
client = cluster.get_client()
cluster
client
def get_turb(file, name, out_path='Output/edr.csv'):
    """Decode one gzip-compressed dataset into a table sorted by ``maxEDR``.

    The payload is decompressed, opened with ``xr.open_dataset``, reduced to
    the variables ``recNum``, ``trackLat``, ``trackLon``, ``altitude`` and
    ``maxEDR``, converted to a DataFrame, sorted by ``maxEDR``, stripped of
    rows containing missing values, and given a ``num`` column that numbers
    the remaining rows from 0 in sorted order.

    Parameters
    ----------
    file : bytes
        Gzip-compressed file content.
    name : str
        Label of the input. Kept for call compatibility; the date fields
        that used to be sliced out of it were never used by the computation.
    out_path : str or None, optional
        Where to write the table as CSV. Defaults to ``'Output/edr.csv'``
        (the previously hard-coded path). NOTE: a relative path is resolved
        by the process that executes this function; when the function is
        submitted through ``dask.delayed`` that is a worker, not the
        notebook, so the file will not appear in the notebook's home
        directory and the worker may lack write permission. Every call
        using the default also targets the same file name. Pass a distinct
        destination per input (presumably an ``s3://`` URL works when s3fs
        is installed -- confirm against the pandas docs), or ``None`` to
        skip the write and save the returned table on the client instead.

    Returns
    -------
    pandas.DataFrame
        The sorted, NaN-free table, so the caller of ``dask.compute`` can
        persist it wherever it has write access.
    """
    raw = gzip.decompress(file)
    columns = ['recNum', 'trackLat', 'trackLon', 'altitude', 'maxEDR']

    # Close the dataset as soon as the selected variables have been
    # materialised as a DataFrame.
    with xr.open_dataset(raw) as dat:
        dd = dat[columns].to_dataframe()

    dd = dd.sort_values(by=['maxEDR']).dropna()
    dd['num'] = np.arange(len(dd))

    if out_path is not None:
        dd.to_csv(out_path)  # Saving the file
    return dd
# Build one lazy get_turb task for each of the first two inputs, then run
# them all in a single dask.compute call; edr_files ends up holding the
# tuple of task results.
delayed_get_turb = dask.delayed(get_turb)
edr_files = []
for i in range(2):
    print(names[i])
    s3_ds = delayed_get_turb(filedata[i], names[i])
    edr_files.append(s3_ds)
edr_files = dask.compute(*edr_files)
Please let me know what I'm doing wrong, or what a possible solution would be.
Also, when I try to save the file directly to an S3 bucket using the following code:
# Upload the local file `<file_name>.zip` to the bucket 'temp' under the same key.
# NOTE(review): `s3` is defined outside this excerpt. If it is a boto3 resource
# created in the notebook and referenced from inside the dask task, it has to be
# pickled together with the task function; the reported
# "cannot pickle '_thread.lock' object" error is consistent with that.
# Presumably creating the S3 handle inside the task avoids it -- confirm.
s3.Bucket('temp').upload_file(file_name+'.zip', file_name+'.zip')
It's throwing this error:
distributed.protocol.pickle - INFO - Failed to serialize <function get_temp at 0x7f20a9cb8550>. Exception: cannot pickle '_thread.lock' object
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
3319 with _cache_lock:
-> 3320 result = cache_dumps[func]
3321 except KeyError:
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in __getitem__(self, key)
1572 def __getitem__(self, key):
-> 1573 value = super().__getitem__(key)
1574 self.data.move_to_end(key)
/srv/conda/envs/notebook/lib/python3.8/collections/__init__.py in __getitem__(self, key)
1009 return self.__class__.__missing__(self, key)
-> 1010 raise KeyError(key)
1011 def __setitem__(self, key, item): self.data[key] = item
KeyError: <function get_temp at 0x7f20a9cb8550>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
52 buffers.clear()
---> 53 result = cloudpickle.dumps(x, **dump_kwargs)
54 elif not _always_use_pickle_for(x) and b"__main__" in result:
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
TypeError: cannot pickle '_thread.lock' object
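Try using s3fs instead of boto3 to upload the files to S3. The `cannot pickle '_thread.lock' object` error suggests that the boto3 object referenced inside your function cannot be serialized when Dask ships the function to the workers; creating the S3 filesystem inside the function with s3fs should avoid that. That might work.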