Search code examples
python, python-3.x, dask, dask-distributed, dask-delayed

Where does Dask store files while running on JupyterLab?


I'm running Dask on JupyterLab. I'm trying to save a file in the home directory, where my Python file is stored. The code runs without errors, but I can't find out where the files are being saved. So I created a folder named Output in the home directory to save the files into, but when I write a file there I get the following error:

PermissionError: [Errno 13] Permission denied: b'/home/jovyan/Output/20190101_0200'

Here's my try:

# Connect to the dask-gateway server, authenticating through JupyterHub.
from dask_gateway import Gateway
gateway = Gateway(
    address="http://traefik-pangeo-dask-gateway/services/dask-gateway",
    public_address="https://pangeo.aer-gitlab.com/services/dask-gateway",
    auth="jupyterhub",
)
# Fetch the cluster options exposed by the gateway. The bare name below only
# displays them when it is the last expression of a notebook cell.
options = gateway.cluster_options()
options

# Start a new cluster configured with those options.
cluster = gateway.new_cluster(
    cluster_options=options,
)
# Turn on adaptive scaling, bounded to between 10 and 50 workers.
cluster.adapt(minimum=10, maximum=50)
# Client used to submit work to this cluster.
# NOTE(review): with dask-gateway the workers presumably run on machines/pods
# separate from the notebook server, so relative paths used inside tasks
# resolve on the worker's filesystem, not in the notebook's home directory --
# confirm for this deployment.
client = cluster.get_client()
cluster
client

def get_turb(file, name, out_path='Output/edr.csv'):
    """Extract the EDR records of one gzip-compressed dataset, ranked by maxEDR.

    The payload is decompressed, opened with xarray, reduced to the columns
    ``recNum``, ``trackLat``, ``trackLon``, ``altitude`` and ``maxEDR``,
    sorted by ascending ``maxEDR``, stripped of rows containing NaN, and
    given a ``num`` column holding each row's 0-based rank in that order.

    Parameters
    ----------
    file : bytes
        Gzip-compressed bytes of a dataset readable by ``xr.open_dataset``.
    name : str
        Name of the source file. Kept so existing callers keep working; it
        is currently unused (the date fields previously sliced out of it
        were never consumed).
    out_path : str or None, optional
        Destination of the CSV export. A relative path is resolved against
        the working directory of the process that executes this function;
        when the function runs as a Dask task that is the worker, not the
        notebook. Every call writes to the same default path, so parallel
        tasks should pass distinct paths, or ``None`` to skip the write and
        persist the returned frame on the client instead.

    Returns
    -------
    pandas.DataFrame
        The ranked table (previously nothing was returned, so the result
        collected by ``dask.compute`` was always ``None``).
    """
    payload = gzip.decompress(file)

    columns = ['recNum', 'trackLat', 'trackLon', 'altitude', 'maxEDR']
    # to_dataframe() materialises the values, so the dataset can be closed
    # as soon as the frame exists instead of being left open.
    with xr.open_dataset(payload) as dat:
        dd = dat[columns].to_dataframe()

    dd = dd.sort_values(by=['maxEDR']).dropna()
    dd['num'] = np.arange(len(dd))

    if out_path is not None:
        dd.to_csv(out_path)  # saving the file
    return dd

# Queue one lazy get_turb task for each of the first two inputs.
make_task = dask.delayed(get_turb)
edr_files = []
for i in (0, 1):
    print(names[i])
    edr_files.append(make_task(filedata[i], names[i]))

# Execute all queued tasks; edr_files is rebound to the tuple of results.
edr_files = dask.compute(*edr_files)

Please let me know what I'm doing wrong or what's the possible solution.

Also, when I try to save the file directly to an S3 bucket using the following code:

    # Upload the local archive to the 'temp' bucket under the same key name.
    # NOTE(review): `s3` is not defined in this snippet; presumably it is a
    # boto3 resource created outside the task function. The reported error
    # ("cannot pickle '_thread.lock' object" while serializing the function)
    # is consistent with such an object being captured when the function is
    # shipped to the workers -- confirm, and if so create it inside the task.
    s3.Bucket('temp').upload_file(file_name+'.zip', file_name+'.zip')

It's throwing this error:

distributed.protocol.pickle - INFO - Failed to serialize <function get_temp at 0x7f20a9cb8550>. Exception: cannot pickle '_thread.lock' object
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
   3319         with _cache_lock:
-> 3320             result = cache_dumps[func]
   3321     except KeyError:

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in __getitem__(self, key)
   1572     def __getitem__(self, key):
-> 1573         value = super().__getitem__(key)
   1574         self.data.move_to_end(key)

/srv/conda/envs/notebook/lib/python3.8/collections/__init__.py in __getitem__(self, key)
   1009             return self.__class__.__missing__(self, key)
-> 1010         raise KeyError(key)
   1011     def __setitem__(self, key, item): self.data[key] = item

KeyError: <function get_temp at 0x7f20a9cb8550>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
     52                 buffers.clear()
---> 53                 result = cloudpickle.dumps(x, **dump_kwargs)
     54         elif not _always_use_pickle_for(x) and b"__main__" in result:

/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
     72             )
---> 73             cp.dump(obj)
     74             return file.getvalue()

/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
    562         try:
--> 563             return Pickler.dump(self, obj)
    564         except RuntimeError as e:

TypeError: cannot pickle '_thread.lock' object


Solution

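  • Try using s3fs instead of boto3 to upload the files to S3. That might work. The error is raised while Dask pickles the function to send it to the workers: an object the function references (most likely the boto3 `s3` resource) holds a `_thread.lock`, which cannot be pickled. Creating the S3 handle inside the task function, for example an `s3fs.S3FileSystem`, avoids shipping that object.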