I am trying to train a model on GCP with keras and tensorflow 1.15. From now my code is similar to what I could do on colab, namely :
# TPUs
import tensorflow as tf
print(tf.__version__)
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver("tpu-name")
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
print("Number of accelerators: ", tpu_strategy.num_replicas_in_sync)
import numpy as np
np.random.seed(123) # for reproducibility
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Input
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist, cifar10
from tensorflow.keras.models import Model
# 4. Load data into train and test sets
(X_train, y_train) = load_data(sets="gs://BUCKETS/dogscats/train/",target_size=img_size)
(X_test, y_test) = load_data(sets="gs://BUCKETS/dogscats/valid/",target_size=img_size)
print(X_train.shape, X_test.shape)
# 5. Preprocess input data
#X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)
#X_test = X_test.reshape(X_test.shape[0], 28, 28,1)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255.0
X_test /= 255.0
print(y_train.shape, y_test.shape)
# 6. Preprocess class labels One hot encoding
Y_train = utils.to_categorical(y_train, 2)
Y_test = utils.to_categorical(y_test, 2)
print(Y_train.shape, Y_test.shape)
with tpu_strategy.scope():
model = make_model((img_size, img_size, 3))
# 8. Compile model
model.compile(loss='categorical_crossentropy',
optimizer="sgd",
metrics=['accuracy'])
model.summary()
batch_size = 1250 * tpu_strategy.num_replicas_in_sync
# 9. Fit model on training data
model.fit(X_train, Y_train, steps_per_epoch=len(X_train)//batch_size,
epochs=5, verbose=1)
But my data is on the bucket and my code is on an VM. So what I have to do ? I tried to load my data using "gs://BUCKETS" but it does not work. What should I do ? EDIT : I add my code to load data, I forgot it sorry.
def load_data(sets="dogcats/train/", k = 5000, target_size=250):
# define location of dataset
folder = sets
photos, labels = list(), list()
# determine class
output = 0.0
for i, dog in enumerate(listdir(folder + "dogs/")):
if i >= k:
break
# load image
photo = load_img(folder + "dogs/" +dog, target_size=(target_size, target_size))
# convert to numpy array
photo = img_to_array(photo)
# store
photos.append(photo)
labels.append(output)
output = 1.0
for i, cat in enumerate(listdir(folder + "cats/") ):
if i >= k:
break
# load image
photo = load_img(folder + "cats/"+cat, target_size=(target_size, target_size))
# convert to numpy array
photo = img_to_array(photo)
# store
photos.append(photo)
labels.append(output)
# convert to a numpy arrays
photos = asarray(photos)
labels = asarray(labels)
print(photos.shape, labels.shape)
photos, labels = shuffle(photos, labels, random_state=0)
return photos, labels
EDIT2 : To complete the answer of @daudnadeem in case some other people are in the same case.
My goal was to get images from a bucket, so the code works well and allowed to get byte object. To transform it into image you just need to use PIL library:
from PIL import Image
from io import BytesIO
import numpy as np
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket("BUCKETS")
blob = bucket.get_blob('dogscats/train/<you-will-need-to-point-to-a-file-and-not-a-directory>')
data = blob.download_as_string()
img = Image.open(BytesIO(data))
img = np.array(img)
(X_train, y_train) = load_data(sets="gs://BUCKETS/dogscats/train/",target_size=img_size)
(X_test, y_test) = load_data(sets="gs://BUCKETS/dogscats/valid/",target_size=img_size)
This obviously won't work since essentially all you've done is given sets a string. What you need to do is download this data as a string, and then use that.
First install the package pip install google-cloud-storage
or pip3 install google-cloud-storage
pip -> Python
pip3 -> Python3
Have a look at this, you will need a service account to interact with GCP from your code. For authentication purposes.
When you get your service account as a json, you need to do one of two things:
Set it as an env variable:
export GOOGLE_APPLICATION_CREDENTIALS="/home/user/Downloads/[FILE_NAME].json"
or my preferrable workaround
gcloud auth activate-service-account \
<repalce-with-email-from-json-file> \
--key-file=<path/to/your/json/file> --project=<name-of-your-gcp-project>
Now lets look at how you can use google-cloud-storage library to download your file as a string:
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket("BUCKETS")
blob = bucket.get_blob('/dogscats/train/<you-will-need-to-point-to-a-file-and-not-a-directory>')
data = blob.download_as_string()
Now that you have your data as a string, you can simply pass data
into load data like so (X_train, y_train) = load_data(sets=data,target_size=img_size)
It sounds complex but heres a quick psuedo layout:
load_data(data)
Hope that helps!