I am having trouble finding a way to create a dataset in tensorflow from images. My dataset has the structure below:
fruit-data
|
|-train
| |
| |- Freshapple -> .png images of fresh apples
| |- Freshorange -> .png images of fresh oranges
| |- Freshbanana -> .png images of fresh bananas
|
|-test
| |
| |- Rottenapple -> .png images of rotten apples
| |- Rottenorange -> .png images of rotten oranges
| |- Rottenbanana -> .png images of rotten bananas
|
I have my paths set as so and the classes set:
# Paths to the two dataset splits and the list of class names.
train_path = ".../Desktop/Data/fruit-dataset/train"
# BUG FIX: this pointed at /train, so the "test" evaluation would have
# silently reused the training images. It must point at the /test split.
test_path = ".../Desktop/Data/fruit-dataset/test"
categories = ["freshapple", "freshorange", "freshbanana",
              "rottenapple", "rottenorange", "rottenbanana"]
From other resources I've seen, because my dataset contains over 13k images, I would need to use flow_from_directory(), as loading the entire dataset into memory would cause a crash at runtime.
I'm confused on what the next steps are to get this dataset loaded in.
For other information, I will be using a tuned MobilenetV2
model. (experimenting with freezing layers)
There are a number of ways to load the data. I prefer to use pandas dataframes because they make it easy to partition the data in various ways. The code below should be what you need.
# Build (filepath, label) dataframes for the train and test splits by walking
# the class sub-directories, carve a validation set out of train, then create
# Keras generators with flow_from_dataframe.
# NOTE(review): indentation was lost in the original paste; reconstructed here.
sdir = r'.../Desktop/Data/fruit-dataset'
categories = ['train', 'test']
for category in categories:
    catpath = os.path.join(sdir, category)
    classlist = os.listdir(catpath)  # one sub-directory per class
    filepaths = []
    labels = []
    for klass in classlist:
        classpath = os.path.join(catpath, klass)
        for fname in os.listdir(classpath):
            filepaths.append(os.path.join(classpath, fname))
            labels.append(klass)
    Fseries = pd.Series(filepaths, name='filepaths')
    Lseries = pd.Series(labels, name='labels')
    if category == 'train':
        df = pd.concat([Fseries, Lseries], axis=1)
    else:
        test_df = pd.concat([Fseries, Lseries], axis=1)

# Create a validation data set (20% of the training data).
train_df, valid_df = train_test_split(df, train_size=.8, shuffle=True, random_state=123)
print('train_df length: ', len(train_df), ' test_df length: ', len(test_df),
      ' valid_df length: ', len(valid_df))

# Check the balance of the training set. Printing the full value_counts
# (rather than a bare list of counts) keeps the class labels attached,
# so you can see WHICH class is under-represented.
print(train_df['labels'].value_counts())

height = 224
width = 224
channels = 3
batch_size = 40
img_shape = (height, width, channels)
img_size = (height, width)

# Choose the largest test batch size <= 80 that divides the test set evenly,
# so test_steps * test_batch_size covers every test sample exactly once.
length = len(test_df)
test_batch_size = sorted([int(length / n) for n in range(1, length + 1)
                          if length % n == 0 and length / n <= 80], reverse=True)[0]
test_steps = int(length / test_batch_size)
print('test batch size: ', test_batch_size, ' test steps: ', test_steps)

def scalar(img):
    """Rescale pixel values from [0, 255] to [0, 1]."""
    return img / 255

trgen = ImageDataGenerator(preprocessing_function=scalar, horizontal_flip=True)
tvgen = ImageDataGenerator(preprocessing_function=scalar)  # no augmentation for valid/test
train_gen = trgen.flow_from_dataframe(train_df, x_col='filepaths', y_col='labels',
                                      target_size=img_size, class_mode='categorical',
                                      color_mode='rgb', shuffle=True, batch_size=batch_size)
test_gen = tvgen.flow_from_dataframe(test_df, x_col='filepaths', y_col='labels',
                                     target_size=img_size, class_mode='categorical',
                                     color_mode='rgb', shuffle=False, batch_size=test_batch_size)
valid_gen = tvgen.flow_from_dataframe(valid_df, x_col='filepaths', y_col='labels',
                                      target_size=img_size, class_mode='categorical',
                                      color_mode='rgb', shuffle=True, batch_size=batch_size)
classes = list(train_gen.class_indices.keys())
class_count = len(classes)
# NOTE(review): `model` must already be built/compiled (e.g. the tuned
# MobileNetV2) before this line runs.
history = model.fit(x=train_gen, epochs=20, verbose=2, validation_data=valid_gen,
                    validation_steps=None, shuffle=False, initial_epoch=0)
Or a simpler, but less versatile, way is with flow_from_directory:
# Simpler alternative: flow_from_directory, using validation_split on the
# training generator and a separate un-augmented generator for the test data.
gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255,
                                                      validation_split=0.1)
tgen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
train_dir = r'.../Desktop/Data/fruit-dataset/train'
# BUG FIX: method name was misspelled `flow_from_directoy`, and the
# subset argument was an unterminated string (`subset='training`).
train_gen = gen.flow_from_directory(train_dir, target_size=(256, 256),
                                    class_mode="categorical", batch_size=32, shuffle=True,
                                    seed=123, subset='training')
valid_gen = gen.flow_from_directory(train_dir, target_size=(256, 256),
                                    class_mode="categorical", batch_size=32, shuffle=True,
                                    seed=123, subset='validation')
test_dir = r'.../Desktop/Data/fruit-dataset/test'  # you had this wrong in your code
test_gen = tgen.flow_from_directory(test_dir, target_size=(256, 256),
                                    class_mode="categorical", batch_size=32, shuffle=False)
# NOTE(review): `model` must already be defined before this line runs.
history = model.fit(x=train_gen, epochs=20, verbose=2, validation_data=valid_gen,
                    validation_steps=None, shuffle=False, initial_epoch=0)