Tags: tensorflow, tensorflow-datasets

Create a dataset from too many images in TensorFlow


I am having trouble finding a way to create a dataset in tensorflow from images. My dataset has the structure below:

    fruit-data
    |
    |-train
    |   |
    |   |- Freshapple -> .png images of fresh apples
    |   |- Freshorange -> .png images of fresh oranges
    |   |- Freshbanana -> .png images of fresh bananas
    |   
    |-test
    |   |
    |   |- Rottenapple -> .png images of rotten apples
    |   |- Rottenorange -> .png images of rotten oranges
    |   |- Rottenbanana -> .png images of rotten bananas
    |

I have my paths set as so and the classes set:

    train_path = ".../Desktop/Data/fruit-dataset/train"
    test_path = ".../Desktop/Data/fruit-dataset/train"
    categories = ["freshapple", "freshorange", "freshbanana",
                  "rottenapple", "rottenorange", "rottenbanana"]

From other resources I've seen, because my dataset contains over 13k images, I would need to use flow_from_directory(), since loading all of them into memory would cause a crash at runtime.

I'm confused on what the next steps are to get this dataset loaded in.

For additional context, I will be using a fine-tuned MobileNetV2 model (experimenting with freezing layers).


Solution

  • There are a number of ways to load the data. I prefer to use pandas dataframes because they make it easy to partition the data in various ways. The code below should be what you need:

    import os
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    sdir = r'.../Desktop/Data/fruit-dataset'
    categories = ['train', 'test']
    # walk the train and test directories and build a dataframe of filepaths and labels for each
    for category in categories:
        catpath = os.path.join(sdir, category)
        classlist = os.listdir(catpath)
        filepaths = []
        labels = []
        for klass in classlist:
            classpath = os.path.join(catpath, klass)
            flist = os.listdir(classpath)
            for f in flist:
                fpath = os.path.join(classpath, f)
                filepaths.append(fpath)
                labels.append(klass)
        Fseries = pd.Series(filepaths, name='filepaths')
        Lseries = pd.Series(labels, name='labels')
        if category == 'train':
            df = pd.concat([Fseries, Lseries], axis=1)
        else:
            test_df = pd.concat([Fseries, Lseries], axis=1)
    # create a validation set by splitting 20% off the training dataframe
    train_df, valid_df = train_test_split(df, train_size=.8, shuffle=True, random_state=123)
    print('train_df length: ', len(train_df), '  test_df length: ', len(test_df), '  valid_df length: ', len(valid_df))
    # check the balance of the training set (samples per class)
    print(train_df['labels'].value_counts())
    height = 224
    width = 224
    channels = 3
    batch_size = 40
    img_shape = (height, width, channels)
    img_size = (height, width)
    length = len(test_df)
    # pick the largest batch size <= 80 that divides the test set evenly, so one pass covers every test image
    test_batch_size = sorted([int(length/n) for n in range(1, length+1) if length % n == 0 and length/n <= 80], reverse=True)[0]
    test_steps = int(length/test_batch_size)
    print('test batch size: ', test_batch_size, '  test steps: ', test_steps)
    def scalar(img):
        # rescale pixel values to the range [0, 1]
        return img/255
    trgen = ImageDataGenerator(preprocessing_function=scalar, horizontal_flip=True)
    tvgen = ImageDataGenerator(preprocessing_function=scalar)
    train_gen = trgen.flow_from_dataframe(train_df, x_col='filepaths', y_col='labels', target_size=img_size, class_mode='categorical',
                                          color_mode='rgb', shuffle=True, batch_size=batch_size)
    test_gen = tvgen.flow_from_dataframe(test_df, x_col='filepaths', y_col='labels', target_size=img_size, class_mode='categorical',
                                         color_mode='rgb', shuffle=False, batch_size=test_batch_size)
    valid_gen = tvgen.flow_from_dataframe(valid_df, x_col='filepaths', y_col='labels', target_size=img_size, class_mode='categorical',
                                          color_mode='rgb', shuffle=True, batch_size=batch_size)
    classes = list(train_gen.class_indices.keys())
    class_count = len(classes)
    # 'model' is your compiled MobileNetV2-based model
    history = model.fit(x=train_gen, epochs=20, verbose=2, validation_data=valid_gen,
                        validation_steps=None, shuffle=False, initial_epoch=0)

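    The fit call above assumes `model` has already been built and compiled. Below is a minimal sketch of a frozen-backbone MobileNetV2 classifier you could use; the pooling/dropout head, optimizer, and learning rate are illustrative assumptions, not part of the original answer. Note also that MobileNetV2's ImageNet weights were trained on inputs scaled to [-1, 1] (`tf.keras.applications.mobilenet_v2.preprocess_input`), so you may get better results using that as the `preprocessing_function` instead of the /255 scaling shown above.

    import tensorflow as tf

    # Sketch only: the head layers and hyperparameters below are assumptions to adapt to your experiment
    base = tf.keras.applications.MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')
    base.trainable = False  # freeze the backbone; unfreeze selected layers later when fine-tuning
    model = tf.keras.Sequential([
        base,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(class_count, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='categorical_crossentropy', metrics=['accuracy'])
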
    A simpler but less versatile alternative is to use flow_from_directory:

    import tensorflow as tf

    gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255,
                                                           validation_split=0.1)
    tgen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
    train_dir = r'.../Desktop/Data/fruit-dataset/train'
    # validation_split reserves 10% of the training images; 'subset' selects which part each generator uses
    train_gen = gen.flow_from_directory(train_dir, target_size=(256, 256),
                        class_mode="categorical", batch_size=32, shuffle=True,
                        seed=123, subset='training')
    valid_gen = gen.flow_from_directory(train_dir, target_size=(256, 256),
                        class_mode="categorical", batch_size=32, shuffle=True,
                        seed=123, subset='validation')
    test_dir = r'.../Desktop/Data/fruit-dataset/test'  # your test_path pointed at train/ by mistake
    test_gen = tgen.flow_from_directory(test_dir, target_size=(256, 256),
                        class_mode="categorical", batch_size=32, shuffle=False)
    history = model.fit(x=train_gen, epochs=20, verbose=2, validation_data=valid_gen,
                        validation_steps=None, shuffle=False, initial_epoch=0)
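
    Either way, once training finishes you can evaluate on the held-out test generator. A minimal sketch, assuming the model was compiled with an accuracy metric; with the dataframe version you can also pass steps=test_steps, since shuffle=False and the batch size divides the test set evenly:

    # evaluate on the test generator
    loss, acc = model.evaluate(test_gen, verbose=1)
    print('test loss:', loss, '  test accuracy:', acc)
    # because test_gen was built with shuffle=False, predictions line up with test_gen.filenames
    preds = model.predict(test_gen)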