Hybrid deep learning model combining backbone model and handcrafted features

I have RGB images and I'd like to build a regression model to predict 'Lodging_score' combining densenet121 as backbone and handcrafted features in a csv file. Running my script below, I got the following error ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, None, None, None) dtype=float32>]. I would appreciate if you could help me out, I've been struggling for days.

#Step 1: Import the required libraries  
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

modelID = 'd121_HCF'

# Step 2: Load and preprocess the image data
image_dir = r'/path_to_images_folder'
annotations_file = '/path_to/annotation.csv'
features_file = 'handcrafted_features.csv'

# Annotation table: supplies the image filename and the regression target
annotations_df = pd.read_csv(annotations_file)

image_filenames = annotations_df['Image_filename'].tolist()
labels = annotations_df['Lodging_score'].tolist()

# Handcrafted feature table, indexed by image filename so that rows can be
# looked up by name further down
features_df = pd.read_csv(features_file).set_index('Image_filename')

# Only images that appear in BOTH tables can be used
common_filenames = list(set(image_filenames) & set(features_df.index))
#print(len(common_filenames))

# Restrict both tables to the shared filenames and drop the non-feature columns
has_features = annotations_df['Image_filename'].isin(common_filenames)
annotations_df = annotations_df[has_features]
features_df = features_df.loc[common_filenames].drop(
    columns=['plot_id', 'project_id', 'Lodging_score'])

# First split: 80% train, 20% held out
train_filenames, holdout_filenames, train_labels, holdout_labels = train_test_split(
    annotations_df['Image_filename'].tolist(),
    annotations_df['Lodging_score'].tolist(),
    test_size=0.2,
    random_state=42)

# Second split: the held-out part is halved into validation and test sets
val_filenames, test_filenames, val_labels, test_labels = train_test_split(
    holdout_filenames,
    holdout_labels,
    test_size=0.5,
    random_state=42)

# Preprocess handcrafted features: look up each split's rows by image filename
train_features = features_df.loc[train_filenames].values
val_features = features_df.loc[val_filenames].values
test_features = features_df.loc[test_filenames].values

# Normalize handcrafted features.
# The statistics are taken from the training split only and are captured
# BEFORE train_features is overwritten; otherwise the validation and test
# splits would be scaled with the statistics of the already-normalized
# training data (mean ~0, std ~1) and effectively stay unnormalized.
feature_mean = train_features.mean(axis=0)
feature_std = train_features.std(axis=0)
# A constant feature column has std 0; divide by 1 instead to avoid NaN/inf
feature_std = np.where(feature_std == 0, 1.0, feature_std)

train_features = (train_features - feature_mean) / feature_std
val_features = (val_features - feature_mean) / feature_std
test_features = (test_features - feature_mean) / feature_std

# Convert the label arrays to numpy arrays
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

# Preprocess handcrafted features: cap each feature array at the number of
# filenames in its split
train_features = train_features[:len(train_filenames)]
val_features = val_features[:len(val_filenames)]
test_features = test_features[:len(test_filenames)]

# Define image data generator with augmentations
image_size = (75, 200)
batch_size = 32

image_data_generator = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True)

# Settings shared by the train, validation and test generators
flow_options = dict(
    directory=image_dir,
    x_col='filename',
    y_col='Lodging_score',
    target_size=image_size,
    batch_size=batch_size,  # the same batch size for all three generators
    class_mode='raw',
    shuffle=False)

# Training generator
train_data = pd.DataFrame({'filename': train_filenames, 'Lodging_score': train_labels})
train_generator = image_data_generator.flow_from_dataframe(train_data, **flow_options)

# Validation generator
val_generator = image_data_generator.flow_from_dataframe(
    pd.DataFrame({'filename': val_filenames, 'Lodging_score': val_labels}),
    **flow_options)

# Test generator
test_generator = image_data_generator.flow_from_dataframe(
    pd.DataFrame({'filename': test_filenames, 'Lodging_score': test_labels}),
    **flow_options)

#Step 3: Build the hybrid regression model
# Shape of one RGB image: (height, width, channels)
input_shape = image_size + (3,)

# DenseNet121 backbone with ImageNet weights and without its classification
# head; it is frozen so that only the new layers are trained
base_model = DenseNet121(include_top=False, weights='imagenet', input_shape=input_shape)
base_model.trainable = False

# The model takes two inputs: the image and its handcrafted feature vector
image_input = Input(shape=input_shape)
features_input = Input(shape=(train_features.shape[1],))

# Image branch: DenseNet preprocessing -> frozen backbone -> global pooling
image_preprocessed = tf.keras.applications.densenet.preprocess_input(image_input)
feature_maps = base_model(image_preprocessed, training=False)
base_features = GlobalAveragePooling2D()(feature_maps)

# Join the pooled backbone features with the handcrafted features
combined_features = Concatenate()([base_features, features_input])

# Regression head: dropout -> dense(128, relu) -> dropout -> single linear output
hidden = Dropout(0.5)(combined_features)
hidden = Dense(128, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
output = Dense(1, activation='linear')(hidden)

# Assemble the two-input model and compile it for regression
model = Model(inputs=[image_input, features_input], outputs=output)

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')

#Step 4: Train the model with early stopping
# Define early stopping callback: monitors 'val_loss' with a patience of 5
# epochs and restore_best_weights enabled
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True)

# Convert numpy arrays to tensors
# NOTE(review): these three tensors are not referenced again in the code shown
train_features_tensor = tf.convert_to_tensor(train_features, dtype=tf.float32)
val_features_tensor = tf.convert_to_tensor(val_features, dtype=tf.float32)
test_features_tensor = tf.convert_to_tensor(test_features, dtype=tf.float32)

# Train the model
# NOTE(review): train_generator was created by flow_from_dataframe from a frame
# holding only 'filename' and 'Lodging_score', so the handcrafted features are
# never passed to fit(), while the model was built with two inputs
# (image_input, features_input). This matches the reported
# "expects 2 input(s), but it received 1 input tensors" ValueError.
# NOTE(review): val_generator.next()[0] takes the images of a single batch,
# whereas val_features / val_labels cover the whole validation split;
# presumably their lengths differ -- TODO confirm.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=50,
    validation_data=([val_generator.next()[0], val_features], val_labels),
    validation_steps=len(val_generator),
    callbacks=[early_stopping])

# Evaluate the model on the test set
# NOTE(review): test_generator.next() is called once for evaluate() and once
# more for predict(); each call takes a single batch, so the two calls
# presumably see different images, and neither covers all of
# test_features / test_labels -- TODO confirm.
loss = model.evaluate([test_generator.next()[0], test_features], test_labels, verbose=0)
predictions = model.predict([test_generator.next()[0], test_features])

Solution

Look into tf.data.Dataset. Here you can see how to build one from a DataFrame. Then, either do the preprocessing with Dataset.map(), which maps a preprocessing function onto each element, or use preprocessing layers like in this guide. You can also use these layers for augmentation.

Because your model has two input layers, every sample in your data must provide both inputs too. Example code:

import numpy as np
import pandas as pd
import tensorflow as tf


num_samples = 100

img_paths = ['test' for _ in range(num_samples)]  # 100 image paths here
rand_features = np.random.rand(num_samples, 3)  # random features
rand_labels = np.random.randint(0, 10, size=(num_samples, 1))  # random labels as int


def to_inputs_and_label(path, features, label):
    """Read the image at `path` and regroup the sample as ((image, features), label)."""
    image = tf.image.decode_image(tf.io.read_file(path))
    return (image, features), label


# right now, the dataset has a 3-tuple as samples
ds = tf.data.Dataset.from_tensor_slices((img_paths, rand_features, rand_labels))
# after mapping, every sample is a nested ((image, features), label) 2-tuple
ds = ds.map(to_inputs_and_label)
ds = ds.batch(32)  # batch the data

In the ds.map(...) call, for each tuple-element it reads an image from the image_path string and transforms the (image, feature, label) 3-tuple to an ((image, feature), label) nested 2-tuple. Now, model.fit can accept each (image, feature) as 2 inputs (x), and label as (y). Put this after your image and feature reading from the dataframes. You can also take a look at other handy Dataset methods like .prefetch() or .shuffle().

Edit: Incorporating the Dataset into your code. I omitted the first part of your code, just let it be. I started with some of your code to show you where to insert.

# [your previous code here]

# Preprocess handcrafted features
train_features = train_features[:len(train_filenames)]
val_features = val_features[:len(val_filenames)]
test_features = test_features[:len(test_filenames)]

# Image size fed to the model and batch size used by all datasets
image_size = (75, 200)
batch_size = 32

# -----------------------------------
# creating the datasets here
def load_sample(filename, features, label):
    """Turn a (filename, features, label) sample into ((image, features), label).

    The image is read from `image_dir`, decoded as RGB and resized to
    `image_size`, so every sample matches the model's image input shape and
    the samples can be stacked into batches.
    """
    path = tf.strings.join([image_dir, filename], separator='/')
    # expand_animations=False makes decode_image return a 3-D tensor with a
    # known rank, which tf.image.resize needs
    image = tf.image.decode_image(
        tf.io.read_file(path), channels=3, expand_animations=False)
    # resize() returns float32 and keeps the [0, 255] value range
    image = tf.image.resize(image, image_size)
    return (image, features), label


train_ds = tf.data.Dataset.from_tensor_slices((train_filenames, train_features, train_labels))
train_ds = train_ds.map(load_sample)
# this utilizes the Dataset API fully
train_ds = train_ds.cache().shuffle(10000).batch(batch_size).prefetch(2)
val_ds = tf.data.Dataset.from_tensor_slices((val_filenames, val_features, val_labels))
val_ds = val_ds.map(load_sample)
val_ds = val_ds.cache().batch(batch_size).prefetch(2)
test_ds = tf.data.Dataset.from_tensor_slices((test_filenames, test_features, test_labels))
test_ds = test_ds.map(load_sample)
test_ds = test_ds.batch(batch_size).prefetch(2)

# Augmentation pipeline replacing the ImageDataGenerator settings.
# The layers are applied in order: flip -> translation -> rotation.
augment_ = tf.keras.Sequential([
    tf.keras.layers.RandomFlip(mode='horizontal'),
    tf.keras.layers.RandomTranslation(height_factor=0.1, width_factor=0.1),
    # the factor is a fraction of a full turn, so 20 degrees is 20 / 360
    tf.keras.layers.RandomRotation(20 / 360),
])

#Step 3: Build the hybrid regression model
# Load DenseNet121 pre-trained on ImageNet without the top layer
base_model = DenseNet121(include_top=False, weights='imagenet', input_shape=image_size + (3,))

# Freeze the base model's layers
base_model.trainable = False

# Input layers for image data and handcrafted features
image_input = Input(shape=image_size + (3,))
features_input = Input(shape=(train_features.shape[1],))

# Preprocess image input for DenseNet121
image_preprocessed = tf.keras.applications.densenet.preprocess_input(image_input)
image_preprocessed = augment_(image_preprocessed)
# --------------------------
# --------------------------

You don't have to (or better, must not) apply the /255. rescaling to your images: densenet.preprocess_input expects its input to be in the [0, 255] range. The random augmentations from ImageDataGenerator are now layers and are called with augment_(). This is a bit better than your approach, as augmentation layers are turned off for val and test data.

A word on train_ds.cache().shuffle(10000).batch(batch_size).prefetch(2):
These methods leverage the efficiency of the Dataset class. The order here is not arbitrary. If you shuffled before caching, the shuffled dataset would be cached and never shuffled again. You also want to shuffle() before batch(); otherwise you shuffle the batches, but the items of each batch are fixed. prefetch(x) pre-loads x items for faster training. If you put it before batch(), it will only pre-load x samples; after batch(), it will pre-load x batches of samples (what we want). The val and test sets are not shuffled, as they don't need that. The test set is also not cached: caching is expensive and only pays off on subsequent runs, but normally the test set is run only once anyway.

Also note that the docs of DenseNet121 state that it expects images of shape (224, 224, 3), so test whether it works correctly with your images. Also, I did not test the code because of missing data.