Search code examples
pythonnumpypython-imaging-library

ValueError: could not broadcast input array from shape (3024,3024,3) into shape (3024,3024)


I have this code which works fine:

import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import numpy as np
import os
from PIL import Image
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor


# Set paths for the source image, the generated masks and the edited images.
# NOTE: os.path.join with a single argument returns that argument unchanged,
# and base_image_dir actually holds an image file name rather than a directory.
base_image_dir = os.path.join("IMG_4297.png")
mask_dir = os.path.join("masks")
edit_image_dir = os.path.join("03_edits")

# Point to your downloaded SAM model checkpoint
sam_model_filepath = "../segment-anything/segment_anything/sam_vit_h_4b8939.pth"
#sam_model_filepath = "./sam_vit_h_4b8939.pth"

# Initiate SAM model: look up the "default" builder in the registry and
# construct the model from the checkpoint file above
sam = sam_model_registry["default"](checkpoint=sam_model_filepath)

# Function to display mask using matplotlib
def show_mask(mask, ax):
    """Draw ``mask`` on ``ax`` as a semi-transparent blue overlay.

    The last two dimensions of ``mask`` are taken as (height, width). Each
    mask value is multiplied by a fixed RGBA colour, so zero/False pixels
    become fully transparent and one/True pixels take the overlay colour.
    """
    rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    rows, cols = mask.shape[-2:]
    # (rows, cols, 1) * (1, 1, 4) broadcasts to an (rows, cols, 4) RGBA image.
    overlay = mask.reshape(rows, cols, 1) * rgba[np.newaxis, np.newaxis, :]
    ax.imshow(overlay)


# Function to display where we've "clicked"
def show_points(coords, labels, ax, marker_size=375):
    """Plot prompt points on ``ax`` as star markers.

    Rows of ``coords`` whose entry in ``labels`` equals 1 are drawn in green,
    rows whose label equals 0 are drawn in red. Column 0 of ``coords`` is used
    as the x position and column 1 as the y position.
    """
    # Select both groups up front, then draw positives before negatives.
    groups = [
        (coords[labels == 1], "green"),
        (coords[labels == 0], "red"),
    ]
    for points, colour in groups:
        xs = points[:, 0]
        ys = points[:, 1]
        ax.scatter(
            xs,
            ys,
            color=colour,
            marker="*",
            s=marker_size,
            edgecolor="white",
            linewidth=1.25,
        )


# Load chosen image using opencv and convert it from BGR to RGB channel order
image = cv2.imread("./IMG_4297.png")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Display our chosen image
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis("on")
plt.show()

# Set the pixel coordinates for our "click" to assign masks.
# Each row is one point (column 0 is plotted as x, column 1 as y by show_points);
# label 1 marks the point as a positive (green) click.
input_point = np.array([[525, 325]])
input_label = np.array([1])

# Display the point we've clicked on
plt.figure(figsize=(10, 10))
plt.imshow(image)
show_points(input_point, input_label, plt.gca())
plt.axis("on")
plt.show()

# Initiate predictor with Segment Anything model and hand it the RGB image
predictor = SamPredictor(sam)
predictor.set_image(image)

# Use the predictor to gather masks for the point we clicked
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)

# Check the shape - should be three masks of the same dimensions as our image.
# NOTE: a bare expression like this only shows output in a notebook/REPL;
# in a plain script it has no visible effect.
masks.shape

# Display the possible masks we can select along with their confidence:
# one figure per mask, with the mask overlay and the clicked point on top of the image
for i, (mask, score) in enumerate(zip(masks, scores)):
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    show_mask(mask, plt.gca())
    show_points(input_point, input_label, plt.gca())
    plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
    plt.axis("off")
    plt.show()

# Choose which mask you'd like to use (index 1 is the second mask returned above)
chosen_mask = masks[1]

# We'll now reverse the mask so that it is clear and everything else is white.
# The value 1 is a temporary placeholder so the two swaps do not collide:
#   non-zero -> 255, zero -> 1, 255 -> 0, 1 -> 255
# Net effect: pixels inside the mask end up 0, pixels outside end up 255.
chosen_mask = chosen_mask.astype("uint8")
chosen_mask[chosen_mask != 0] = 255
chosen_mask[chosen_mask == 0] = 1
chosen_mask[chosen_mask == 255] = 0
chosen_mask[chosen_mask == 1] = 255

# create a base blank mask
# NOTE(review): width/height are hard-coded; the channel assignment below needs
# chosen_mask to have a matching (height, width) shape - confirm against the input image.
width = 1512
height = 1512
mask = Image.new("RGBA", (width, height), (0, 0, 0, 1))  # black RGBA image with alpha 1 (on a 0-255 scale this is nearly transparent, not opaque)

# Convert the PIL image to a NumPy array and overwrite channel index 3 (the alpha
# channel) with our mask; chosen_mask must be a single-channel 2-D array here
pix = np.array(mask)
pix[:, :, 3] = chosen_mask

# Convert pixels back to an RGBA image and display
# (the bare `new_mask` expression only renders in a notebook/REPL)
new_mask = Image.fromarray(pix, "RGBA")
new_mask

# We'll save this mask for re-use for our edit
new_mask.save(os.path.join(mask_dir, "new_mask.png"))

But I am trying to use the second half with a slightly different program / AI language model:

import numpy as np
from lang_sam.utils import draw_image
from PIL import Image
from lang_sam import LangSAM
from heic2png import HEIC2PNG

# Convert the HEIC photo to PNG when run as a script
if __name__ == '__main__':
    heic_img = HEIC2PNG('/Users/Downloads/IMG_4316.heic', quality=70)  # Specify the quality of the converted image
    heic_img.save()  # Save the converted PNG (presumably next to the source as IMG_4316.png, which is opened below - confirm)

# NOTE: everything from here on is at module level, outside the __main__ guard above.
# Run LangSAM with a text prompt on the converted image
model = LangSAM()
image_pil = Image.open("/Users/Downloads/IMG_4316.png").convert("RGB")
text_prompt = "wall"
masks, boxes, phrases, logits = model.predict(image_pil, text_prompt)

# Bare expression: only shows output in a notebook/REPL
masks.shape

# Draw the predicted masks, boxes and "phrase score" labels onto the image and show it
labels = [f"{phrase} {logit:.2f}" for phrase, logit in zip(phrases, logits)]
image_array = np.asarray(image_pil)
image = draw_image(image_array, masks, boxes, labels)
image = Image.fromarray(np.uint8(image)).convert("RGB")
image.show()

# NOTE(review): chosen_mask is built from the annotated RGB image, not from
# `masks`, so it is a 3-channel (height, width, 3) array rather than a single
# 2-D mask. The swaps below map non-zero -> 0 and zero -> 255 per channel value
# (1 is a temporary placeholder).
chosen_mask = np.array(image).astype("uint8")
chosen_mask[chosen_mask != 0] = 255
chosen_mask[chosen_mask == 0] = 1
chosen_mask[chosen_mask == 255] = 0
chosen_mask[chosen_mask == 1] = 255

# create a base blank mask
width = 3024    
height = 3024
mask = Image.new("RGBA", (width, height), (0, 0, 0, 1))  # black RGBA image with alpha 1 (on a 0-255 scale this is nearly transparent, not opaque)

# Convert the PIL image to a NumPy array and overwrite channel index 3 (alpha) with our mask.
# NOTE: this assignment raises "ValueError: could not broadcast input array from
# shape (3024,3024,3) into shape (3024,3024)" because the target slice is a single
# 2-D channel while chosen_mask has three channels.
pix = np.array(mask)
pix[:, :, 3] = chosen_mask

# Convert pixels back to an RGBA image and display
new_mask = Image.fromarray(pix, "RGBA")
new_mask.show()
# NOTE(review): save() is called without a destination, unlike the earlier script
# which passes a path - Pillow's Image.save expects a file name or file object; confirm.
new_mask.save()

I believe that the problem lies within the format of the converted image on this line:

pix[:, :, 3] = chosen_mask

Is there a transformation or some other operation I need to perform on chosen_mask to make the image work here?

The full error is:

> Traceback (most recent call last):
  File "/Users/Desktop/code/lang-segment-anything/app.py", line 112, in <module>
    pix[:, :, 2] = chosen_mask
    ~~~^^^^^^^^^
ValueError: could not broadcast input array from shape (3024,3024,3) into shape (3024,3024)
    ~~~^^^^^^^^^

Solution

  • When you do this:

    width = 3024    
    height = 3024
    mask = Image.new("RGBA", (width, height), (0, 0, 0, 1))  # create an opaque image mask
    
    # Convert mask back to pixels to add our mask replacing the third dimension
    pix = np.array(mask)
    

    you are creating a 3024x3024 image with 4 channels (i.e. RGBA), so your Numpy array pix will have a shape of [3024, 3024, 4].


    When you do this:

    image = Image.fromarray(np.uint8(image)).convert("RGB")
    chosen_mask = np.array(image).astype("uint8")
    

    you make an RGB image with 3 channels (i.e. RGB), so your Numpy array chosen_mask will have a shape of [3024, 3024, 3].


    So, the problem is when you do this:

    pix[:, :, 3] = chosen_mask
    

    you are saying you want to set the Alpha channel at every pixel location in pix to the 3 RGB channels at that location in chosen_mask and that can't work... you cannot put the R and the G and the B channel from chosen_mask into the Alpha channel because there's only one space in the Alpha channel at each location.

    So you either need to make chosen_mask a single channel image by creating it in L mode:

    image = Image.fromarray(np.uint8(image)).convert("L")
    chosen_mask = np.array(image).astype("uint8")
    

    or, you need to choose which one of the RGB channels from chosen_mask you want to put into pix's A channel, e.g. just put the Green channel from chosen_mask into pix's A channel:

    pix[:, :, 3] = chosen_mask[..., 1]