javascript tensorflow object-detection yolov7

How to correctly render YoloV7 boxes on image?

I'm using a TensorFlow.js implementation of YoloV7 object detection, but I can't wrap my head around how to correctly scale the bounding boxes and account for the aspect ratio. The model returns sensible boxes in [x1, y1, width, height] format, but they are drawn at the wrong locations, because the coordinates are relative to the 640x640 model input image rather than to the original image:

[
    [
        371.74334716796875,
        101.12919616699219,
        19.002845764160156,
        39.24892807006836
    ],
    [
        45.18428421020508,
        181.0949249267578,
        66.98155212402344,
        74.84469604492188
    ],
    [
        405.8454284667969,
        239.25437927246094,
        92.49766540527344,
        278.452880859375
    ],
    [
        292.5257873535156,
        264.4200744628906,
        77.10870361328125,
        263.32293701171875
    ],
    [
        102.06099700927734,
        249.17529296875,
        71.49154663085938,
        326.78228759765625
    ],
    [
        200.99879455566406,
        256.24462890625,
        118.14222717285156,
        311.6120910644531
    ]
]

/**
 * Paint each detection (outline plus a "<label> - <score>%" caption) onto a canvas.
 *
 * Box coordinates are drawn exactly as given: this version applies no scaling,
 * and neither `classThreshold` nor `ratios` is read.
 *
 * @param canvasRef - canvas element to draw on
 * @param classThreshold - accepted but not used by this version
 * @param boxes_data - one box per detection, passed through `xywh2xyxy`
 * @param scores_data - confidence per detection; multiplied by 100 for display
 * @param classes_data - per-detection index into `labels`
 * @param ratios - accepted but not used by this version
 */
export const renderBoxes = (
    canvasRef,
    classThreshold,
    boxes_data,
    scores_data,
    classes_data,
    ratios
) => {
    const BOX_COLOR = "#B033FF";

    const ctx = canvasRef.getContext("2d");
    const canvas = ctx.canvas;

    // Start from an empty canvas.
    ctx.clearRect(0, 0, canvas.width, canvas.height);

    // Instantiated as in the original flow; this version never reads it.
    const colors = new Colors();

    // Font size grows with the longer canvas side, but never drops below 14px.
    const fontPx = Math.max(Math.round(Math.max(canvas.width, canvas.height) / 40), 14);
    const font = `${fontPx}px Arial`;
    ctx.font = font;
    ctx.textBaseline = "top";

    let idx = 0;
    while (idx < scores_data.length) {
        const name = labels[classes_data[idx]];
        const percent = (scores_data[idx] * 100).toFixed(1);

        const [left, top, right, bottom] = xywh2xyxy(boxes_data[idx]);

        // Outline of the detection.
        ctx.strokeStyle = BOX_COLOR;
        ctx.lineWidth = 2;
        ctx.strokeRect(left, top, right - left, bottom - top);

        // Filled strip sitting directly above the box's top edge.
        ctx.fillStyle = BOX_COLOR;
        const captionWidth = ctx.measureText(name + " - " + percent + "%").width;
        const stripHeight = parseInt(font, 10) + 2; // leading number of the font string, plus padding
        ctx.fillRect(left - 1, top - stripHeight, captionWidth + 2, stripHeight);

        // Caption text on top of the strip.
        ctx.fillStyle = "#ffffff";
        ctx.fillText(name + " - " + percent + "%", left - 1, top - stripHeight);

        idx += 1;
    }
};
/**
 * Convert an image element into a batched input tensor for the model.
 *
 * The image is zero-padded on the bottom and right until it is square, resized
 * to the model input size, divided by 255 and rearranged from
 * [height, width, channels] to [1, channels, height, width].
 *
 * @param source - image element whose pixels are read
 * @param modelWidth - width of the model input in pixels
 * @param modelHeight - height of the model input in pixels
 * @returns `[input, xRatio, yRatio]`, where `xRatio` / `yRatio` are the padded
 *          (square) side divided by the original width / height
 */
const preprocess = (source: HTMLImageElement, modelWidth: number, modelHeight: number) => {
    // Ratios between the padded square and the original image, for mapping boxes back.
    let xRatio = 1;
    let yRatio = 1;

    // tf.tidy releases every intermediate tensor created inside the callback.
    const input = tf.tidy(() => {
        const img = tf.browser.fromPixels(source);

        // padding image to square => [n, m] to [n, n], n > m
        const [h, w] = img.shape.slice(0, 2); // source height and width
        const maxSize = Math.max(w, h);
        const imgPadded = img.pad([
            [0, maxSize - h], // padding y [bottom only]
            [0, maxSize - w], // padding x [right only]
            [0, 0],
        ]);

        xRatio = maxSize / w;
        yRatio = maxSize / h;

        return tf.image
            // resizeBilinear expects [newHeight, newWidth]; passing width first
            // only happened to work for square model inputs.
            .resizeBilinear(imgPadded, [modelHeight, modelWidth])
            .div(255.0) // normalize
            // [h, w, c] -> [c, h, w]; presumably the exported model expects
            // channels-first input — confirm against the model signature.
            .transpose([2, 0, 1])
            .expandDims(0); // add batch
    });

    const result: [typeof input, number, number] = [input, xRatio, yRatio];
    return result;
};

const MODEL_URL = '/yolov7_web_model/model.json';
const model = await tf.loadGraphModel(MODEL_URL);
// Model input size as [width, height]; the single source for the values passed to preprocess.
const model_dim = [640, 640] as const;
const [modelWidth, modelHeight] = model_dim;

// getElementById returns a generic element or null, but preprocess needs an <img>.
const myImage = document.getElementById('image');
if (!(myImage instanceof HTMLImageElement)) {
    throw new Error('Expected an <img> element with id "image"');
}

const [input, xRatio, yRatio] = preprocess(myImage, modelWidth, modelHeight);
const execution = model.execute(input);

// Copy the predictions into plain arrays, then free both tensors: tensors
// created outside tf.tidy are not released automatically.
const result = execution.arraySync()[0];
tf.dispose([input, execution]);

const detections = non_max_suppression(result);
const boxes = shortenedCol(detections, [0, 1, 2, 3]);
const scores = shortenedCol(detections, [4]);
const class_detect = shortenedCol(detections, [5]);

renderBoxes(canvasRef.value!, 0.2, boxes, scores, class_detect, [xRatio, yRatio]);

I've tried using things like xRatio and yRatio but I don't understand how I would scale not just the ratios but also the size. How do I scale this so that it can be rendered for the original 1356x904 image?

Solution

Canvas scaling solution:

You have to reverse the initial letterboxing (the bottom/right padding that turns the image into a square for the model), and then scale the resulting 640x640 model coordinates up to the original image dimensions.

// Undo the letterbox padding. ratios is [xRatio, yRatio], so each axis gets
// its own factor; scaling only y by the larger ratio breaks portrait images.
ctx.scale(ratios[0], ratios[1]);

// Post-Process scale for model output -> actual dimensions
const horizontalScaleFactor = imageWidth / 640;
const verticalScaleFactor = imageHeight / 640;
ctx.scale(horizontalScaleFactor, verticalScaleFactor);

/**
 * Draw detection boxes and captions on a canvas sized to the original image.
 *
 * The canvas is resized to `imageWidth` x `imageHeight` and its transform is set
 * so that coordinates in model-input space land on the matching image pixels:
 * first the letterbox padding is undone per axis, then model-input pixels are
 * scaled up to image pixels. Because the transform applies to everything drawn,
 * line width and caption text are scaled by the same factor.
 *
 * @param canvasRef - canvas element to draw on
 * @param classThreshold - accepted for interface compatibility; not read here
 * @param boxes_data - one box per detection, in model-input coordinates,
 *        passed through `xywh2xyxy`
 * @param scores_data - confidence per detection; multiplied by 100 for display
 * @param classes_data - per-detection index into `labels` / key for `colors.get`
 * @param ratios - `[xRatio, yRatio]`: padded square side divided by the
 *        original width / height
 * @param imageWidth - width of the original image in pixels
 * @param imageHeight - height of the original image in pixels
 * @param modelWidth - width of the model input (defaults to 640)
 * @param modelHeight - height of the model input (defaults to `modelWidth`)
 */
export const renderBoxes = (
    canvasRef,
    classThreshold,
    boxes_data,
    scores_data,
    classes_data,
    ratios,
    imageWidth,
    imageHeight,
    modelWidth = 640,
    modelHeight = modelWidth
) => {
    const ctx = canvasRef.getContext("2d");

    // Assigning width/height clears the canvas and resets its transform,
    // so no separate clearRect is needed and the scales below start from identity.
    ctx.canvas.width = imageWidth;
    ctx.canvas.height = imageHeight;

    // Undo the letterbox padding with each axis' own ratio. Scaling only y by
    // the larger ratio is correct for landscape images only.
    const [xRatio = 1, yRatio = 1] = ratios ?? [];
    ctx.scale(xRatio, yRatio);

    // Post-Process scale for model output -> actual dimensions
    ctx.scale(imageWidth / modelWidth, imageHeight / modelHeight);

    // font configs
    const font = `${14}px Arial`;
    ctx.font = font;
    ctx.textBaseline = "top";
    const textHeight = parseInt(font, 10); // base 10

    const colors = new Colors();

    for (let i = 0; i < scores_data.length; ++i) {
        const klass = labels[classes_data[i]];
        const color = colors.get(classes_data[i]);
        const score = (scores_data[i] * 100).toFixed(1);
        const caption = klass + " - " + score + "%";

        const [x1, y1, x2, y2] = xywh2xyxy(boxes_data[i]);

        const width = x2 - x1;
        const height = y2 - y1;

        // Draw the bounding box.
        ctx.strokeStyle = Colors.hexToRgba(color, 0.5);
        ctx.lineWidth = 2;
        ctx.strokeRect(x1, y1, width, height);

        // Draw the label background directly above the box's top edge.
        ctx.fillStyle = color;
        const textWidth = ctx.measureText(caption).width;
        ctx.fillRect(x1 - 1, y1 - (textHeight + 2), textWidth + 2, textHeight + 2);

        // Draw labels
        ctx.fillStyle = "#ffffff";
        ctx.fillText(caption, x1 - 1, y1 - (textHeight + 2));
    }
};