Search code examples
rust amazon-cloudwatch

How to prevent 408 errors for CloudWatch PutMetricData?


We're using CloudWatch metrics and are seeing errors from PutMetricData (via the Rust aws_sdk_cloudwatch crate). I would like to understand why this error occurs so that I can adjust how I send metrics and have the requests succeed.

The debug output of the error looks like the following, after what appears to be a 10-second timeout:

Err(ServiceError(ServiceError { source: Unhandled(Unhandled { source: XmlDecodeError { kind: Custom("no root element") }, meta: ErrorMetadata { code: None, message: None, extras: None } }), raw: Response { status: StatusCode(408), headers: Headers { headers: {"content-length": HeaderValue { _private: H0("0") }, "date": HeaderValue { _private: H0("Mon, 09 Sep 2024 20:55:45 GMT") }, "connection": HeaderValue { _private: H0("close") }} }, body: SdkBody { inner: Once(Some(b"")), retryable: true }, extensions: Extensions { extensions_02x: Extensions, extensions_1x: Extensions } } }))

Here is an executable that replicates this error when attempting to publish 125 value/count pairs, but succeeds when truncated to 100. A helper method at the bottom is producing and sending the payloads to PutMetricData, and logging out some basic stats.

use aws_sdk_cloudwatch::config::BehaviorVersion;
use aws_sdk_cloudwatch::types::Dimension;
use aws_sdk_cloudwatch::types::MetricDatum;
use aws_sdk_cloudwatch::types::StandardUnit;
use itertools::Itertools;
use tokio::time::Instant;

/// Reproduction driver: sends three `PutMetricData` payloads through [`test`].
///
/// 1. One datum carrying all 125 value/count pairs.
/// 2. One datum carrying only the first 100 of those pairs.
/// 3. Two datums, each carrying the same first 100 pairs, under different
///    `MessageName` dimension values.
#[tokio::main]
async fn main() {
    let sdk_config = aws_config::defaults(BehaviorVersion::latest()).load().await;
    let cw_client = aws_sdk_cloudwatch::Client::new(&sdk_config);

    // Full sample: 125 observed values ...
    let all_values: Vec<f64> = vec![
        98541.0, 67026.0, 96172.0, 81257.0, 78637.0, 79622.0, 83649.0, 64732.0, 79871.0, 80260.0,
        80244.0, 99961.0, 102978.0, 78736.0, 79897.0, 79404.0, 100405.0, 80017.0, 79846.0, 83623.0,
        236806.0, 95570.0, 80385.0, 96966.0, 81295.0, 80134.0, 80069.0, 99614.0, 83302.0, 97889.0,
        79993.0, 80895.0, 80265.0, 79640.0, 78472.0, 80432.0, 97514.0, 80307.0, 84168.0, 80746.0,
        100452.0, 97142.0, 100383.0, 84088.0, 81623.0, 79927.0, 158122.0, 80490.0, 96406.0,
        96826.0, 96377.0, 80365.0, 81757.0, 97900.0, 78214.0, 80996.0, 78191.0, 63341.0, 80673.0,
        79601.0, 80193.0, 72833.0, 83016.0, 97199.0, 80191.0, 64229.0, 96476.0, 98516.0, 100234.0,
        96348.0, 83757.0, 83779.0, 66457.0, 101027.0, 98412.0, 205606.0, 81428.0, 99704.0, 96764.0,
        80619.0, 79670.0, 80629.0, 80585.0, 82882.0, 79790.0, 96866.0, 98877.0, 99769.0, 80609.0,
        96559.0, 79826.0, 73419.0, 80898.0, 79546.0, 96888.0, 78647.0, 78940.0, 96223.0, 80476.0,
        95708.0, 79921.0, 95619.0, 79476.0, 64980.0, 79883.0, 79628.0, 98414.0, 66497.0, 80538.0,
        98805.0, 81404.0, 65983.0, 78924.0, 83488.0, 96061.0, 77200.0, 98165.0, 81412.0, 62855.0,
        80378.0, 83776.0, 80089.0, 95655.0, 80388.0, 79382.0,
    ];
    // ... and, position for position, how many times each value occurred.
    let all_counts: Vec<f64> = vec![
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
    ];

    // The truncated data set is exactly the leading 100 pairs of the full one,
    // so it is sliced off here instead of being spelled out a second time.
    const TRUNCATED_PAIRS: usize = 100;
    let truncated_values: Vec<f64> = all_values[..TRUNCATED_PAIRS].to_vec();
    let truncated_counts: Vec<f64> = all_counts[..TRUNCATED_PAIRS].to_vec();

    let full_run = vec![(all_values, all_counts, "TestMessage")];
    test("Test 1, Original data", full_run, &cw_client).await;

    let truncated_run = vec![(
        truncated_values.clone(),
        truncated_counts.clone(),
        "TestMessage",
    )];
    test("Test 2, truncated to 100 pairs", truncated_run, &cw_client).await;

    let two_set_run = vec![
        (
            truncated_values.clone(),
            truncated_counts.clone(),
            "TestMessage",
        ),
        (truncated_values, truncated_counts, "OtherMessage"),
    ];
    test(
        "Test 3, two sets, truncated to 100 pairs",
        two_set_run,
        &cw_client,
    )
    .await;
}

/// Publishes one `PutMetricData` request to the `Playground` namespace and
/// prints a short report about it.
///
/// * `label` - heading printed before the report.
/// * `metrics` - one `(values, counts, message_name)` tuple per metric datum;
///   `message_name` becomes the value of the `MessageName` dimension.
/// * `cw_client` - CloudWatch client used to send the request.
///
/// The request result is only printed (with `{:#?}`), never returned; errors
/// are therefore visible in the output but not propagated.
async fn test(
    label: &str,
    metrics: Vec<(Vec<f64>, Vec<f64>, &str)>,
    cw_client: &aws_sdk_cloudwatch::Client,
) {
    println!("{label}:");
    let started_at = Instant::now();

    // Summary figures are gathered first because `metrics` is consumed below.
    let pair_total: usize = metrics.iter().map(|(values, _, _)| values.len()).sum();
    let datapoint_total: f64 = metrics
        .iter()
        .map(|(_, counts, _)| counts.iter().sum::<f64>())
        .sum();

    // One datum per input tuple; every datum shares the metric name, the
    // `ProcessingContext` dimension and the unit.
    let datums = metrics
        .into_iter()
        .map(|(values, counts, message_name)| {
            let message_dimension = Dimension::builder()
                .name("MessageName")
                .value(message_name)
                .build();
            let context_dimension = Dimension::builder()
                .name("ProcessingContext")
                .value("test")
                .build();

            MetricDatum::builder()
                .metric_name("ProcessingTime")
                .dimensions(message_dimension)
                .dimensions(context_dimension)
                .set_values(Some(values))
                .set_counts(Some(counts))
                .unit(StandardUnit::Milliseconds)
                .build()
        })
        .collect_vec();
    let datum_total = datums.len();

    let request = cw_client
        .put_metric_data()
        .namespace("Playground")
        .set_metric_data(Some(datums));
    let outcome = request.send().await;

    println!("Metric datums: {datum_total}");
    println!("Total value/count pairs: {pair_total}");
    println!("Total datapoint count: {datapoint_total}");
    println!("Elapsed time: {}ms", started_at.elapsed().as_millis());
    println!("Result: {outcome:#?}");
    println!("------");
}

I am confused why errors might be happening, since the docs say:

You can publish either individual values in the Value field, or arrays of values and the number of times each value occurred during the period by using the Values and Counts fields in the MetricData structure. Using the Values and Counts method enables you to publish up to 150 values per metric with one PutMetricData request, and supports retrieving percentile statistics on this data.

Each PutMetricData request is limited to 1 MB in size for HTTP POST requests. You can send a payload compressed by gzip. Each request is also limited to no more than 1000 different metrics.

and I don't believe this should be close to either limit.

The full program output is here:

Test 1, Original data:
Metric datums: 1
Total value/count pairs: 125
Total datapoint count: 128
Elapsed time: 10067ms
Result: Err(
    ServiceError(
        ServiceError {
            source: Unhandled(
                Unhandled {
                    source: XmlDecodeError {
                        kind: Custom(
                            "no root element",
                        ),
                    },
                    meta: ErrorMetadata {
                        code: None,
                        message: None,
                        extras: None,
                    },
                },
            ),
            raw: Response {
                status: StatusCode(
                    408,
                ),
                headers: Headers {
                    headers: {
                        "content-length": HeaderValue {
                            _private: H0(
                                "0",
                            ),
                        },
                        "date": HeaderValue {
                            _private: H0(
                                "Tue, 10 Sep 2024 03:35:31 GMT",
                            ),
                        },
                        "connection": HeaderValue {
                            _private: H0(
                                "close",
                            ),
                        },
                    },
                },
                body: SdkBody {
                    inner: Once(
                        Some(
                            b"",
                        ),
                    ),
                    retryable: true,
                },
                extensions: Extensions {
                    extensions_02x: Extensions,
                    extensions_1x: Extensions,
                },
            },
        },
    ),
)
------
Test 2, truncated to 100 pairs:
Metric datums: 1
Total value/count pairs: 100
Total datapoint count: 103
Elapsed time: 57ms
Result: Ok(
    PutMetricDataOutput {
        _request_id: Some(
            "15911e81-5917-4603-918b-a36161d14ce4",
        ),
    },
)
------
Test 3, two sets, truncated to 100 pairs:
Metric datums: 2
Total value/count pairs: 200
Total datapoint count: 206
Elapsed time: 10018ms
Result: Err(
    ServiceError(
        ServiceError {
            source: Unhandled(
                Unhandled {
                    source: XmlDecodeError {
                        kind: Custom(
                            "no root element",
                        ),
                    },
                    meta: ErrorMetadata {
                        code: None,
                        message: None,
                        extras: None,
                    },
                },
            ),
            raw: Response {
                status: StatusCode(
                    408,
                ),
                headers: Headers {
                    headers: {
                        "content-length": HeaderValue {
                            _private: H0(
                                "0",
                            ),
                        },
                        "date": HeaderValue {
                            _private: H0(
                                "Tue, 10 Sep 2024 03:35:41 GMT",
                            ),
                        },
                        "connection": HeaderValue {
                            _private: H0(
                                "close",
                            ),
                        },
                    },
                },
                body: SdkBody {
                    inner: Once(
                        Some(
                            b"",
                        ),
                    ),
                    retryable: true,
                },
                extensions: Extensions {
                    extensions_02x: Extensions,
                    extensions_1x: Extensions,
                },
            },
        },
    ),
)
------

Solution

  • Disabling request compression fixes this.

    According to the issue I made on the aws-sdk-rust repo:

    This is due to a bug in request compression we introduced back in May. We are looking into it to come up with a fix for it.

    In the meantime, you can workaround the bug by disabling request compression through configuration, i.e., in the above code in Reproduction Steps, change

    let aws_config = aws_config::defaults(BehaviorVersion::latest()).load().await;
    

    to

    let aws_config = aws_config::defaults(BehaviorVersion::latest())
        .disable_request_compression(true)
        .load()
        .await;