Search code examples
visualization vega-lite

How to use a rule mark on the quartiles of the x axis to display the corresponding y axis value


Problem

I have a distribution of IDs, each with a ratio value. I'd like to use vega-lite to display the cumulative number of IDs that are at or below a given ratio. I'd also like to visually display the quartile boundaries on the x-axis, with text that shows the corresponding y-axis value.

I've gotten stuck on figuring out how to display the quartile boundaries and the corresponding ratio values. Fundamentally it's a question of how to aggregate the data.

Example Images

Where I'm at, using a combination of window and joinaggregate transforms. It displays the area and quartiles but no text.

What I want, using fake data. It displays the area, quartiles, and text.

Any help would be appreciated.

Current code

Where I'm at using transforms

vega editor

{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 500,
  "data": {
    "format": {"type": "csv"},
    "values": "id,ratio\n60002630,1\n316206200,0.5\n4565522,0\n93144670,1\n141932256,0.6\n121391313,0\n555653204,0.2\n161737495,0.2\n729578137,0\n312874324,1\n528974830,0.88\n112110130,0.59\n704594409,0.98\n508668347,1\n792158072,0.92\n352656341,0.8\n743410995,0.74\n730375561,0\n25880207,1\n113747911,0.53\n810723939,0\n492518402,0.9\n228644637,0\n614945175,1\n212949973,0\n196410138,0\n819806082,1\n564411345,0.82\n974981427,0\n119489543,0.71\n984017203,0.5"
  },
  "transform": [
    {
      "window": [{"op": "row_number", "as": "Cumulative Id Count"}],
      "sort": [{"field": "ratio"}]
    },
    {
      "joinaggregate": [
        {"field": "ratio", "op": "q1", "as": "ratio_p25"},
        {"field": "ratio", "op": "median", "as": "ratio_p50"},
        {"field": "ratio", "op": "q3", "as": "ratio_p75"},
        {"field": "Cumulative Id Count", "op": "q1", "as": "cid_count_p25"},
        {"field": "Cumulative Id Count", "op": "median", "as": "cid_count_p50"},
        {"field": "Cumulative Id Count", "op": "q3", "as": "cid_count_p75"}
      ]
    }
  ],
  "layer": [
    {
      "mark": {"type": "area"},
      "encoding": {
        "x": {
          "field": "Cumulative Id Count",
          "type": "quantitative",
          "scale": {"nice": false}
        },
        "y": {"field": "ratio", "type": "quantitative"}
      }
    },
    {
      "mark": {"type": "rule"},
      "encoding": {
        "x": {"field": "cid_count_p25", "type": "quantitative"}
      }
    },
    {
      "mark": {"type": "rule"},
      "encoding": {
        "x": {"field": "cid_count_p50", "type": "quantitative"}
      }
    },
    {
      "mark": {"type": "rule"},
      "encoding": {
        "x": {"field": "cid_count_p75", "type": "quantitative"}
      }
    }
  ]
}

Using fake data to show what I'd like to have

vega editor

{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "data": {
    "values": "id,ratio\n60002630,1\n316206200,0.5\n4565522,0\n93144670,1\n141932256,0.6\n121391313,0\n555653204,0.2\n161737495,0.2\n729578137,0\n312874324,1\n528974830,0.88\n112110130,0.59\n704594409,0.98\n508668347,1\n792158072,0.92\n352656341,0.8\n743410995,0.74\n730375561,0\n25880207,1\n113747911,0.53\n810723939,0\n492518402,0.9\n228644637,0\n614945175,1\n212949973,0\n196410138,0\n819806082,1\n564411345,0.82\n974981427,0\n119489543,0.71\n984017203,0.5",
    "format": {"type": "csv"}
  },
  "width": 500,
  "transform": [
    {
      "sort": [{"field": "ratio"}],
      "window": [{"op": "row_number", "as": "Cumulative Id Count"}]
    }
  ],
  "datasets": {
    "fake-combined-data": [
      {"Cumulative Id Count": 8, "ratio": 1, "label": 0},
      {"Cumulative Id Count": 16, "ratio": 1, "label": 0.6},
      {"Cumulative Id Count": 23, "ratio": 1, "label": 0.95}
    ]
  },
  "encoding": {
    "y": {"field": "ratio", "type": "quantitative"},
    "x": {
      "field": "Cumulative Id Count",
      "type": "quantitative",
      "scale": {"nice": false}
    }
  },
  "layer": [
    {"mark": "area"},
    {"data": {"name": "fake-combined-data"}, "mark": {"type": "rule"}},
    {
      "data": {"name": "fake-combined-data"},
      "mark": {"type": "text", "dy": -5},
      "encoding": {"text": {"type": "nominal", "field": "label"}}
    }
  ]
}

Solution

  • Is this what you want? I wasn't sure whether you wanted the label or the value as the text, so I have shown both. Note that in your example each rule layer draws one line per data row (dozens of overlapping lines), so I have aggregated each to a single line.

    enter image description here

    {
      "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
      "width": 500,
      "data": {
        "format": {"type": "csv"},
        "values": "id,ratio\n60002630,1\n316206200,0.5\n4565522,0\n93144670,1\n141932256,0.6\n121391313,0\n555653204,0.2\n161737495,0.2\n729578137,0\n312874324,1\n528974830,0.88\n112110130,0.59\n704594409,0.98\n508668347,1\n792158072,0.92\n352656341,0.8\n743410995,0.74\n730375561,0\n25880207,1\n113747911,0.53\n810723939,0\n492518402,0.9\n228644637,0\n614945175,1\n212949973,0\n196410138,0\n819806082,1\n564411345,0.82\n974981427,0\n119489543,0.71\n984017203,0.5"
      },
      "transform": [
        {
          "window": [{"op": "row_number", "as": "Cumulative Id Count"}],
          "sort": [{"field": "ratio"}]
        },
        {
          "joinaggregate": [
            {"field": "ratio", "op": "q1", "as": "ratio_p25"},
            {"field": "ratio", "op": "median", "as": "ratio_p50"},
            {"field": "ratio", "op": "q3", "as": "ratio_p75"},
            {"field": "Cumulative Id Count", "op": "q1", "as": "cid_count_p25"},
            {"field": "Cumulative Id Count", "op": "median", "as": "cid_count_p50"},
            {"field": "Cumulative Id Count", "op": "q3", "as": "cid_count_p75"}
          ]
        }
      ],
      "layer": [
        {
          "mark": "area",
          "encoding": {
            "x": {
              "field": "Cumulative Id Count",
              "type": "quantitative",
              "scale": {"nice": false}
            },
            "y": {"field": "ratio", "type": "quantitative"}
          }
        },
        {
          "mark": {"type": "text", "dy": -10, "text": "cid_count_p25"},
          "encoding": {
            "x": {"aggregate": "max", "field": "cid_count_p25"},
            "y": {"datum": 1}
          }
        },
        {
          "mark": "rule",
          "encoding": {
            "x": {
              "aggregate": "max",
              "field": "cid_count_p25",
              "type": "quantitative"
            }
          }
        },
        {
          "mark": {"type": "text", "dy": -10},
          "encoding": {
            "x": {"aggregate": "max", "field": "cid_count_p50"},
            "y": {"datum": 1},
            "text": {"field": "cid_count_p50"}
          }
        },
        {
          "mark": "rule",
          "encoding": {
            "x": {
              "aggregate": "max",
              "field": "cid_count_p50",
              "type": "quantitative"
            }
          }
        },
        {
          "mark": {"type": "text", "dy": -10, "text": "cid_count_p75"},
          "encoding": {
            "x": {"aggregate": "max", "field": "cid_count_p75"},
            "y": {"datum": 1}
          }
        },
        {
          "mark": "rule",
          "encoding": {
            "x": {
              "aggregate": "max",
              "field": "cid_count_p75",
              "type": "quantitative"
            }
          }
        }
      ]
    }