Search code examples
python pandas matplotlib histogram subplot

How can I show the distribution of the columns of a dataset, ordering the images in a specific way?


I am trying to show the distribution of the columns of a dataset that contains these variables: ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']. For each variable there are several measures (e.g. mean, min and max), and each of these measures is a column of the dataset:

{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}

I want to plot them where each row of graphs is a variable and each column of graphs is a measurement.

I tried that:


# Weather variables to plot; the stated goal is one row of graphs per variable.
variable_types = ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']


# Maps each variable to the dataset columns that hold its measures.
variable_columns = {
    'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
    'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
    'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
    'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
    'snow': ['meanSnow', 'minSnow', 'maxSnow'],
    'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
    'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
    'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
    'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
    'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
    'windDirection': ['meanWinddirection'],
    'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}



# One iteration per variable; `i` is the position of the variable in the list.
for i, var_type in enumerate(variable_types):
    columns = variable_columns[var_type]
    print(columns)
    # A brand-new figure is created on every iteration, each one with
    # len(variable_types) rows and len(columns) columns of subplots.
    fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
    # Flatten the subplot grid so a single index selects one Axes.
    axes = axes.flat
    
    
    # Keep only the dataset columns that belong to the current variable.
    data = METEO_COMPLETO[columns]
    

    # None of the column names contains '_', so split('_')[0] is the whole
    # name and this is a plain alphabetical sort (max..., mean..., min...).
    ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
    print(ordered_columns)

    # NOTE(review): all columns of this variable are sent to the single Axes
    # axes[i]; DataFrame.hist normally draws one subplot per column, so this
    # probably does not fill the intended grid cells -- confirm in pandas docs.
    data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
    axes[i].set_title(f'Distribución de {var_type}')
    axes[i].set_xlabel('Valor')
    axes[i].set_ylabel('Frecuencia')
    axes[i].legend(ordered_columns)

    # Still inside the loop: the figure of the current variable is laid out
    # and shown before the next iteration creates another figure.
    plt.tight_layout()
    plt.show()

But I got this result (only the last 4 variables are shown): enter image description here

Another possible option is to display the available measures in the same graph with different colors to see the distributions of the minimum, mean and maximum value of each variable as a whole.


Solution

  • For me it is easier to keep the grid structure of such a plot arrangement in mind by nesting two for loops: one over the rows of the plot (the variables) and one over the columns of the plot (which I have named "statistics").

    However the whole plot gets too large to get nicely formatted, so I guess this is not the best way to present the data in a single plot.

    enter image description here

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    
    # Dataset columns grouped by the weather variable they describe.
    variable_columns = {
        "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
        "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
        "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
        "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
        "snow": ["meanSnow", "minSnow", "maxSnow"],
        "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
        "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
        "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
        "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
        "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
        "windDirection": ["meanWinddirection"],
        "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
    }

    # Generate fake data: 1000 normally distributed samples per column.
    meteo_data = {
        col: np.random.randn(1000)
        for cols in variable_columns.values()
        for col in cols
    }
    METEO_COMPLETO = pd.DataFrame(meteo_data)

    # Column-name prefixes that define the grid columns, left to right.
    # A dataset column whose prefix is not listed here (e.g. "mode...") is
    # not plotted; add the prefix to this list to give it its own grid column.
    statistics = ["max", "mean", "min"]
    n_rows, n_cols = len(variable_columns), len(statistics)

    # Scale the figure with the grid so the individual histograms stay
    # readable; squeeze=False keeps `axes` two-dimensional even when there is
    # only one variable or one statistic, so the nested loops always work.
    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(4 * n_cols, 2.5 * n_rows),
        squeeze=False,
    )

    # Outer loop: one grid row per variable.
    for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
        # Inner loop: one grid column per statistic.
        for ax, stat in zip(axes_row, statistics):
            # Column names are "<stat><Variable>" with only the first letter
            # of the variable upper-cased, e.g. "maxPressurechange".
            meteo_column = stat + variable.capitalize()
            try:
                data = METEO_COMPLETO[meteo_column]
            except KeyError:
                # Statistic not available for this variable
                # (e.g. maxWinddirection) => hide the empty cell and move on.
                ax.axis("off")
                continue

            # Label the histogram with the column that is actually plotted,
            # so the legend names it instead of a leftover loop variable.
            ax.hist(data, label=meteo_column)

            title = f"Distribución de {variable}"
            if i_row == 0:
                # The top row doubles as the column header: show the statistic
                # name without dropping the variable name.
                title = f"{stat}\n{title}"
            ax.set_title(title)
            ax.set_xlabel("Valor")
            ax.set_ylabel("Frecuencia")
            ax.legend()

    plt.tight_layout()
    plt.show()