I am trying to show the distribution of the columns of a dataset that has these variables: ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
For each variable there are one or more measures (e.g. mean, min, max), and each of these measures is a column of the dataset:
{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
I want to plot them in a grid where each row of subplots is a variable and each column of subplots is a measure.
I tried this:
# Measure columns available for each weather variable, keyed by variable name.
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": [
        "meanPressurechange",
        "minPressurechange",
        "maxPressurechange",
    ],
    "pressureMeanSeaLevel": [
        "meanPressuremeansealevel",
        "minPressuremeansealevel",
        "maxPressuremeansealevel",
    ],
    "relativeHumidity": [
        "meanRelativehumidity",
        "minRelativehumidity",
        "maxRelativehumidity",
    ],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": [
        "meanTemperaturedewpoint",
        "minTemperaturedewpoint",
        "maxTemperaturedewpoint",
    ],
    "temperatureFeelsLike": [
        "meanTemperaturefeelslike",
        "minTemperaturefeelslike",
        "maxTemperaturefeelslike",
    ],
    "uvIndex": ["modeUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Variable names in declaration order (dicts preserve insertion order).
variable_types = list(variable_columns)
# Asker's original attempt, kept verbatim. Its indentation was lost when the
# snippet was pasted, so the extent of the loop body cannot be seen here.
# Intent: for each variable type, select its measure columns and draw their
# histograms.
for i, var_type in enumerate(variable_types):
columns = variable_columns[var_type]
print(columns)
# NOTE(review): assuming this line sits inside the loop, a brand-new figure
# with len(variable_types) rows is created on every iteration instead of one
# shared grid being created once -- TODO confirm against the indented original.
fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
# Flatten the axes array so that it can be indexed with a single integer.
axes = axes.flat
data = METEO_COMPLETO[columns]
# None of the column names listed in variable_columns contains '_', so
# split('_')[0] is the whole name and this is a plain alphabetical sort.
ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
print(ordered_columns)
# axes[i] is the i-th Axes of the flattened grid, not the first Axes of row i.
# NOTE(review): DataFrame.hist draws one subplot per column; handing it a
# single Axes together with several columns presumably makes pandas clear and
# re-lay-out the figure -- verify against the pandas documentation.
data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
axes[i].set_title(f'Distribución de {var_type}')
axes[i].set_xlabel('Valor')
axes[i].set_ylabel('Frecuencia')
axes[i].legend(ordered_columns)
plt.tight_layout()
plt.show()
But I got this result (showing the last 4 variables):
Another possible option is to display the available measures in the same graph with different colors to see the distributions of the minimum, mean and maximum value of each variable as a whole.
For me it is easier to keep the grid layout of such a plot in mind by nesting two for loops: one over the rows of the plot (the variables) and one over the columns of the plot (which I have named "statistics").
However, the whole figure becomes too large to format nicely, so I guess a single figure is not the best way to present this data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Measure columns of the example dataset, keyed by weather variable.
# Column names follow the pattern "<statistic><Variable>", where the variable
# name has only its first letter upper-cased (e.g. "maxPressurechange").
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
    "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
    "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
    "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
    "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Generate fake data: 1000 standard-normal samples per column.
# A seeded generator is used so that the example figure is identical on every
# run; the unseeded global random state gives different histograms each time.
rng = np.random.default_rng(seed=0)
meteo_data = {}
for cols in variable_columns.values():
    for col in cols:
        meteo_data[col] = rng.standard_normal(1000)
METEO_COMPLETO = pd.DataFrame(meteo_data)
# Column headers of the plot grid. Each entry is also the prefix used to build
# the DataFrame column name, so a column whose prefix is not listed here
# (for instance a "mode..." column) is never plotted.
statistics = ["max", "mean", "min"]

n_rows = len(variable_columns)
n_cols = len(statistics)

# Let the figure grow with the grid: a fixed 5x5 inch canvas is far too small
# for one row of subplots per variable. squeeze=False keeps `axes`
# two-dimensional even with a single variable or a single statistic, so the
# nested iteration below always works.
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(4 * n_cols, 2.5 * n_rows),
    squeeze=False,
)

# Outer loop: one grid row per variable.
for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
    # Inner loop: one grid column per statistic.
    for ax, stat in zip(axes_row, statistics):
        # Column names are "<stat><Variable>" with only the first letter of
        # the variable upper-cased, e.g. "max" + "Pressurechange".
        meteo_column = stat + variable.capitalize()
        try:
            data = METEO_COMPLETO[meteo_column]
        except KeyError:
            # This statistic does not exist for the variable
            # (e.g. "maxWinddirection"): hide the empty subplot and move on.
            ax.axis("off")
            continue

        # Drop missing values before binning and label the bars with the
        # column actually plotted, so that the legend names that column.
        ax.hist(data.dropna(), label=meteo_column)

        # Every subplot names its variable; the top row additionally carries
        # the statistic as a column header instead of replacing the title.
        title = f"Distribución de {variable}"
        if i_row == 0:
            title = f"{stat}\n{title}"
        ax.set_title(title)
        ax.set_xlabel("Valor")
        ax.set_ylabel("Frecuencia")
        ax.legend()

plt.tight_layout()
plt.show()