I made a function that can plot statistics for large arrays (10**8 elements)
in less than 2 seconds. How can I scale the Y-axis
so that the area under the graph is equal to 1?
def dis(inp):
    """Plot a 64-bin count histogram of ``inp`` and print summary statistics.

    ``inp`` is flattened to one dimension; if it exposes a ``numpy``
    attribute, ``inp.numpy()`` is flattened instead.  The histogram is
    drawn as a step line with vertical markers at the mean and at
    1, 2 and 3 standard deviations on either side of it.  Afterwards the
    shape (only when ``inp.ndim > 1``), element count, mean, std, min and
    max are printed.  Nothing is returned.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    has_numpy = getattr(inp, "numpy", None) is not None
    inp1d = np.reshape(inp.numpy() if has_numpy else inp, [-1])

    n_bins = 64
    frame = vaex.from_arrays(x=inp1d)
    x_min, x_max = frame.minmax(frame.x)

    # With edges=True the result carries extra border entries around the
    # n_bins regular bins (presumably nan / below-min in front and
    # above-max at the end -- confirm against the vaex documentation).
    counts = frame.count(binby=frame.x, shape=n_bins, limits='minmax', edges=True)
    counts[-2] += counts[-1]   # fold the trailing border entry into the last bin
    counts[-1] = counts[-2]    # repeat the last bin so there is one value per edge
    counts = counts[2:]        # drop the two leading border entries
    hist_height = np.max(counts)
    edges = np.linspace(x_min, x_max, n_bins + 1)

    mean = frame.mean(frame.x)
    std = frame.std(frame.x)

    # Vertical markers at mean + k*std for k = -3..3; the mean itself
    # (k == 0) is drawn taller and thicker than the others.
    for k in range(-3, 4):
        pos = k * std + mean
        if k == 0:
            rel_height, width = 1.02, 1
        else:
            rel_height, width = 0.97, 0.5
        plt.plot([pos, pos], [0, hist_height * rel_height], color='#34A853', linewidth=width)

    plt.step(edges, counts, where='post', color='#4285F4', linewidth=1)
    plt.show()

    # The shape is only shown for inputs with more than one dimension.
    if getattr(inp, "shape", None) is not None and inp.ndim > 1:
        shape_prefix = str(inp.shape) + " "
    else:
        shape_prefix = ""
    print(f'{shape_prefix}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
# Example input: 10**8 samples drawn from a normal distribution (mean 0, std 1).
x = np.random.normal(0, 1, (10**8, ))
Complete answer, if somebody wants to know how to plot big data statistics:
def dis(inp):
    """Plot a 64-bin density histogram of ``inp`` and print summary statistics.

    ``inp`` is flattened to one dimension; if it exposes a ``numpy``
    attribute, ``inp.numpy()`` is flattened instead.  Bin heights are
    scaled so that the area under the histogram is 1.  The histogram is
    drawn as a filled outline with vertical markers at the mean and at
    1, 2 and 3 standard deviations on either side of it.  Afterwards the
    shape (only when ``inp.ndim > 1``), element count, mean, std, min and
    max are printed.  Nothing is returned.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    if getattr(inp, "numpy", None) is not None:
        inp1d = np.reshape(inp.numpy(), [-1])
    else:
        inp1d = np.reshape(inp, [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # With edges=True the result carries extra border entries around the
    # bin_count regular bins (presumably nan / below-min in front and
    # above-max at the end -- confirm against the vaex documentation).
    bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    bins[-2] += bins[-1]   # fold the trailing border entry into the last bin
    bins = bins[2:-1]      # keep only the bin_count regular bins
    edges = np.linspace(x_min, x_max, bin_count + 1)
    mean = df.mean(df.x)
    std = df.std(df.x)

    # Scale AUC to 1.  This must use the per-bin counts *before* they are
    # duplicated for drawing below: summing the duplicated array would
    # count every sample twice and leave the area at 0.5 instead of 1.
    step = (x_max - x_min) / bin_count
    population = np.sum(bins)
    surface = population * step
    bins = bins / surface

    # Turn each bin into two points (left edge, right edge) at the same
    # height so that a plain line plot traces the histogram outline.
    left, right = edges[:-1], edges[1:]
    edges = np.reshape(np.array([left, right]).T, [-1])
    bins = np.reshape(np.array([bins, bins]).T, [-1])
    hist_height = np.max(bins)

    # Vertical markers at mean + k*std for k = -3..3; the mean itself
    # (index 3) is drawn taller and thicker than the others.
    for i, v in enumerate([k * std + mean for k in range(-3, 4)]):
        if i == 3:
            plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
        else:
            plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

    plt.fill_between(edges, bins, step="pre", alpha=0.3)
    plt.plot(edges, bins, color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
To moderators: this site doesn't allow me to post code, even if it is the answer: "It looks like your post is mostly code; please add some more details."
The idea is to normalise your data set, i.e. to divide the height of each column by the AUC (area under the curve) of your histogram.
Before "plt.step(...)", write:
# Normalise so the area under the histogram is 1 (counts -> density).
# In the question's code the last entry of `bins` is a copy of the final
# bin (added only so plt.step can draw it), so it is left out of the
# total; including it would make the area come out slightly below 1.
step = (x_max - x_min) / bin_count
population = np.sum(bins[:-1])
surface = population * step
bins = bins / surface
Hope that helps.