I want to return the number of scatter points that occupy a specific area. Normally, I would do this by using a 2D histogram and pcolormesh.
But what if I wanted to define bins with irregular shapes and sizes that don't form a regular grid? How would I do this?
Below is an example of my dataset.
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
import numpy as np
# Two point sets (plotted red and blue below): each array holds 400 rows of
# 10 random integers drawn from [0, 80).
x1 = np.random.randint(80, size=(400, 10))
y1 = np.random.randint(80, size=(400, 10))
x2 = np.random.randint(80, size=(400, 10))
y2 = np.random.randint(80, size=(400, 10))

fig, ax = plt.subplots()
ax.grid(False)

# Only the first row of each set is plotted.
plt.scatter(x1[0], y1[0], c='r', zorder=2)
plt.scatter(x2[0], y2[0], c='b', zorder=2)

# Arc centres, the arc half-opening angle in degrees, and the circle centre.
ang1 = 0, 50
ang2 = 100, 50
angle = math.degrees(math.acos(5.5/9.15))
xy = 50, 50

# Bin boundaries: a vertical line at x = 50 and one arc on each side.
Halfway = mpl.lines.Line2D((50, 50), (0, 100), c='white')
arc1 = mpl.patches.Arc(ang1, 65, 100, angle=0,
                       theta1=360 - angle, theta2=angle, lw=2)
arc2 = mpl.patches.Arc(ang2, 65, 100, angle=0,
                       theta1=180 - angle, theta2=180 + angle, lw=2)
Oval = mpl.patches.Ellipse(xy, 100, 100, lw=3, alpha=0.1)

ax.add_line(Halfway)
for patch in (arc1, arc2, Oval):
    ax.add_patch(patch)

# Label the four bins.
for (tx, ty), label in (((15, 75), '1'), ((35, 90), '2'),
                        ((65, 90), '3'), ((85, 75), '4')):
    plt.text(tx, ty, label, fontsize=8)

ax.autoscale()
plt.draw()
The bins I want to set are labelled 1-4. Is it possible to set coordinates that return those bins?
If I can set these coordinates, I then want to return the bin that each scatter point occupies. Output:
Update:
If I wanted an export that displayed xy's in each bin for each row in the scatter plot I would write out (x1[0], y1[0])
and transpose the data to return:
1 2 3 4
0 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
Then I would change (x1[0], y1[0])
to (x1[1], y1[1])
to get the second row of data.
1 2 3 4
1 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
Then I would combine those to create:
1 2 3 4
0 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
1 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
I've got 1000's of rows so I'm trying to create a method to use the entire (x1, y1)
to produce the coordinates in each bin for each row of data.
Intended Output:
1 2 3 4
0 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
1 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
2 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
3 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
4 [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)] [(x,y),(x,y)]
5....
6....
If I try (x1, y1)
I get the error:
err = (arc_vertices[:,0] - x)**2 + (arc_vertices[:,1] - y)**2
ValueError: operands could not be broadcast together with shapes (70,) (10,)
I'm really not happy with this approach. A better approach would be to calculate, for each data point's x coordinate, the corresponding y coordinate on the curve, and compare the point against that.
This approach works similarly, but uses the arc's finite vertices:
# Map each arc's vertices through the inverse of the axes' data transform so
# they can be compared directly with the scatter points' x/y values.
arc1v = ax.transData.inverted().transform(arc1.get_verts())
arc2v = ax.transData.inverted().transform(arc2.get_verts())

for (x, y) in zip(x1[0], y1[0]):
    # Squared distance from this point to every vertex of the left arc.
    err = (arc1v[:, 0] - x)**2 + (arc1v[:, 1] - y)**2
    # np.argmin returns the index of the first minimum.
    nearest = arc1v[np.argmin(err)]

    # Illustration only: draw a line from the point to its nearest vertex.
    line_x = (x, nearest[0])
    line_y = (y, nearest[1])
    ax.add_line(mpl.lines.Line2D(line_x, line_y))

    # Up-facing triangle when the point is right of its nearest vertex,
    # down-facing triangle otherwise.
    marker = '^' if x > nearest[0] else 'v'
    ax.scatter(x, y, marker=marker, s=100, c='k', zorder=1)
This "labels" points on the left of the (left) curve with a down-facing triangle and points on the right of it with an up-facing triangle. The lines on the graph point to the nearest defined vertex on the curve and are for illustration only.
You could do this for the other curve as well, and the bin 2/3 division is straightforward.
Here's an example output figure:
Here's a more complete answer:
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
import numpy as np
BIN_23_X = 50  # The separator between bin 2 and 3

# Two point sets (plotted red and blue below): each array holds 400 rows of
# 10 random integers drawn from [0, 80).
x1 = np.random.randint(80, size=(400, 10))
y1 = np.random.randint(80, size=(400, 10))
x2 = np.random.randint(80, size=(400, 10))
y2 = np.random.randint(80, size=(400, 10))

fig, ax = plt.subplots()
ax.grid(False)

# Only the first row of each set is plotted.
plt.scatter(x1[0], y1[0], c='r', zorder=2)
plt.scatter(x2[0], y2[0], c='b', zorder=2)

# Arc centres, the arc half-opening angle in degrees, and the circle centre.
ang1 = 0, 50
ang2 = 100, 50
angle = math.degrees(math.acos(5.5/9.15))
xy = 50, 50

# Bin boundaries: a vertical line at x = BIN_23_X and one arc on each side.
Halfway = mpl.lines.Line2D((BIN_23_X, BIN_23_X), (0, 100), c='white')
arc1 = mpl.patches.Arc(ang1, 65, 100, angle=0,
                       theta1=360 - angle, theta2=angle, lw=2)
arc2 = mpl.patches.Arc(ang2, 65, 100, angle=0,
                       theta1=180 - angle, theta2=180 + angle, lw=2)
Oval = mpl.patches.Ellipse(xy, 100, 100, lw=3, alpha=0.1)

ax.add_line(Halfway)
for patch in (arc1, arc2, Oval):
    ax.add_patch(patch)

# Label the four bins.
for (tx, ty), label in (((15, 75), '1'), ((35, 90), '2'),
                        ((65, 90), '3'), ((85, 75), '4')):
    plt.text(tx, ty, label, fontsize=8)
# Classification helpers
def get_nearest_arc_vert(x, y, arc_vertices):
    """Return the vertex in *arc_vertices* that lies closest to (x, y).

    Args:
        x: Scalar x coordinate of the query point.
        y: Scalar y coordinate of the query point.
        arc_vertices: 2-D array with one vertex per row; column 0 holds the
            x coordinates and column 1 the y coordinates.

    Returns:
        The row of *arc_vertices* with the smallest squared Euclidean
        distance to (x, y). On ties, the first such row is returned.
    """
    err = (arc_vertices[:, 0] - x)**2 + (arc_vertices[:, 1] - y)**2
    # np.argmin returns the index of the first minimum, so ties resolve to
    # the earliest vertex.
    return arc_vertices[np.argmin(err)]
# Map each arc's vertices through the inverse of the axes' data transform so
# they can be compared directly with the scatter points' x/y values.
arc1v, arc2v = (
    ax.transData.inverted().transform(arc.get_verts())
    for arc in (arc1, arc2)
)
def classify_pointset(vx, vy):
    """Sort points into bins 1-4 using the two arcs and the bin 2/3 separator.

    A point is placed in bin 1 when it is left of its nearest ``arc1v``
    vertex, in bin 4 when it is right of its nearest ``arc2v`` vertex, and
    otherwise in bin 2 or 3 depending on which side of ``BIN_23_X`` it lies.
    Points exactly on the separator go to bin 3.

    Args:
        vx: x coordinates. Either a 1-D sequence (one point set) or a 2-D
            array (one point set per row).
        vy: y coordinates, with the same layout as *vx*.

    Returns:
        For 1-D input, a dict keyed by bin number (1-4) whose values are
        lists of ``(x, y)`` tuples. For 2-D input, a list with one such dict
        per row, in row order.
    """
    # 2-D input: classify each row separately instead of failing on the
    # per-point distance computation.
    if np.ndim(vx) == 2 and np.ndim(vy) == 2:
        return [classify_pointset(row_x, row_y) for row_x, row_y in zip(vx, vy)]

    bins = {k: [] for k in range(1, 5)}
    for (x, y) in zip(vx, vy):
        nx1, _ = get_nearest_arc_vert(x, y, arc1v)
        nx2, _ = get_nearest_arc_vert(x, y, arc2v)

        if x < nx1:  # Is this point in bin 1? To the left of arc1?
            key = 1
        elif x > nx2:  # Is this point in bin 4? To the right of arc2?
            key = 4
        elif x < BIN_23_X:  # Left of the separator => bin 2
            key = 2
        else:  # On or right of the separator => bin 3
            key = 3
        bins[key].append((x, y))
    return bins
# Classify points (first row of each point set)
bins_red = classify_pointset(x1[0], y1[0])
bins_blue = classify_pointset(x2[0], y2[0])

# Display classifications: one "(bin, points)" tuple per line
print("Red:")
for item in bins_red.items():
    print(" ", item)
print("Blue:")
for item in bins_blue.items():
    print(" ", item)

# "Annotate" classifications: draw a marker behind every point whose shape
# and colour identify its bin (bin number -> (marker, colour)).
bin_styles = {1: ('^', 'k'), 2: ('v', 'k'), 3: ('^', 'y'), 4: ('v', 'y')}
for bin_key, (marker, colour) in bin_styles.items():
    for (x, y) in (bins_red[bin_key] + bins_blue[bin_key]):
        ax.scatter(x, y, marker=marker, s=100, c=colour, zorder=1)

ax.autoscale()
plt.draw()
plt.show()
Produces:
Here, points are "annotated" with shapes behind them corresponding to which bins they were classified into:
Bin     Anno. Color   Triangle Pointing
-------------------------------------------
Bin 1   Black         Up
Bin 2   Black         Down
Bin 3   Yellow        Up
Bin 4   Yellow        Down
The code also displays the classification results. The output of classify_pointset
is a dict keyed on bin number (1-4), with each value being the list of coordinates of the points found to be in that bin:
Red:
  (1, [(14, 30), (4, 18), (12, 48)])
  (2, [(49, 41)])
  (3, [(62, 79), (50, 7), (68, 19), (71, 1), (59, 27), (77, 0)])
  (4, [])
Blue:
  (1, [(20, 74), (11, 17), (12, 75)])
  (2, [(41, 19), (30, 15)])
  (3, [(61, 75)])
  (4, [(79, 73), (69, 58), (76, 34), (78, 65)])
You don't have to annotate the figure graphically, it's just there for illustration, you can just use the dicts returned by classify_pointset
(bins_red
and bins_blue
).
The following code produces a list of lists (still 1-indexed), so you can find all the points (both red and blue) in bin 1 by accessing all_points[1]
. The first element (index 0) in the all_points
list is None
, since we're keeping the list 1-indexed.
# Generate a list of lists, the outer index corresponds to the bin number (1-indexed).
# Index 0 is a None placeholder so that all_points[k] is bin k.
all_points = [None] + [bins_red[bin_key] + bins_blue[bin_key]
                       for bin_key in range(1, 5)]

# Just for display.
for bin_key, bin_points in enumerate(all_points):
    print(bin_key, bin_points)
Output:
0 None
1 [(1, 8), (16, 72), (23, 67), (12, 19), (24, 51), (24, 47), (15, 23), (18, 51)]
2 [(39, 75), (35, 27), (48, 55), (45, 53), (45, 22)]
3 [(66, 58), (55, 64), (70, 1), (71, 15), (73, 3), (71, 75)]
4 [(74, 62)]