I have a dataframe df:
import pandas as pd
data = {
"latitude": [49.5659508, 49.568089, 49.5686342, 49.5687609, 49.5695834, 49.5706579, 49.5711228, 49.5716422, 49.5717749, 49.5619579, 49.5619579, 49.5628938, 49.5628938, 49.5630028, 49.5633175, 49.56397639999999, 49.566359, 49.56643220000001, 49.56643220000001, 49.5672061, 49.567729, 49.5677449, 49.5679685, 49.5679685, 49.5688543, 49.5690616, 49.5713705],
"longitude": [10.9873409, 10.9894035, 10.9896749, 10.9887881, 10.9851579, 10.9853273, 10.9912959, 10.9910182, 10.9867083, 10.9995758, 10.9995758, 11.000319, 11.000319, 10.9990996, 10.9993819, 11.004145, 11.0003023, 10.9999593, 10.9999593, 10.9935709, 11.0011213, 10.9954016, 10.9982288, 10.9982288, 10.9975928, 10.9931367, 10.9939141],
"cluster": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2],
"dup_location_count": [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0]
}
df = pd.DataFrame(data)
df.head(11)
latitude longitude cluster dup_location_count
0 49.565951 10.987341 0 0
1 49.568089 10.989403 0 0
2 49.568634 10.989675 0 0
3 49.568761 10.988788 0 0
4 49.569583 10.985158 0 0
5 49.570658 10.985327 0 0
6 49.571123 10.991296 0 0
7 49.571642 10.991018 0 0
8 49.571775 10.986708 0 0
9 49.561958 10.999576 1 2
10 49.561958 10.999576 1 2
The columns latitude and longitude represent the spatial coordinates of people. The column cluster represents the cluster; people who live in the same building or close to each other are usually in the same cluster. Each cluster has a size of 9 people. The column dup_location_count represents the number of other people who share exactly the same coordinates.
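As an aside, dup_location_count can be recomputed from the coordinates alone. A minimal sketch, assuming exact float equality defines "the same coordinates" and matching the sample values above (each member of a duplicated pair gets 2, unique locations get 0):

# Sketch: recompute dup_location_count from the coordinates (assumes exact float equality)
group_size = df.groupby(["latitude", "longitude"])["latitude"].transform("size")
df["dup_location_count"] = group_size.where(group_size > 1, 0)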
I am looking for an automatic way to move people who are in the same cluster but have exactly the same coordinates into a different cluster (see, for example, indices 9 and 10), preferably into a cluster that is "relatively close" to the original one. I have no exact definition of "relatively close", but clusters with similar cluster numbers are closer to each other.
Note: make sure that even after shifting people into different clusters, the cluster size stays the same (9).
Ideally I end up with a dataframe where no two people in the same cluster live at the same place and each cluster contains exactly 9 rows.
The original dataframe has 3k rows, so I need some kind of algorithm to do the job. Any ideas?
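Whatever the algorithm, the rows that actually need to be moved can be flagged directly. A minimal sketch, assuming "the same coordinates" means exact float equality:

# Sketch: flag rows that share both their cluster and their exact coordinates
conflict = df.duplicated(subset=["cluster", "latitude", "longitude"], keep=False)
print(df[conflict])  # for the example data this includes indices 9 and 10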
Manual solution
# sort values
df.sort_values(by=["cluster", "latitude", "longitude"], inplace=True)
df.reset_index(drop=True, inplace=True)
# swap people
cluster_size = 9
temp_value = df.loc[2, "cluster"]
df.loc[2, "cluster"] = df.loc[2+cluster_size, "cluster"]
df.loc[2+cluster_size, "cluster"] = temp_value
temp_value = df.loc[4, "cluster"]
df.loc[4, "cluster"] = df.loc[4+cluster_size, "cluster"]
df.loc[4+cluster_size, "cluster"] = temp_value
temp_value = df.loc[5, "cluster"]
df.loc[5, "cluster"] = df.loc[5+2*cluster_size, "cluster"]
df.loc[5+2*cluster_size, "cluster"] = temp_value
temp_value = df.loc[18, "cluster"]
df.loc[18, "cluster"] = df.loc[18+cluster_size, "cluster"]
df.loc[18+cluster_size, "cluster"] = temp_value
temp_value = df.loc[20, "cluster"]
df.loc[20, "cluster"] = df.loc[20+cluster_size, "cluster"]
df.loc[20+cluster_size, "cluster"] = temp_value
# this row is at the end of its cluster, therefore swap backwards
df.sort_values(by=["cluster", "latitude", "longitude"], inplace=True)
shift_value = cluster_size - 1
temp_value = df.loc[30, "cluster"]
df.loc[30, "cluster"] = df.loc[30-shift_value, "cluster"]
df.loc[30-shift_value, "cluster"] = temp_value
This way I end up with the final result:
data = {
"latitude": [49.5633175, 49.5659508, 49.566359, 49.56643220000001, 49.56643221, 49.567729, 49.567729, 49.568089, 49.5687609, 49.5630028, 49.5659508, 49.56643220000001, 49.56643221, 49.5686342, 49.5695834, 49.5706579, 49.5716422, 49.5717749, 49.5619579, 49.5628938, 49.5633175, 49.56397639999999, 49.56397639999999, 49.566359, 49.56643221, 49.5677449, 49.5679685, 49.5619579, 49.5628938, 49.5630028, 49.5672061, 49.5679685, 49.5688543, 49.5690616, 49.5711228, 49.5713705],
"longitude": [10.9993819, 10.9999593, 11.0003023, 10.9999593, 11.001122, 10.9982288, 11.0011213, 10.9894035, 10.9887881, 10.9873409, 10.9873409, 10.9999593, 11.001122, 10.9896749, 10.9851579, 10.9853273, 10.9910182, 10.9867083, 10.9995758, 11.000319, 11.0003023, 10.9999593, 11.004145, 10.9935709, 11.001122, 10.9954016, 10.9982288, 10.9995758, 11.000319, 10.9990996, 10.9935709, 10.9982288, 10.9975928, 10.9931367, 10.9912959, 10.9939141],
"cluster": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3]
}
df = pd.DataFrame(data)
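Side note on the manual solution: the repeated swap block can be wrapped in a small helper so each swap is a single call. A sketch (swap_cluster is a name made up for illustration, not an existing function):

def swap_cluster(df, i, j):
    # Swap the cluster labels of the rows with index labels i and j
    df.loc[i, "cluster"], df.loc[j, "cluster"] = df.loc[j, "cluster"], df.loc[i, "cluster"]

# e.g. the first swap from above:
# swap_cluster(df, 2, 2 + cluster_size)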
Here is another way to do it with Pandas sample and concat methods:
CLUSTER_SIZE = 9

df = df.drop(columns=["cluster", "dup_location_count"])
df_copy = df.copy(deep=True)
dfs = []

while True:
    for cluster_number in range(1 + df_copy.shape[0] // CLUSTER_SIZE):
        # Select a sample without duplicated coordinates
        while True:
            tmp = df_copy.sample(n=CLUSTER_SIZE, replace=False)
            if (
                tmp.drop_duplicates(subset=["latitude", "longitude"]).shape[0]
                == CLUSTER_SIZE
            ):
                break
        # Add new cluster number
        tmp["cluster"] = cluster_number
        dfs.append(tmp)
        # Remove sample from original dataframe
        df_copy = df_copy.drop(labels=tmp.index)
        if df_copy.shape[0] <= CLUSTER_SIZE:
            df_copy["cluster"] = cluster_number + 1
            dfs.append(df_copy)
            break
    # Check that no cluster contains duplicates
    for item in dfs:
        if item.duplicated(subset=["latitude", "longitude"]).sum():
            # Start again
            df_copy = df.copy(deep=True)
            dfs = []
            break
    else:  # if no duplicates found in any cluster, exit loop
        break

new_df = pd.concat(dfs).sort_values(
    by=["cluster", "latitude", "longitude"],
    ignore_index=True,
)
Then:
print(new_df)
# Output
latitude longitude cluster
0 49.562894 11.000319 0
1 49.563003 10.999100 0
2 49.566359 11.000302 0
3 49.568089 10.989403 0
4 49.568761 10.988788 0
5 49.569583 10.985158 0
6 49.571123 10.991296 0
7 49.571371 10.993914 0
8 49.571642 10.991018 0
9 49.561958 10.999576 1
10 49.562894 11.000319 1
11 49.563317 10.999382 1
12 49.565951 10.987341 1
13 49.566432 10.999959 1
14 49.567729 11.001121 1
15 49.567745 10.995402 1
16 49.567968 10.998229 1
17 49.569062 10.993137 1
18 49.561958 10.999576 2
19 49.563976 11.004145 2
20 49.566432 10.999959 2
21 49.567206 10.993571 2
22 49.567968 10.998229 2
23 49.568634 10.989675 2
24 49.568854 10.997593 2
25 49.570658 10.985327 2
26 49.571775 10.986708 2
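To check that the result meets both requirements (no shared coordinates within a cluster, and 9 rows per cluster except possibly the last remainder cluster), something along these lines can be run afterwards:

# Any row that duplicates (cluster, latitude, longitude) violates the "no shared location per cluster" rule
n_conflicts = new_df.duplicated(subset=["cluster", "latitude", "longitude"]).sum()
print("rows sharing coordinates within a cluster:", n_conflicts)  # expect 0

# Cluster sizes: all 9, except possibly the last cluster holding the remainder
print(new_df.groupby("cluster").size())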