Tags: python, python-3.x, k-means

Clustering positions on a map where each cluster has an equal number of points


I have specific points on a map that I need grouped into clusters of equal size, where the last cluster may hold the remainder (count % n points). I read these answers 1, 2, and 3, but none of them helped, and the different approaches I tried did not work either. In this code I set n_clusters=4 because that is the number of clusters that works best for me: I can sort the points within each cluster, take the n best points from each sorted cluster, and then continue through all the points. For example, I need the 32 points shown in the figure below to be clustered into 4 clusters of 8 points each.

[figure: the 32 points plotted on the map]

from pandas import DataFrame
from sklearn.cluster import KMeans

dfcluster = DataFrame(position, columns=['x', 'y'])
kmeans = KMeans(n_clusters=4).fit(dfcluster)
centroids = kmeans.cluster_centers_

# plt.scatter(dfcluster['x'], dfcluster['y'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
# plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
# plt.show()
dfcluster['cluster'] = kmeans.labels_
dfcluster = dfcluster.drop_duplicates(['x', 'y'], keep='last')
dfcluster = dfcluster.sort_values(['cluster', 'x', 'y'], ascending=True)
# n = 8
# d = pd.DataFrame()
# for x in range(4):
#     m = dfcluster[dfcluster.cluster == x]
#     if len(m) > int(n / 2) - 1:
#         m = m.head(int(n / 2) - 1)
#     d = d.append(m, ignore_index=True)
# if len(d) >= n:
#     dfcluster = d
# dfcluster.groupby('cluster').nth(n)
n = 8  # number of points per cluster (was only set in the commented block above)
dfcluster = dfcluster.head(n)
if len(dfcluster) < n:
    change_df()

Solution

  • I found this module, which implements Same-Size Constrained K-Means: it uses heuristic (or minimum-cost-flow) methods to split the data into clusters of equal size.

    I started with pip install size-constrained-clustering or pip install git+https://github.com/jingw2/size_constrained_clustering.git; you can then use either the min-cost-flow or the heuristics method.

    from size_constrained_clustering import equal
    import numpy as np

    n_samples = 2000
    n_clusters = 3
    X = np.random.rand(n_samples, 2)
    
    model = equal.SameSizeKMeansMinCostFlow(n_clusters)
    
    #model = equal.SameSizeKMeansHeuristics(n_clusters)
    model.fit(X)
    centers = model.cluster_centers_
    labels = model.labels_
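
    To check that the equal-size constraint held, you can count the points per label (a quick sanity check, continuing the snippet above):

    sizes = np.bincount(labels)
    print(sizes)  # with 2000 points and 3 clusters, e.g. [667 667 666]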
    

    If you have a problem with the size-constrained-clustering module, you can use the two classes below instead; you only need to install k-means-constrained:

    pip install k-means-constrained
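
    For the question's data, KMeansConstrained can also be used directly, without any wrapper class. A minimal sketch (the random array here is a stand-in for the question's position list):

    from k_means_constrained import KMeansConstrained
    import numpy as np

    X = np.random.rand(32, 2)  # stand-in for the 32 map points
    clf = KMeansConstrained(n_clusters=4, size_min=8, size_max=8, random_state=0)
    clf.fit(X)
    print(np.bincount(clf.labels_))  # [8 8 8 8]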

    SameSizeKMeansMinCostFlow class

    from k_means_constrained import KMeansConstrained
    import warnings
    import base  # the Base class, defined in base.py below
    from scipy.spatial.distance import cdist
    class SameSizeKMeansMinCostFlow(base.Base):
    
        def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42):
            '''
            Args:
                n_clusters (int): number of clusters
                max_iters (int): maximum iterations
                distance_func (object): callable function with input (X, centers) / None, by default is l2-distance
                random_state (int): random state to initiate, by default it is 42
            '''
            super(SameSizeKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func)
            self.clf = None
    
        def fit(self, X):
            n_samples, n_features = X.shape
            # Equal-size constraint: every cluster gets between floor(n/k)
            # and ceil(n/k) points.
            minsize = n_samples // self.n_clusters
            maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters

            clf = KMeansConstrained(self.n_clusters, size_min=minsize,
                                    size_max=maxsize)

            if minsize != maxsize:
                warnings.warn("n_samples is not divisible by n_clusters: "
                              "cluster sizes will be {} or {}".format(minsize, maxsize))
    
            clf.fit(X)
    
            self.clf = clf
            self.cluster_centers_ = self.clf.cluster_centers_
            self.labels_ = self.clf.labels_
    
        def predict(self, X):
            return self.clf.predict(X)
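
    A short usage sketch for this wrapper, mirroring the question's 32 points in 4 clusters of 8 (X below is random stand-in data):

    import numpy as np

    X = np.random.rand(32, 2)
    model = SameSizeKMeansMinCostFlow(4)
    model.fit(X)
    print(np.bincount(model.labels_))  # [8 8 8 8]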
    

    base class

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    '''
    @file: base.py, base for clustering algorithm
    @Author: Jing Wang ([email protected])
    @Date: 06/07/2020
    '''
    from scipy.spatial.distance import cdist
    import numpy as np 
    import warnings
    import scipy.sparse as sp
    
    import os 
    import sys 
    path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(path)
    from k_means_constrained.sklearn_import.utils.extmath import stable_cumsum
    
    class Base(object):
    
        def __init__(self, n_clusters, max_iters, distance_func=cdist):
            '''
            Base Cluster object
    
            Args:
                n_clusters (int): number of clusters 
                max_iters (int): maximum iterations
                distance_func (callable function): distance function callback
            '''
            assert isinstance(n_clusters, int)
            assert n_clusters >= 1
            assert isinstance(max_iters, int)
            assert max_iters >= 1
            self.n_clusters = n_clusters 
            self.max_iters = max_iters
            if distance_func is not None and not callable(distance_func):
                raise Exception("Distance function is not callable")
            self.distance_func = distance_func
    
        def fit(self, X):
            pass 
    
        def predict(self, X):
            pass 
    
    def k_init(X, n_clusters, x_squared_norms, random_state=42, distance_func=cdist, n_local_trials=None):
        """Init n_clusters seeds according to k-means++
    
        Parameters
        ----------
        X : array or sparse matrix, shape (n_samples, n_features)
            The data to pick seeds for. To avoid memory copy, the input data
            should be double precision (dtype=np.float64).
    
        n_clusters : integer
            The number of seeds to choose
    
        x_squared_norms : array, shape (n_samples,)
            Squared Euclidean norm of each data point.
    
        random_state : int, RandomState instance
            The generator used to initialize the centers. Use an int to make the
            randomness deterministic.
            See :term:`Glossary <random_state>`.
    
        n_local_trials : integer, optional
            The number of seeding trials for each center (except the first),
            of which the one reducing inertia the most is greedily chosen.
            Set to None to make the number of trials depend logarithmically
            on the number of seeds (2+log(k)); this is the default.
    
        Notes
        -----
        Selects initial cluster centers for k-mean clustering in a smart way
        to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
        "k-means++: the advantages of careful seeding". ACM-SIAM symposium
        on Discrete algorithms. 2007
    
        Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
        which is the implementation used in the aforementioned paper.
        """
        n_samples, n_features = X.shape

        # Accept an int seed (the default is 42): convert it to a
        # RandomState instance so .randint and .random_sample work below.
        if isinstance(random_state, int):
            random_state = np.random.RandomState(random_state)
    
        centers = np.empty((n_clusters, n_features), dtype=X.dtype)
    
        assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
    
        # Set the number of local seeding trials if none is given
        if n_local_trials is None:
            # This is what Arthur/Vassilvitskii tried, but did not report
            # specific results for other than mentioning in the conclusion
            # that it helped.
            n_local_trials = 2 + int(np.log(n_clusters))
    
        # Pick first center randomly
        center_id = random_state.randint(n_samples)
        if sp.issparse(X):
            centers[0] = X[center_id].toarray()
        else:
            centers[0] = X[center_id]
    
        # Initialize list of closest distances and calculate current potential
        closest_dist_sq = distance_func(
            centers[0, np.newaxis], X)
        current_pot = closest_dist_sq.sum()
    
        # Pick the remaining n_clusters-1 points
        for c in range(1, n_clusters):
            # Choose center candidates by sampling with probability proportional
            # to the squared distance to the closest existing center
            rand_vals = random_state.random_sample(n_local_trials) * current_pot
            candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                            rand_vals)
            # XXX: numerical imprecision can result in a candidate_id out of range
            np.clip(candidate_ids, None, closest_dist_sq.size - 1,
                    out=candidate_ids)
    
            # Compute distances to center candidates
            # distance_to_candidates = euclidean_distances(
            #     X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
            distance_to_candidates = distance_func(X[candidate_ids], X)
    
            # update closest distances squared and potential for each candidate
            np.minimum(closest_dist_sq, distance_to_candidates,
                       out=distance_to_candidates)
            candidates_pot = distance_to_candidates.sum(axis=1)
    
            # Decide which candidate is the best
            best_candidate = np.argmin(candidates_pot)
            current_pot = candidates_pot[best_candidate]
            closest_dist_sq = distance_to_candidates[best_candidate]
            best_candidate = candidate_ids[best_candidate]
    
            # Permanently add best center candidate found in local tries
            if sp.issparse(X):
                centers[c] = X[best_candidate].toarray()
            else:
                centers[c] = X[best_candidate]
    
        return centers
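
    The k-means++ seeding routine can also be exercised on its own. A small sketch (k_init converts an int random_state to a RandomState instance internally, so an int seed is fine):

    X = np.random.rand(100, 2)
    seeds = k_init(X, n_clusters=4,
                   x_squared_norms=(X ** 2).sum(axis=1),
                   random_state=42)
    print(seeds.shape)  # (4, 2)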