Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging 3rd party libraries. It collects links to all the places you might be looking at while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

test_k_means_fit_predict failing on some MacPython runs

See original GitHub issue

KMeans fit_predict(X) != fit(X).predict(X) in several cases in the following Travis CI jobs:

https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223437 https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223439

_________ test_k_means_fit_predict[0-2-1e-07-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 0, max_iter = 2
tol = 1e-07
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 80.0%)
E            x: array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
E                  2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0, 2, 0, 9, 1, 9, 4, 3, 1, 5, 4, 1,
E                  6, 3, 5, 9, 3, 9, 5, 4, 8, 2, 2, 0, 5, 7, 3, 7, 4, 9, 8, 6, 9, 0, 6,...
E            y: array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
E                  3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1, 3, 1, 8, 2, 8, 5, 4, 2, 0, 5, 2,
E                  6, 4, 0, 8, 4, 8, 0, 5, 9, 3, 3, 1, 0, 7, 4, 7, 5, 8, 9, 6, 8, 1, 6,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=2,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=1e-07, verbose=0)
labels_1   = array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
       2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0,...3, 5, 1, 3, 3, 2, 3, 5, 4, 8, 8, 0, 8, 1, 7, 3, 6, 2, 2, 6, 3, 3,
       3, 3, 8, 3, 7, 9, 8, 9, 5, 4, 2], dtype=int32)
labels_2   = array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
       3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1,...4, 0, 2, 4, 4, 3, 4, 0, 5, 9, 9, 1, 9, 2, 7, 4, 6, 3, 3, 6, 4, 4,
       4, 4, 9, 4, 7, 8, 9, 8, 0, 5, 3], dtype=int32)
max_iter   = 2
rng        = <mtrand.RandomState object at 0x114933ea0>
seed       = 0
tol        = 1e-07
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError
_________ test_k_means_fit_predict[4-300-0.1-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 4, max_iter = 300
tol = 0.1
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 100.0%)
E            x: array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
E                  3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0, 6, 4, 6, 0, 4, 4, 9, 6, 1, 2, 0,
E                  2, 5, 1, 4, 9, 1, 5, 3, 9, 6, 6, 9, 9, 8, 7, 1, 6, 2, 7, 0, 9, 1, 3,...
E            y: array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
E                  7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2, 4, 3, 4, 2, 3, 3, 1, 4, 5, 8, 2,
E                  8, 6, 5, 3, 1, 5, 6, 7, 1, 4, 4, 1, 1, 0, 9, 5, 4, 8, 9, 2, 1, 5, 7,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=4, tol=0.1, verbose=0)
labels_1   = array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
       3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0,...0, 0, 8, 5, 8, 2, 4, 7, 3, 3, 6, 8, 5, 7, 1, 2, 7, 1, 4, 9, 9, 5,
       4, 2, 2, 7, 5, 9, 8, 4, 9, 0, 1], dtype=int32)
labels_2   = array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
       7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2,...2, 2, 0, 6, 0, 8, 3, 9, 7, 7, 4, 0, 6, 9, 5, 8, 9, 5, 3, 1, 1, 6,
       3, 8, 8, 9, 6, 1, 0, 3, 1, 2, 5], dtype=int32)
max_iter   = 300
rng        = <mtrand.RandomState object at 0x1141c9708>
seed       = 4
tol        = 0.1
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError

Issue Analytics

State:
Created 5 years ago
Comments: 22 (22 by maintainers)

Top GitHub Comments

1 reaction

jeremiedbb commented, Nov 22, 2018

I don’t think this is the case: there is an explicit n_jobs=1.

I missed that… Another confirmation that this is not the cause is that the test only fails with algorithm="full", if I saw correctly.

The difference between “full” and “elkan” that can cause this issue is that “full” uses the fast method to compute Euclidean distances, the one with the precision issue. The inertia returned by the current implementation of k-means can differ strongly from the exact inertia. This can lead to different inertia values being computed for labelings that differ only by a permutation of the labels.

0 reactions

jeremiedbb commented, Mar 22, 2022

There have been improvements to the stability of the algorithm. The skip was removed a while ago and we’ve not seen failures since then. Closing (definitively, I hope 😃)