为什么在 Python 中,SSE 会在某个特定的聚类数量下增大?(K-means 聚类)

2024-05-13 03:33:19 发布

您现在位置:Python中文网/ 问答频道 /正文

我使用具有不同标签数组的相同数据集计算SSE,代码如下:

import sklearn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import os
import pandas as pd
import xlrd
import pickle
import csv
from numpy import savetxt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Load the exported spreadsheets of every cluster and build two aligned
# arrays: `acabou_valor` (the cell values, one column per file read for a
# cluster) and `acabou_label` (the 1-based cluster id of every row).
n_cluster = 8
final = []
array_final = []  # value columns collected for the cluster currently being read
label = []        # one array of cluster ids per cluster
acabou = []       # one stacked value array per cluster

for c in range(0, n_cluster):

    # n is presumably the band number encoded in the file name -- TODO confirm.
    for n in range(2, 7):
        # The with-block closes each workbook as soon as its sheet has been
        # read, so the 40 files (8 clusters x 5 values of n) do not all stay
        # open until the interpreter exits.
        with pd.ExcelFile("C:/Users/guilh/Desktop/SENSORIAMENTO/SENTINEL_2/{}/Cluster_{}/MEDIANA_BANDA_CLUSTER{}{}.xlsx".format(n_cluster,c,n,c)) as xlsx:
            df = pd.read_excel(xlsx, 'Sheet1', keep_default_na = False)

        one_column_array = df.to_numpy().flatten(order="A")
        # Keep only truthy cells. This is presumably meant to drop blank
        # cells; NOTE(review): a numeric 0 would be dropped as well -- confirm
        # that cannot occur in the data.
        one_column_array = [var for var in one_column_array if var]
        final = one_column_array
        # Values are handled as text here: decimal commas become dots so the
        # strings can be converted to floats later.
        array_final.append(np.char.replace(final, ',', '.'))

    # One row per kept cell, one column per file read for this cluster.
    # np.stack requires every column to have the same length.
    valores_cluster = np.stack(array_final, axis=-1)
    acabou.append(valores_cluster)

    # Build an array holding this cluster's number (1-based) for every row,
    # sized from the stacked array so labels and values always line up.
    label.append(np.full(valores_cluster.shape[0], c + 1, dtype=np.int64))
    array_final = []

acabou_valor = np.concatenate(acabou)
acabou_label = np.concatenate(label)

# COMPUTING SSE

# BANDA[j] is column j of acabou_valor converted to floats. The builtin
# `float` is used because the `np.float` alias was removed in NumPy 1.24;
# both produce float64.
BANDA = [acabou_valor[:, j].astype(float) for j in range(0, 5)]

# SSE for each cluster in each band.
# `soma` is a flat list ordered cluster by cluster, with five consecutive
# entries (one per column of BANDA) for every cluster.
soma = []
for cluster_id in range(1, n_cluster + 1):
    # Rows that belong to this cluster (labels are 1-based).
    pertence = acabou_label == cluster_id
    for valores_banda in BANDA:
        valores = valores_banda[pertence]
        # Sum of squared deviations from the mean of this cluster's values.
        soma.append(np.square(valores - np.mean(valores)).sum())

# Regroup the flat `soma` list into eight consecutive runs of five entries
# (indices 0-4, 5-9, ..., 35-39), one run per cluster. Indexing item by item
# keeps the IndexError if `soma` holds fewer than 40 values.
grupos = [
    [soma[posicao] for posicao in range(inicio, inicio + 5)]
    for inicio in range(0, 40, 5)
]
cu, cu2, cu3, cu4, cu5, cu6, cu7, cu8 = grupos

# Each clusterN is a one-element list holding the sum of that cluster's
# five entries.
cluster1 = [sum(cu)]
cluster2 = [sum(cu2)]
cluster3 = [sum(cu3)]
cluster4 = [sum(cu4)]
cluster5 = [sum(cu5)]
cluster6 = [sum(cu6)]
cluster7 = [sum(cu7)]
cluster8 = [sum(cu8)]

todos_clusters = (
    cluster1, cluster2, cluster3, cluster4,
    cluster5, cluster6, cluster7, cluster8,
)

# Total SSE: element-wise sum across the eight per-cluster lists.
SSE_KMEANS = [sum(parcelas) for parcelas in zip(*todos_clusters)]

# Print the total first, then each cluster's value, one list per line.
for resultado in (SSE_KMEANS,) + todos_clusters:
    print(resultado)

一切都很好,SSE随着K的上升而下降(如预期),直到我尝试用K=8计算它,它相对于K=7增加了近50。我检查了代码好几次,看看是否有任何计算错误,但没有找到任何东西。有人能帮我吗?

代码的第一部分用于从转换为.xls的栅格数据创建数据集,我甚至能够用它计算每个聚类的轮廓系数,所以我认为那里一切都很好。第二部分用于计算SSE(误差平方和)。

K-means 聚类已经在 GEE 上完成了,我只是在 Python 中处理导出的数据。

我正在处理 Sentinel-2 图像的聚类结果;通过 Spyder 运行 Python;Python 版本 3.8.3


Tags: in import for np range sklearn array sse