数据帧的层次聚类丢失一行,之后如何设置索引?

2024-05-29 06:00:56 发布

您现在位置:Python中文网/ 问答频道 /正文

如何从 scipy.cluster.hierarchy.linkage 的结果中得到各个聚类(簇)?

# Example setup: parse a small whitespace-delimited table and compute its
# hierarchical-clustering linkage matrix.
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: the StringIO module moved into io

import pandas as pd
from scipy.cluster.hierarchy import linkage

s = """Time                              0          3          6          9
chr10_101291200_101294799  11.971746  11.350644  11.288784  10.182320   
chr10_102570400_102570999  -1.000000  -1.000000   8.646441   6.977979   
chr10_103534000_103535599  -1.000000  -1.000000   9.745016   8.296470   
chr10_104174000_104178599  -1.000000  -1.000000  10.946175  10.527323   
chr10_105927000_105928999  11.249397  11.149052  10.047420   9.780421"""

# Raw string so that ``\s`` reaches the regex engine untouched instead of being
# treated as an (invalid) string escape sequence.
df = pd.read_table(StringIO(s), sep=r"\s+", header=0, index_col=0)

# Each linkage row describes one merge: the ids of the two merged clusters,
# their distance, and the number of original observations in the new cluster.
ldf = linkage(df, metric="correlation")
ldf = pd.DataFrame(ldf)  

#    0  1         2  3
# 0  1  2  0.000488  2
# 1  3  5  0.002935  3
# 2  0  4  0.183986  2
# 3  6  7  1.631157  5

Tags: from, import, pandas, df, hierarchy, time, as, 集群
1条回答
网友
1楼 · 发布于 2024-05-29 06:00:56
from collections import defaultdict

def get_sets_from_linkage_matrix(original_index, linkage_matrix):
    """Map every cluster id of a linkage matrix to the labels it contains.

    Ids ``0 .. n-1`` are the singleton clusters for the ``n`` entries of
    *original_index*. Merge step ``i`` of the linkage matrix creates cluster
    ``n + i`` as the union of the two clusters named in its first two columns.

    Args:
        original_index: Iterable of labels for the original observations
            (e.g. ``df.index``); the position of a label is its cluster id.
        linkage_matrix: The linkage result, either as a plain 2-D array /
            sequence of rows or wrapped in a ``pandas.DataFrame``. Only the
            first two entries of each row (the merged cluster ids) are read.

    Returns:
        A ``defaultdict(set)`` mapping cluster id -> set of original labels.
    """
    clusters = defaultdict(set)
    for ix, label in enumerate(original_index):
        clusters[ix].add(label)

    # A DataFrame exposes its data through ``.values``; anything else is used
    # as is. Reading rows positionally avoids the trap where ``matrix[0]`` is
    # a *column* for a DataFrame but a *row* for an ndarray.
    rows = getattr(linkage_matrix, "values", linkage_matrix)

    # New cluster ids continue right after the ids of the original labels.
    for new_id, row in enumerate(rows, len(clusters)):
        left, right = int(row[0]), int(row[1])
        clusters[new_id].update(clusters[left])
        clusters[new_id].update(clusters[right])

    return clusters

# Resolve every cluster id in the linkage result to the row labels it contains.
sets = get_sets_from_linkage_matrix(original_index=df.index,
                                    linkage_matrix=ldf)
print(sets)

# defaultdict(set,
#             {0: {'chr10_101291200_101294799'},
#              1: {'chr10_102570400_102570999'},
#              2: {'chr10_103534000_103535599'},
#              3: {'chr10_104174000_104178599'},
#              4: {'chr10_105927000_105928999'},
#              5: {'chr10_102570400_102570999', 'chr10_103534000_103535599'},
#              6: {'chr10_102570400_102570999',
#               'chr10_103534000_103535599',
#               'chr10_104174000_104178599'},
#              7: {'chr10_101291200_101294799', 'chr10_105927000_105928999'},
#              8: {'chr10_101291200_101294799',
#               'chr10_102570400_102570999',
#               'chr10_103534000_103535599',
#               'chr10_104174000_104178599',
#               'chr10_105927000_105928999'}})

相关问题 更多 >

    热门问题