NetworkX 图的创建速度很慢

# Build one edge per unordered pair of columns of `dtm` (presumably a
# document-term matrix -- confirm against the caller) and attach, as the
# 'docs' edge attribute, the number of rows in which both columns are non-zero.
# `dtm`, `graph` and `nx` are expected to be defined before this point.
columns = list(dtm.columns)

# Evaluate the non-zero mask once for the whole frame instead of recomputing
# it for both columns of every pair.
nonzero = dtm != 0

edges = []
edges_attrs = {}
for i, key in enumerate(columns):
    # Pair `key` only with the columns that come after it: this skips the
    # same node and avoids double arches, since (u, v) == (v, u).
    for key1 in columns[i + 1:]:
        # Number of rows where both columns are non-zero.
        docs = int((nonzero[key] & nonzero[key1]).sum())
        edges.append((key, key1))
        edges_attrs[(key, key1)] = {'docs': docs}

graph.add_edges_from(edges)
nx.set_edge_attributes(graph, edges_attrs)

1条回答

网友

1楼 · 发布于 2024-04-23 10:37:38

不要使用 for 循环。先了解数据库中的内连接（inner join）和外连接（outer join），任何 SQL 入门课程都会讲到这些概念。掌握之后，把它们应用到 pandas 的 DataFrame 上就非常简单了：

#!/usr/bin/env python
"""Count term co-occurrences across documents with a pandas self-join.

A boolean document/term table is turned into a long (document, term)
listing, inner-joined with itself on the document, and grouped by term pair.

https://stackoverflow.com/q/62406586/2912349
"""
import numpy as np
import pandas as pd

# Simulate some data: rows are documents, columns are terms, and a cell is
# True when the term occurs in the document. The draw is random and unseeded,
# so the tables shown in the comments below are the output of one example run.
x = pd.DataFrame(np.random.normal(0, 1, (4,4)), index=['a', 'b', 'c', 'd'], columns=['e', 'f', 'g', 'h'])
# Rebind instead of assigning through x[:]: writing booleans into the float64
# frame in place depends on an implicit dtype upcast that pandas has deprecated.
x = x > 0

#        e      f      g      h
# a  False  False   True  False
# b  False  False  False   True
# c   True   True   True   True
# d  False   True   True   True

# One row per True cell, in row-major order. The positions are read straight
# from the boolean array so the result does not depend on stack() dropping
# the NaN entries produced by masking.
doc_pos, term_pos = np.nonzero(x.to_numpy())
sparse = pd.DataFrame({'Documents': x.index[doc_pos], 'Terms': x.columns[term_pos]})

#   Documents Terms
# 0         a     g
# 1         b     h
# 2         c     e
# 3         c     f
# 4         c     g
# 5         c     h
# 6         d     f
# 7         d     g
# 8         d     h

# Inner self-join on the document: every ordered pair of terms (including a
# term paired with itself) that appear in the same document.
cooccurrences = pd.merge(sparse, sparse, how='inner', on='Documents')

#    Documents Terms_x Terms_y
# 0          a       g       g
# 1          b       h       h
# 2          c       e       e
# 3          c       e       f
# 4          c       e       g
# 5          c       e       h
# 6          c       f       e
# 7          c       f       f
# 8          c       f       g
# 9          c       f       h
# 10         c       g       e
# 11         c       g       f
# 12         c       g       g
# 13         c       g       h
# 14         c       h       e
# 15         c       h       f
# 16         c       h       g
# 17         c       h       h
# 18         d       f       f
# 19         d       f       g
# 20         d       f       h
# 21         d       g       f
# 22         d       g       g
# 23         d       g       h
# 24         d       h       f
# 25         d       h       g
# 26         d       h       h

# remove self loops and repeat pairings such as the second tuple in (u, v), (v, u)
# (a strict ordering on the term labels keeps exactly one of each pair)
valid = cooccurrences['Terms_x'] > cooccurrences['Terms_y']
valid_cooccurrences = cooccurrences[valid]

#    Documents Terms_x Terms_y
# 6          c       f       e
# 10         c       g       e
# 11         c       g       f
# 14         c       h       e
# 15         c       h       f
# 16         c       h       g
# 21         d       g       f
# 24         d       h       f
# 25         d       h       g

# Number of documents in which each term pair co-occurs.
counts = valid_cooccurrences.groupby(['Terms_x', 'Terms_y']).count()

#                  Documents
# Terms_x Terms_y
# f       e                1
# g       e                1
#         f                2
# h       e                1
#         f                2
#         g                2

# The set of documents behind each count. The lambda parameter is named
# `docs` so it does not shadow the module-level frame `x`.
documents = valid_cooccurrences.groupby(['Terms_x', 'Terms_y']).aggregate(lambda docs: set(docs))

#                 Documents
# Terms_x Terms_y
# f       e             {c}
# g       e             {c}
#         f          {d, c}
# h       e             {c}
#         f          {d, c}
#         g          {d, c}

相关问题更多 >

编程相关推荐

热门问题

热门文章