Python中的Clauset-Newman-Moore社区检测算法
仅供参考:这不是一个作业任务
我尝试在Python中实现Clauset-Newman-Moore社区检测算法,虽然程序可以运行,但它输出的模块度(Q值)总是有些偏差。完整的代码在下面,它接受一种文本文件格式,类似于这个。
我并不指望有人能给出一个完整的解决方案来修复所有代码,但如果能给我一些可能出错的提示,我将非常感激!!
一些可能重要的说明:
- Python的heapq模块使用的是最小堆,而这个算法需要最大堆,所以Q值存储为-Q。
- 出于同样的原因,Q值的算术运算也进行了交换(用减法代替加法等等)。
#!/usr/bin/env python
'''
Usage:
python cnm.py <input_file> <output_file>
'''
import heapq
import sys
import time
from pprint import pprint
def read_input(filename):
''' Loads the input into a dictionary'''
output_dict = {}
with open(filename, 'r') as f:
for line in f:
l = line.split('\t')
key = int(l[0].strip())
value = int(l[1].strip())
if key not in output_dict:
output_dict[key] = []
if value not in output_dict:
output_dict[value] = []
if value in output_dict[key]:
pass
else:
output_dict[key].append(value)
if key in output_dict[value]:
pass
else:
output_dict[value].append(key)
return output_dict
def calculate_m(input_dict):
''' Gives the total number of edges in the network. '''
total = 0
for key in input_dict:
total += len(input_dict[key])
return total / 2
def calculate_deltaQ(m, ki, kj):
''' Calculates deltaQ for two communities i and j '''
deltaQ = 1.0/(2.0*m) - float(ki*kj) / ((2*m)**2)
return deltaQ
def populate_Qtrees(input_dict, m):
Qtrees = {}
for i in input_dict:
community = input_dict[i]
ki = len(community)
Qtrees[i] = {}
for j in community:
kj = len(input_dict[j])
Qtrees[i][j] = calculate_deltaQ(m, ki, kj)
return Qtrees
def populate_Qheaps(input_dict, m):
Qheaps = {}
for key in input_dict:
community = input_dict[key]
ki = len(community)
Qheaps[key] = []
for i in community:
kj = len(input_dict[i])
deltaQ = calculate_deltaQ(m, ki, kj)
# we store the items in the heap as their negative values because
# python heap is a min-heap
heapq.heappush(Qheaps[key], (-deltaQ, i, key))
return Qheaps
def populate_H(Qheaps):
H = []
for key in Qheaps:
if Qheaps[key] == []:
continue
else:
maximum = Qheaps[key][0]
heapq.heappush(H, maximum)
return H
def populate_a(input_dict, m):
a = {}
for key in input_dict:
k = len(input_dict[key])
ai = float(k) / (2.0 * m)
a[key] = ai
return a
def select_largest_q(H):
return heapq.heappop(H)
def update_Qtrees(Qtrees, a, i, j):
# from equation 10a - summing i into j
for key in Qtrees[i]:
if key in Qtrees[j]:
Qtrees[j][key] = Qtrees[i][key] - Qtrees[j][key]
# from equations 10b and 10c - update row j
for key in Qtrees:
if key in Qtrees[i] and key not in Qtrees[j]:
Qtrees[j][key] = Qtrees[i][key] + (2 * a[j] * a[key])
elif key in Qtrees[j] and key not in Qtrees[i]:
Qtrees[j][key] = Qtrees[j][key] + (2 * a[i] * a[key])
# remove i key and update j for each row k
for key in Qtrees:
if i in Qtrees[key]:
Qtrees[key].pop(i, None)
if j in Qtrees[key]:
Qtrees[key][j] = Qtrees[key][j] + (2 * a[i] * a[key])
# remove the self-reference (necessary because our tree is a python dict)
if j in Qtrees[j]:
Qtrees[j].pop(j, None)
# remove i
Qtrees.pop(i, None)
return Qtrees
def update_Qheaps(Qtrees, Qheaps, i, j):
# remove the heap i
Qheaps.pop(i, None)
# rebuild the jth heap from the jth binary tree in Qtree
community = Qtrees[j]
h = [ (community[key], key, j) for key in community ] # list comprehension
heapq.heapify(h)
Qheaps[j] = h
# remove the ith and update the jth element in each heap
for key in Qheaps:
heap = Qheaps[key]
for item in heap[:]:
if item[1] == i:
heap.remove(item)
heapq.heapify(heap)
elif item[1] == j:
# we temporarily change the item to a list to perform insertion
# (tuples are immutable)
item_copy = list(item)
heap.remove(item)
item_copy[0] = Qtrees[key][j]
heapq.heappush(heap, tuple(item_copy))
return Qheaps
def update_a(a, i, j):
a[j] += a[i]
a[i] = 0
return a
def main():
''' Main loop of the program. '''
# read command line input
filename = sys.argv[1]
maxQ = 0
max_step = 0
Q = 0
input_dict = read_input(filename)
m = calculate_m(input_dict)
nodes = len(input_dict)
Qtrees = populate_Qtrees(input_dict, m)
Qheaps = populate_Qheaps(input_dict, m)
H = populate_H(Qheaps)
a = populate_a(input_dict, m)
step = 0
print 'i', '\t', 'j', '\t', 'Q', '\t\t', 'deltaQ', '\t\t', 'step'
while H:
deltaQ, i, j = select_largest_q(H)
Q -= deltaQ
Qtrees = update_Qtrees(Qtrees, a, i, j)
Qheaps = update_Qheaps(Qtrees, Qheaps, i, j)
H = populate_H(Qheaps)
a = update_a(a, i, j)
step += 1
print i, '\t', j, '\t', round(Q, 7), '\t', round(deltaQ, 7), '\t', step
if deltaQ < 0:
maxQ = deltaQ
max_step = step
else:
pass
output_file = sys.argv[2]
with open(output_file, 'w+') as f:
f.write(
'''FASTCOMMUNITY_INFERENCE_ALGORITHM in python!
START-----: {0}
---NET_STATS----
NUMNODES--: {1}
NUMEDGES--: {2}
---MODULARITY---
MAXQ------: {3}
STEP------: {4}
EXIT------: {5}'''.format(time.asctime(),
nodes,
m,
maxQ,
max_step,
time.asctime() ))
if __name__ == '__main__':
main()
3 个回答
0
我觉得你的代码 -
要在每一行k中移除i键,并更新j
for key in Qtrees:
if i in Qtrees[key]:
Qtrees[key].pop(i, None)
if j in Qtrees[key]:
Qtrees[key][j] = Qtrees[key][j] + (2 * a[i] * a[key])
更新Qtree是个问题,因为在论文中并没有提到要这样写代码 -
如果j在Qtrees[key]中:
Qtrees[key][j] = Qtrees[key][j] + (2 * a[i] * a[key])
你应该把这段代码删掉,然后再试试。
1
我有一个担心的事情:
heap.remove(item)
item_copy[0] = Qtrees[key][j]
heapq.heappush(heap, tuple(item_copy))
heap.remove(item) 这个操作会把叫做 heap 的列表中的 item 移除掉,这样就破坏了堆的特性。
换句话说,在这个步骤之后,你的 heap 变量可能就不再是一个堆了。
也许在执行 heap.remove(item) 之后,调用一下
heapq.heapify(heap)
会有所帮助。
1
看起来你在做很多堆操作。也许等你解决了当前的问题后,直接用一个平衡的二叉搜索树会更好。慢一点但能正常工作的代码,总比快但不管用的代码要好。
- Python的heapq模块提供的是一个最小堆,但我这个算法需要的是最大堆,所以我把Q的值存成了-Q。
- 出于同样的原因,Q值的算术运算也进行了调整(用减法代替加法等等)。
第二个问题是由第一个问题引起的。如果Q存成了-Q,我们就叫它q。那么A + q实际上是A + (-Q),也就是A + Q。这可能就是你遇到的问题。最好的解决办法可能是用-Q来反转符号的变化。