How can I optimize my PageRank calculation?
In the book Programming Collective Intelligence I found the following function to compute the PageRank:
def calculatepagerank(self,iterations=20):
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    for i in range(iterations):
        print "Iteration %d" % i
        for (urlid,) in self.con.execute("select rowid from urllist"):
            pr=0.15
            # Loop through all the pages that link to this one
            for (linker,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
                # Get the PageRank of the linker
                linkingpr=self.con.execute("select score from pagerank where urlid=%d" % linker).fetchone()[0]
                # Get the total number of links from the linker
                linkingcount=self.con.execute("select count(*) from link where fromid=%d" % linker).fetchone()[0]
                pr+=0.85*(linkingpr/linkingcount)
            self.con.execute("update pagerank set score=%f where urlid=%d" % (pr,urlid))
        self.dbcommit()
However, this function is very slow, because it makes a lot of SQL queries in every iteration:
>>> import cProfile
>>> cProfile.run("crawler.calculatepagerank()")
2262510 function calls in 136.006 CPU seconds
Ordered by: standard name
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  136.006  136.006 <string>:1(<module>)
        1   20.826   20.826  136.006  136.006 searchengine.py:179(calculatepagerank)
       21    0.000    0.000    0.528    0.025 searchengine.py:27(dbcommit)
       21    0.528    0.025    0.528    0.025 {method 'commit' of 'sqlite3.Connection' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
  1339864  112.602    0.000  112.602    0.000 {method 'execute' of 'sqlite3.Connection' objects}
   922600    2.050    0.000    2.050    0.000 {method 'fetchone' of 'sqlite3.Cursor' objects}
        1    0.000    0.000    0.000    0.000 {range}
So I optimized the function and came up with this:
def calculatepagerank2(self,iterations=20):
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    inlinks={}
    numoutlinks={}
    pagerank={}

    for (urlid,) in self.con.execute("select rowid from urllist"):
        inlinks[urlid]=[]
        numoutlinks[urlid]=0
        # Initialize pagerank vector with 1.0
        pagerank[urlid]=1.0
        # Loop through all the pages that link to this one
        for (inlink,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
            inlinks[urlid].append(inlink)
        # get number of outgoing links from a page
        numoutlinks[urlid]=self.con.execute("select count(*) from link where fromid=%d" % urlid).fetchone()[0]

    for i in range(iterations):
        print "Iteration %d" % i
        for urlid in pagerank:
            pr=0.15
            for link in inlinks[urlid]:
                linkpr=pagerank[link]
                linkcount=numoutlinks[link]
                pr+=0.85*(linkpr/linkcount)
            pagerank[urlid]=pr

    for urlid in pagerank:
        self.con.execute("update pagerank set score=%f where urlid=%d" % (pagerank[urlid],urlid))
    self.dbcommit()
This function is a lot faster (but it needs much more memory for the temporary dictionaries), because it avoids the unnecessary SQL queries in every iteration:
>>> cProfile.run("crawler.calculatepagerank2()")
90070 function calls in 3.527 CPU seconds
Ordered by: standard name
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.004    0.004    3.527    3.527 <string>:1(<module>)
        1    1.154    1.154    3.523    3.523 searchengine.py:207(calculatepagerank2)
        2    0.000    0.000    0.058    0.029 searchengine.py:27(dbcommit)
    23065    0.013    0.000    0.013    0.000 {method 'append' of 'list' objects}
        2    0.058    0.029    0.058    0.029 {method 'commit' of 'sqlite3.Connection' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
    43932    2.261    0.000    2.261    0.000 {method 'execute' of 'sqlite3.Connection' objects}
    23065    0.037    0.000    0.037    0.000 {method 'fetchone' of 'sqlite3.Cursor' objects}
        1    0.000    0.000    0.000    0.000 {range}
But is it possible to reduce the number of SQL queries even further, to make this function even faster?

Update: Fixed the indentation in calculatepagerank2().
4 Answers
I believe most of the time is being spent on these SQL queries:
for (urlid,) in self.con.execute("select rowid from urllist"):
    ...
    for (inlink,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
        ...
    numoutlinks[urlid]=self.con.execute("select count(*) from link where fromid=%d" % urlid).fetchone()[0]
Assuming you have enough memory, you may be able to simplify this down to just two queries:

SELECT fromid,toid FROM link WHERE toid IN (SELECT rowid FROM urllist)

and

SELECT fromid,count(*) FROM link WHERE fromid IN (SELECT rowid FROM urllist) GROUP BY fromid

You could then loop through the results and build inlinks, numoutlinks and pagerank from them.
You may also benefit from using collections.defaultdict:
import collections
import itertools

def constant_factory(value):
    return itertools.repeat(value).next
The following then makes inlinks a dict of sets. Sets are appropriate here, since you only want distinct URLs:
inlinks=collections.defaultdict(set)
And this makes pagerank a dict whose default value is 1.0:
pagerank=collections.defaultdict(constant_factory(1.0))
The advantage of using collections.defaultdict is that you do not need to pre-initialize the dictionaries.
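For illustration, here is a quick interactive session (Python 2, to match the code in this answer; the urlid 42 and the linker 7 are made-up values):

>>> import collections, itertools
>>> inlinks = collections.defaultdict(set)
>>> inlinks[42].add(7)   # the key 42 springs into existence as an empty set
>>> inlinks[42]
set([7])
>>> def constant_factory(value):
...     return itertools.repeat(value).next
...
>>> pagerank = collections.defaultdict(constant_factory(1.0))
>>> pagerank[42]         # unseen urlids default to a score of 1.0
1.0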
So, put together, what I'm suggesting looks roughly like this:
import collections
import itertools

def constant_factory(value):
    return itertools.repeat(value).next

def calculatepagerank2(self,iterations=20):
    # clear out the current PageRank tables
    self.con.execute("DROP TABLE IF EXISTS pagerank")
    self.con.execute("CREATE TABLE pagerank(urlid primary key,score)")
    self.con.execute("CREATE INDEX prankidx ON pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("INSERT INTO pagerank SELECT rowid,1.0 FROM urllist")
    self.dbcommit()

    inlinks=collections.defaultdict(set)
    sql='''SELECT fromid,toid FROM link WHERE toid IN (SELECT rowid FROM urllist)'''
    for f,t in self.con.execute(sql):
        inlinks[t].add(f)

    numoutlinks={}
    sql='''SELECT fromid,count(*) FROM link WHERE fromid IN (SELECT rowid FROM urllist) GROUP BY fromid'''
    for f,c in self.con.execute(sql):
        numoutlinks[f]=c

    pagerank=collections.defaultdict(constant_factory(1.0))
    for i in range(iterations):
        print "Iteration %d" % i
        for urlid in inlinks:
            pr=0.15
            for link in inlinks[urlid]:
                linkpr=pagerank[link]
                linkcount=numoutlinks[link]
                pr+=0.85*(linkpr/linkcount)
            pagerank[urlid]=pr

    sql="UPDATE pagerank SET score=? WHERE urlid=?"
    args=((pagerank[urlid],urlid) for urlid in pagerank)
    self.con.executemany(sql, args)
    self.dbcommit()
If you have a very large database (e.g. with about as many records as there are pages on the web), then using the database the way the book suggests makes sense, because you won't be able to keep all the data in memory.
If your dataset is small enough, you can (probably) improve your second version by doing fewer queries. Try replacing your first loop with something like this:
for urlid, in self.con.execute('select rowid from urllist'):
    inlinks[urlid] = []
    numoutlinks[urlid] = 0
    pagerank[urlid] = 1.0

for src, dest in self.con.execute('select fromid, toid from link'):
    inlinks[dest].append(src)
    numoutlinks[src] += 1
This version does exactly 2 queries instead of 2n+1 (your version issues two extra queries for every URL in urllist).
I'm answering my own question, since in the end a combination of all the answers worked best for me:
def calculatepagerank4(self,iterations=20):
    # clear out the current PageRank tables
    self.con.execute("drop table if exists pagerank")
    self.con.execute("create table pagerank(urlid primary key,score)")
    self.con.execute("create index prankidx on pagerank(urlid)")

    # initialize every url with a PageRank of 1.0
    self.con.execute("insert into pagerank select rowid,1.0 from urllist")
    self.dbcommit()

    inlinks={}
    numoutlinks={}
    pagerank={}

    for (urlid,) in self.con.execute("select rowid from urllist"):
        inlinks[urlid]=[]
        numoutlinks[urlid]=0
        # Initialize pagerank vector with 1.0
        pagerank[urlid]=1.0

    for src,dest in self.con.execute("select distinct fromid, toid from link"):
        inlinks[dest].append(src)
        numoutlinks[src]+=1

    for i in range(iterations):
        print "Iteration %d" % i
        for urlid in pagerank:
            pr=0.15
            for link in inlinks[urlid]:
                linkpr=pagerank[link]
                linkcount=numoutlinks[link]
                pr+=0.85*(linkpr/linkcount)
            pagerank[urlid]=pr

    args=((pagerank[urlid],urlid) for urlid in pagerank)
    self.con.executemany("update pagerank set score=? where urlid=?", args)
    self.dbcommit()
Like allyourcode suggested, I replaced the first two loops, and like ~unutbu I used executemany(). Unlike ~unutbu, however, I pass the arguments with a generator expression, which saves memory, although using a list comprehension was a little faster. In the end, the routine is 100 times faster than the one suggested in the book:
>>> cProfile.run("crawler.calculatepagerank4()")
33512 function calls in 1.377 CPU seconds
Ordered by: standard name
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.004    0.004    1.377    1.377 <string>:1(<module>)
        2    0.000    0.000    0.073    0.036 searchengine.py:27(dbcommit)
        1    0.693    0.693    1.373    1.373 searchengine.py:286(calculatepagerank4)
    10432    0.011    0.000    0.011    0.000 searchengine.py:321(<genexpr>)
    23065    0.009    0.000    0.009    0.000 {method 'append' of 'list' objects}
        2    0.073    0.036    0.073    0.036 {method 'commit' of 'sqlite3.Connection' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        6    0.379    0.063    0.379    0.063 {method 'execute' of 'sqlite3.Connection' objects}
        1    0.209    0.209    0.220    0.220 {method 'executemany' of 'sqlite3.Connection' objects}
        1    0.000    0.000    0.000    0.000 {range}
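As an aside, the difference between the two argument styles for executemany() can be shown with a small self-contained sketch; the in-memory database and the toy pagerank dict below are made up purely for demonstration:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("create table pagerank(urlid integer primary key,score real)")
con.executemany("insert into pagerank values (?,?)", [(1, 1.0), (2, 1.0)])
pagerank = {1: 0.4, 2: 2.6}

# Generator expression: rows are produced lazily, one at a time, so no
# second copy of all the (score, urlid) pairs is built up in memory.
args = ((pagerank[urlid], urlid) for urlid in pagerank)
con.executemany("update pagerank set score=? where urlid=?", args)

# List comprehension: materializes all rows up front; this needs more
# memory but avoids the per-row generator overhead, so it is slightly faster.
args = [(pagerank[urlid], urlid) for urlid in pagerank]
con.executemany("update pagerank set score=? where urlid=?", args)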
A few more things to be aware of:
- If you build the SQL statements with string formatting (%f) instead of using the ? placeholder, you lose precision, because %f rounds to six decimal places (e.g. I got 2.9796095721920315 with ? but 2.9796100000000001 with %f).
- In the default PageRank algorithm, multiple links from one page to another are counted as a single link; the book's solution does not take this into account.
- The whole algorithm in the book is flawed: the pagerank scores are not stored in a second table at each iteration, so the result of each iteration depends on the order in which the pages are traversed, which may change the final result considerably after several iterations. To fix this, either use an additional table or dictionary to store the pagerank of the next iteration (see the sketch after this list), or use a completely different algorithm such as power iteration.
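To make the last point concrete, here is a minimal standalone sketch (my own, not from the book) of the iteration loop with a second dictionary; iterate_pagerank and its arguments are hypothetical stand-ins for the dictionaries built above:

def iterate_pagerank(inlinks, numoutlinks, iterations=20):
    # inlinks: urlid -> list of urlids linking to it (one entry per urlid;
    # every urlid that appears as a linker is assumed to have its own entry)
    # numoutlinks: urlid -> number of outgoing links of that urlid
    pagerank = dict((urlid, 1.0) for urlid in inlinks)
    for i in range(iterations):
        nextrank = {}
        for urlid in pagerank:
            pr = 0.15
            for link in inlinks[urlid]:
                # read only the scores from the *previous* iteration
                pr += 0.85 * pagerank[link] / numoutlinks[link]
            nextrank[urlid] = pr
        pagerank = nextrank  # swap in the new scores for the next round
    return pagerank

Because every iteration reads only the previous iteration's scores, the result no longer depends on the order in which the pages are traversed.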