Getting all URLs from a website with Python

Published 2024-04-23 07:25:27


I'm learning to build web crawlers and am currently working on getting all the URLs from a site. I've been playing around with it and no longer have the same code I had before, but I have been able to get all the links. My problem is the recursion: I need to do the same thing over and over for every page, and I think the recursion is where it goes wrong, even though it does exactly what the code I wrote tells it to. My code is below.

#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    page = urllib2.urlopen( url ).read()
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        length = len(urlList)

        for url in urlList:
            getAllUrl(url)

        return urlList
    except urllib2.HTTPError, e:
        print e

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    for x in urls:
        print x

What I'm trying to accomplish is to get all the URLs for a site. With the program's current setup it just runs until it exhausts memory; all I want is the URLs from a single site. Does anyone have an idea how to do this? I think I have the right approach and just need a few small changes to the code.
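(A side note on the "single site" requirement: comparing the host part of each link with urlparse is a stricter check than looking for 'http://' in the href. The sameSite helper below is only an illustrative sketch built around the bobthemac.com domain used in the code above.)

import urlparse

def sameSite(link, base='http://bobthemac.com'):
    # A link belongs to the site if its host matches the base host,
    # or if it is relative (empty host) and will be joined onto the base.
    host = urlparse.urlparse(link).netloc
    return host == '' or host == urlparse.urlparse(base).netloc

print sameSite('/about.html')                   # True  - relative link
print sameSite('http://bobthemac.com/contact')  # True  - same host
print sameSite('http://example.com/elsewhere')  # False - external link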

EDIT

Below is my working code that gets all the URLs of a site; some people may find it useful. It's not the best code and needs some work, but with a bit more effort it could be quite good.

#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    urlList = []
    try:
        page = urllib2.urlopen( url ).read()
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        return urlList

    except urllib2.HTTPError, e:
        urlList.append( e )

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')

    fullList = []

    for x in urls:
        listUrls = getAllUrl(x)
        try:
            for i in listUrls:
                if not i in fullList:
                    fullList.append(i)
        except TypeError, e:
            print 'Woops wrong content passed'

    for i in fullList:
        print i

2 answers

I think this works:

#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    try:
        page = urllib2.urlopen( url ).read()
    except:
        return []
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin(url, anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin(url, anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        length = len(urlList)

        return urlList
    except urllib2.HTTPError, e:
        print e

def listAllUrl(urls):
    # iterate over a copy so that removing items from urls is safe
    for x in urls[:]:
        print x
        urls.remove(x)
        urls_tmp = getAllUrl(x)
        for y in urls_tmp:
            urls.append(y)


if __name__ == "__main__":
    urls = ['http://bobthemac.com']
    while len(urls) > 0:
        listAllUrl(urls)

In your getAllUrl function you call getAllUrl again inside the for loop; that is what makes it recursive.

Once an element has been put into urlList it is never removed, so urlList never becomes empty and the recursion never terminates.

That is why your program never finishes until it runs out of memory.
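One way to make the crawl terminate is to keep a separate set of URLs that have already been fetched and only follow links that have not been seen yet. The sketch below is only illustrative: the crawlSite name is made up, and it assumes the same urllib2/BeautifulSoup setup and the bobthemac.com start page from the question.

#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def crawlSite(startUrl):
    visited = set()       # every URL that has already been fetched
    queue = [startUrl]    # URLs still waiting to be fetched
    while queue:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        try:
            page = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, urllib2.URLError):
            continue
        soup = BeautifulSoup(page)
        for anchor in soup.findAll('a', href=True):
            # resolve relative links against the page they appear on
            link = urlparse.urljoin(url, anchor['href'])
            # stay on the one site the question is about
            if link.startswith('http://bobthemac.com') and link not in visited:
                queue.append(link)
    return visited

if __name__ == "__main__":
    for u in crawlSite('http://bobthemac.com'):
        print u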
