How do I set a timeout in a web crawler?

Posted 2024-05-16 23:46:56


I am very new to Python and am trying to write a very simple web crawler. It mostly works, but it sometimes gets stuck on a single link for a very long time. How do I set a timeout for the request?

Also, how should I handle urllib2.HTTPError, and is my except statement correct?
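For the timeout part, from the docs it looks like urllib2.urlopen accepts a timeout argument given in seconds, so I am guessing the call should look roughly like the snippet below (the 10 seconds is just an arbitrary value I picked):

import urllib2

# urlopen gives up after `timeout` seconds instead of blocking forever
page_source = urllib2.urlopen("http://google.com", timeout=10)
html = page_source.read()

Is that the right way to use it? The rest of my crawler is below.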

def get_link(page):
    # find the next '<a href=' in the HTML and return (url, end_position),
    # or (None, 0) when there are no more links
    start = page.find('<a href=')
    if start==-1:
        return None,0
    startp=page.find('"',start)
    endp=page.find('"',startp+1)
    url=page[startp+1:endp]
    return url,endp

def get_all_link(page):
    # collect every link on the page by repeatedly calling get_link
    allurl = []
    while True:
        url,endp=get_link(page)
        if url:
            page=page[endp:]
            allurl.append(url)
        else:
            return allurl

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        # this is the part I am unsure about: I want to skip the bad link,
        # but I do not know if popping tocrawl and re-raising is right
        page = tocrawl.pop()
        raise

def validate(page):
    # crude check that the string looks like a URL
    valid = page.find('http')
    if valid == -1:
        return 0
    return 1


def crawler(seed):
    tocrawl = [seed]   # URLs still to visit
    crawled = []       # URLs already fetched
    i=0

    while tocrawl:
        page=tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                # fetch the page and queue every link found on it
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page,tocrawl)))
                crawled.append(page)
                i=i+1
                # log each crawled URL with its running count
                f = open("crawled.txt","a")
                f.write(repr(i)+" : "+repr(page)+"\n")
                f.close()
    return crawled

crawler("http://google.com")

Tags: url, get, return, if, def, page, link, find