在python中处于非活动状态后重新启动函数

"""Scrape the kipa.co.il question listing into one .xlsx workbook per page.

Run as a script: it asks for the first listing page number, then writes
``kipa<N>.xlsx`` for every page from that number up to ``LAST_PAGE``.
Every HTTP request carries a timeout and is retried a few times, so a
stalled connection can no longer freeze the run indefinitely.
"""
import re
import socket
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the urlopen/URLError names used below from this module.
    import urllib.request as urllib2

SITE_ROOT = "http://www.kipa.co.il/"
LISTING_URL = SITE_ROOT + "ask/page/"
LAST_PAGE = 6760       # exclusive upper bound of the listing pages to export
REQUEST_TIMEOUT = 30   # seconds to wait on a blocking network operation
REQUEST_ATTEMPTS = 3   # tries per URL before a timeout is allowed to propagate


def fetch_page(url, timeout=REQUEST_TIMEOUT, attempts=REQUEST_ATTEMPTS):
    """Download *url* and return its body decoded as UTF-8.

    A request that times out is retried until *attempts* tries have been
    made; the last timeout is re-raised. Any other error propagates at once.

    Raises:
        ValueError: if *attempts* is smaller than 1.
    """
    if attempts < 1:
        raise ValueError('attempts must be at least 1')
    for remaining in range(attempts - 1, -1, -1):
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except socket.timeout:
            if not remaining:
                raise
        except urllib2.URLError as error:
            # A timeout while connecting may arrive wrapped in URLError;
            # only that case is worth retrying.
            timed_out = isinstance(getattr(error, 'reason', None),
                                   socket.timeout)
            if not timed_out or not remaining:
                raise


def get_next_target(page):
    """Locate the next clickable listing row in *page*.

    Returns:
        ``(chunk, end)``: *chunk* is the site root joined to everything
        between the first single quote of the row's ``openurl('...')`` call
        and the closing ``</tr`` (the relative URL followed by the rest of
        the row markup); *end* is the index of that ``</tr``. Returns
        ``(None, 0)`` when no complete row is left.
    """
    start_link = page.find('<tr onclick=' + '"openurl(')
    if start_link == -1:
        return None, 0
    start_quote = page.find("'", start_link)
    if start_quote == -1:
        return None, 0
    end_quote = page.find("</tr", start_quote + 1)
    if end_quote == -1:
        # Truncated row: stop instead of returning a partial chunk.
        return None, 0
    return SITE_ROOT + page[start_quote + 1:end_quote], end_quote


def get_all_links(page):
    """Return the chunk of every listing row in *page*, in document order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:
            return links
        links.append(url)
        page = page[endpos:]


def parse_row(extract):
    """Split one chunk from ``get_all_links`` into its listing fields.

    Returns:
        ``(url, date, rabbi, title)``: the text before the first single
        quote, the contents of the first and second ``<td>`` cells, and the
        text that follows the next ``">`` after those cells.
    """
    url = extract[0:extract.find("'", 0)]

    cell_start = extract.find(">", extract.find('<td'))
    cell_end = extract.find("</td>", cell_start + 1)
    date = extract[cell_start + 1:cell_end]

    cell_start = extract.find(">", extract.find('<td', cell_end))
    cell_end = extract.find("</td>", cell_start + 1)
    rabbi = extract[cell_start + 1:cell_end]

    text_start = extract.find(">", extract.find('">', cell_end))
    text_end = extract.find("<", text_start + 1)
    title = extract[text_start + 1:text_end]
    return url, date, rabbi, title


def split_qa(qa):
    """Split tag-free page text into ``(question, answer)``.

    The question is the text before the ``show_banner`` marker and the
    answer is the text after the first ``);`` that follows the marker.
    Without the marker the whole text is returned as the question; without
    the ``);`` the answer is empty.
    """
    marker = qa.find('show_banner')
    if marker == -1:
        return qa, ''
    closing = qa.find(");", marker)
    answer = qa[closing + 2:] if closing != -1 else ''
    return qa[:marker], answer


def clear_line(page):
    """Drop everything between ``<`` and ``>`` from *page*, then split it.

    Returns:
        The ``(question, answer)`` pair produced by ``split_qa``.
    """
    kept = []
    inside_tag = False
    for char in page:
        if char == '<':
            inside_tag = True
        if not inside_tag:
            kept.append(char)
        if char == '>':
            inside_tag = False
    return split_qa(''.join(kept))


def get_content(url):
    """Fetch the question page at *url* and return ``(question, answer)``.

    Returns ``(None, None)`` when the page has no ``<p class="padd10">``
    block, so both cells are left blank in the workbook.
    """
    page = fetch_page(url)
    start_link = page.find('<p class="padd10">')
    if start_link == -1:
        return None, None
    start_quote = page.find("<strong>", start_link)
    end_quote = page.find('<p class="padd10 ravName">', start_quote + 1)
    return clear_line(page[start_quote:end_quote])


def _export_page(workbook, page_number):
    """Write listing page *page_number* into a new worksheet of *workbook*."""
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': 1})
    # The original issued five overlapping set_column calls that all started
    # at column A; only the last one ('A:F', 30) took effect.
    worksheet.set_column('A:F', 30)
    # Header texts are kept exactly as before, including the 'Qestion'
    # spelling, so new workbooks match the ones already produced.
    headers = ('Link', 'Date', 'Rabbi', 'Title', 'Qestion', 'Answer')
    for column, header in enumerate(headers):
        worksheet.write(0, column, header, bold)

    qa_page = LISTING_URL + str(page_number)
    rows = get_all_links(fetch_page(qa_page))
    for row_number, extract in enumerate(rows, 1):
        url, date, rabbi, title = parse_row(extract)
        question, answer = get_content(url)
        fields = (url, date, rabbi, title, question, answer)
        for column, value in enumerate(fields):
            worksheet.write(row_number, column, value)
        print(row_number)
    print(qa_page)


def main():
    """Ask for the first listing page and export every page from there on."""
    # Third-party dependency, imported here so the parsing helpers above can
    # be imported without it.
    import xlsxwriter

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input

    print('where to start?')
    first_page = int(read_line())
    for sheet in range(first_page, LAST_PAGE):
        workbook = xlsxwriter.Workbook('kipa' + str(sheet) + '.xlsx')
        try:
            _export_page(workbook, sheet)
        finally:
            # Rows collected so far are saved even if the page fails.
            workbook.close()


if __name__ == '__main__':
    main()

1条回答

网友

1楼 · 发布于 2024-04-24 06:09:00

我认为您需要的是为通过 urllib2 发出的 HTTP 请求设置超时。

您可以将请求设置为：

response = urllib2.urlopen(url, timeout=30)  # Set a timeout of 30 seconds (pass 60 for one minute).

如果想在超时后多次重试打开某个 URL，可以使用下面的函数：

def try_url_open(url, timeout=5, times=1):
    """Open *url*, retrying when the request times out.

    Args:
        url: Address passed to ``urllib2.urlopen``.
        timeout: Seconds allowed for each attempt.
        times: Maximum number of attempts; zero or a negative number means
            no attempt is made.

    Returns:
        The response of the first attempt that succeeds, or ``None`` when
        every attempt timed out (or none was made).

    Raises:
        urllib2.URLError: for any failure that is not a timeout.
    """
    # Imported locally so the function still works when pasted into a script
    # that only imports urllib2.
    import socket

    while times > 0:
        times -= 1
        try:
            # Return immediately on success instead of opening the URL again
            # for every remaining attempt.
            return urllib2.urlopen(url, timeout=timeout)
        except socket.timeout:
            continue
        except urllib2.URLError as error:
            # A timeout while connecting may arrive wrapped in URLError;
            # anything else is a real failure and is not retried.
            if not isinstance(getattr(error, 'reason', None), socket.timeout):
                raise
    return None

相关问题更多 >

编程相关推荐

热门问题

热门文章