加速HTTP请求python和500

import requests
import re
from bs4 import BeautifulSoup
import csv

# Search endpoint; {index} is the 1-based result page, the remaining
# placeholders are supplied by the caller of run().
URL = 'http://www.gulf-times.com/AdvanceSearchNews.aspx?Pageindex={index}&keywordtitle={query}&keywordbrief={query}&keywordbody={query}&category=&timeframe=&datefrom={datefrom}&dateTo={dateto}&isTimeFrame=0'


def run(**params):
    """Scrape every search-result page and append the hits to a CSV file.

    Pages are fetched sequentially, starting at index 1, until a page
    contains no ``div.newslist`` elements. For each article a
    ``(date, title, url)`` row is appended to ``EgyptDaybyDay.csv``.

    Args:
        **params: Values for the placeholders in ``URL`` (``query``,
            ``datefrom``, ``dateto``). The ``index`` key is overwritten
            for every page that is requested.

    Raises:
        AttributeError: If an article's date line does not contain a
            ``dd-mm-yyyy`` date (the regex search returns ``None``).
    """
    # Compiled once instead of being re-parsed for every article.
    date_pattern = re.compile(r"([0-9]{2}\-[0-9]{2}\-[0-9]{4})")

    # `with` guarantees the file is closed even when a request or a parse
    # step raises part-way through the crawl.
    with open("EgyptDaybyDay.csv", "a") as countryFile:
        # A single writer is enough for the whole run.
        w = csv.writer(countryFile, delimiter=',', quotechar='|',
                       quoting=csv.QUOTE_MINIMAL)
        i = 1
        while True:
            params["index"] = str(i)
            response = requests.get(URL.format(**params))
            # Parenthesised form prints the same text on Python 2 and 3.
            print(response.status_code)

            htmlFile = BeautifulSoup(response.content)
            articles = htmlFile.findAll("div", {"class": "newslist"})
            if not articles:
                # An empty page marks the end of the result set.
                break

            for article in articles:
                url = (article.a['href']).encode('utf-8', 'ignore')
                title = (article.img['alt']).encode('utf-8', 'ignore')
                dateline = article.find("div", {"class": "floatright"})
                m = date_pattern.search(dateline.string)
                date = m.group(1)
                w.writerow((date, title, url))

            i += 1


run(query="Egypt", datefrom="12-01-2010", dateto="12-01-2011")

3条回答

网友

1楼 · 编辑于 2024-06-16 11:34:22

在我看来，你是在寻找这家报纸没有公开提供的订阅源（feed）。不过，这是一个早已有人解决过的问题——有很多网站可以为任意网站生成订阅源，这至少能解决你的其中一个问题。其中一些需要人工指引，另一些则可调整的余地较小、自动化程度更高。

如果您可以避免自己进行分页和解析，我建议您这样做。如果不能，为了简单起见，我赞成使用 gevent。话虽如此，如果服务器返回的是 500，那么问题很可能不在你的代码上，增加并行度也不会有帮助。

网友

2楼 · 编辑于 2024-06-16 11:34:22

这是一个尝试 gevent 的好机会。

你应该让 requests.get 调用在单独的协程中执行，使应用程序不必在 IO 上阻塞等待。

然后，您可以生成多个worker，并使用队列来传递请求和文章。也许是类似的事情：

# gevent bootstrap: Queue is used for the request/article queues and sleep
# for the simulated network delay in the worker.
import gevent.monkey
from gevent.queue import Queue
from gevent import sleep
# Monkey-patch the standard library so that blocking calls cooperate with
# gevent's scheduler.
# NOTE(review): gevent's documentation recommends patching as early as
# possible, ideally before other modules are imported - confirm the order.
gevent.monkey.patch_all()

# Capacity of the request queue; the __main__ section also spawns this many
# workers.
MAX_REQUESTS = 10

# Bounded queue of pending page numbers and a queue of finished articles.
requests = Queue(MAX_REQUESTS)
articles = Queue()

# Stand-in server responses, stored in descending order so that successive
# pop() calls hand out 0, 1, 2, ...
mock_responses = list(reversed(range(100)))

def request():
    """Worker loop: take a page number, simulate a fetch, publish an article.

    Each iteration takes one item from the ``requests`` queue, sleeps for
    one second and then puts the next mock response on ``articles``. Once
    the mock responses are exhausted, ``StopIteration`` is put on
    ``articles`` and the worker exits.
    """
    print("worker started")
    while True:
        page = requests.get()
        print("request %s" % page)
        sleep(1)

        # Only the pop() can run dry, so keep the try body that small.
        try:
            payload = mock_responses.pop()
        except IndexError:
            articles.put(StopIteration)
            return
        articles.put('article response %s' % payload)

def run():
    """Producer loop: put the page numbers 1, 2, 3, ... on ``requests``.

    The loop never terminates on its own.
    """
    print("run")

    page = 0
    while True:
        page += 1
        requests.put(page)

if __name__ == '__main__':
    # Start MAX_REQUESTS worker greenlets, then the producer feeding them.
    for _ in range(MAX_REQUESTS):
        gevent.spawn(request)
    gevent.spawn(run)

    # Consume results as they arrive.
    # NOTE(review): presumably the StopIteration sentinel put by a worker
    # ends this iteration - confirm against the gevent Queue docs.
    for item in articles:
        print("Got article: %s" % item)

网友

3楼 · 编辑于 2024-06-16 11:34:22

最慢的可能是服务器，因此并行化http请求将是提高代码运行速度的最佳方法，尽管要加快服务器响应速度几乎无能为力。在IBM上有一个很好的教程，教你如何做到这一点

相关问题更多 >

编程相关推荐

热门问题

热门文章