Python requests.get and threading for different servers

Posted 2024-05-16 09:46:09


I'm working on a simple web scraper and trying to add some multithreading. My code works fine on some servers (cutting execution time significantly), but my main goal is to make it work on a few specific ones. When I try it on the sites in the list below, I get performance as if I were still running the sequential code. Any idea what causes this?

import requests, time
from bs4 import BeautifulSoup
from threading import Thread
from random import choice

# Enable to get some logging info
#---------------------------------
# import logging
# import http.client
# http.client.HTTPConnection.debuglevel = 1
# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
# requests_log = logging.getLogger("requests.packages.urllib3")
# requests_log.setLevel(logging.DEBUG)
# requests_log.propagate = True

sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]

class Pikabu_Downloader(Thread):
    def __init__(self, url, name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url = url
        self.name = name
        self.begin = time.time()

    def run(self):
        print("Beginning with thread number",self.name, ",", round(time.time()-self.begin, 4), " seconds has passed")
        html_data = self._get_html()
        print("After requests.get with thread number", self.name, ",", round(time.time()-self.begin, 4), " seconds has passed")
        if html_data is None:
            return
        self.soup = BeautifulSoup(html_data, "html.parser")
        print("After making soup with thread number", self.name, ",", round(time.time() - self.begin, 4), " seconds has passed")

    def _get_html(self):
        try:
            # Note: these are fragments of a single User-Agent string,
            # so choice() sends only one fragment as the User-Agent header
            user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
            print(f"Go {self.url}...")
            res = requests.get(self.url, headers={'User-Agent': choice(user_agents)}, stream=True)  # , allow_redirects=False
        except Exception as exc:
            print(exc)
        else:
            return res.text

test = "https://readingbooks.site/read/?name=1984&"

def download():
    pikabu_urls = []
    for url in sites:
        pikabu = [url + "?page=" + str(x) for x in range(1, 10)]
        pikabu_urls = pikabu_urls + pikabu

    pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(pikabu_urls)]
    # Comment out the line above and uncomment the two lines below
    # to run against the test server instead
    # tests = [test + "page=" + str(x) for x in range(1, 10)]
    # pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(tests)]

    for pikabu_dl in pikabu_dls:
        pikabu_dl.start()

    for pikabu_dl in pikabu_dls:
        pikabu_dl.join()

download()

The result is

...
After requests.get with thread number 1 , 1.6904  seconds has passed
After making soup with thread number 1 , 1.7554  seconds has passed
After requests.get with thread number 2 , 2.9805  seconds has passed
After making soup with thread number 2 , 3.0455  seconds has passed
After requests.get with thread number 3 , 4.3225  seconds has passed
After making soup with thread number 3 , 4.3895  seconds has passed
...

What could be causing the delay between thread executions? I expected each thread to finish at almost the same time and to get more interleaved output, as happens with the test server. And if I set a 5-second timeout in requests.get, most of the requests fail.


Tags: name, self, url, number, get, time, logging, with
1 Answer

A user
#1 · Posted 2024-05-16 09:46:09

After looking into your case, here are some of the problems you've run into:

  • Don't print from parallel tasks; writes to the screen are serialized, which creates a bottleneck.
  • A huge number of tasks is not always good for performance; it depends on how much memory you can spare. Say you have 1000 links: do you really have to create 1000 task objects? No; just use a ThreadPool with a fixed size of 5-20 workers.
  • The server handling your requests is also a factor: download size, low bandwidth, network conditions, distance... a slow response ties up your machine. Your target site is heavy, and each request seems to take 1-3000 ms, so when you test with a small batch (20 links) it feels as if it were running sequentially.
  • Your code does run in parallel, since you put the work on separate threads, but that is not quite the right tool here; for truly asynchronous I/O you want a fully async library such as asyncio and aiohttp. aiohttp handles masses of concurrent requests via coroutines, while asyncio provides the syntax and drives them on the main thread (see the sketch after this list).
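To make the last point concrete, here is a minimal asyncio/aiohttp sketch, assuming aiohttp is installed; it reuses the URLS list built in the snippet further below:

import asyncio
import aiohttp

async def fetch(session, url):
    # Each coroutine awaits its own response; the event loop interleaves them
    async with session.get(url) as resp:
        return await resp.text()

async def fetch_all(urls):
    # A single shared ClientSession reuses connections across all requests
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# Usage, with URLS as defined below:
# pages = asyncio.run(fetch_all(URLS))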

I ran a small experiment on Colab. Note that I did not use asyncio and aiohttp there because Colab kept stalling, but I have used them in several projects before, and they run faster than even the fastest method below.

The second function is your implementation.


import concurrent.futures
from threading import Thread
import time, requests
from random import choice

user_agents = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/74.0.3729.169', 'Safari/537.36')
timeout = 5
sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot"
]

URLS = []
for url in sites:
    pikabu = [url + "?page=" + str(x) for x in range(25)]
    URLS.extend(pikabu)

def convert_to_threads():
    return [Thread(target=load_url, args=(page, timeout)) for page in URLS]

def running_threads():
    threads = convert_to_threads()
    start = time.time()
    for i in threads:
        i.start()

    for i in threads:
        i.join()
    print(f'Finish with {len(URLS)} requests {time.time() - start}')


def load_url(url, timeout):
    # Pass the timeout through so a stalled server cannot hang a worker forever
    res = requests.get(url, headers={'User-Agent': choice(user_agents)}, stream=True, timeout=timeout)  # , allow_redirects=False
    return res.text


def running_sequence():
    start = time.time()
    for url in URLS:
        load_url(url, timeout)
    print(f'Finish with {len(URLS)} requests {time.time() - start}')
    
def running_thread_pool():
    start = time.time()

    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        # Start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url, timeout): url for url in URLS}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            # else:
            #     print('%r page is %d length' % (url, len(data)))
    
    print(f'Finish with {len(URLS)} requests {time.time() - start}')
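To compare the three variants, a minimal driver might look like this (a sketch, using the function definitions above; run it once per variant for cleaner timings):

if __name__ == "__main__":
    running_sequence()     # baseline: one request after another
    running_threads()      # one Thread object per URL
    running_thread_pool()  # fixed pool of 15 workers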

In short, I suggest you use a ThreadPool (preferable on Colab) or asyncio with aiohttp (off Colab) to improve speed.
