Python HTTPConnectionPool failed to establish a new connection: [Errno 11004] getaddrinfo failed

Published 2024-04-26 23:44:11


I want to know whether my requests are being blocked by the website and whether I need to set a proxy. I first tried closing the HTTP connection after each request, but that did not help. I also tried re-running my code, but now it seems to produce no output at all. If I use a proxy, will everything be fine? Here is the code.

import requests
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
from multiprocessing import Pool
from requests.exceptions import RequestException
import time


def get_page_index(offset, keyword):
    #headers = {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)

def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            url = item.get('article_url')
            if url and len(url) < 100:
                yield url

def get_page_detail(url):
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)

def parse_page_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    pattern = re.compile(r'articleInfo: (.*?)},', re.S)
    pattern_abstract = re.compile(r'abstract: (.*?)\.', re.S)
    res = re.search(pattern, html)
    res_abstract = re.search(pattern_abstract, html)
    if res and res_abstract:
        data = res.group(1).replace(r".replace(/<br \/>|\n|\r/ig, '')", "") + '}'
        abstract = res_abstract.group(1).replace(r"'", "")
        content = re.search(r'content: (.*?),', data).group(1)
        source = re.search(r'source: (.*?),', data).group(1)
        time_pattern = re.compile(r'time: (.*?)}', re.S)
        date = re.search(time_pattern, data).group(1)
        date_today = time.strftime('%Y-%m-%d')
        img = re.findall(r'src=&quot;(.*?)&quot', content)
        if date[1:11] == date_today and len(content) > 50 and img:
            return {
                'title': title,
                'content': content,
                'source': source,
                'date': date,
                'abstract': abstract,
                'img': img[0]
            }

def main(offset):
    flag = 1
    html = get_page_index(offset, '光伏')
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            data = parse_page_detail(html)
            if data:
                html_parser = HTMLParser()
                cwl = html_parser.unescape(data.get('content'))
                data['content'] = cwl
                print(data)
                print(data.get('img'))
                flag += 1
                if flag == 5:
                    break



if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i*20 for i in range(10)])

The error is here:

HTTPConnectionPool(host='tech.jinghua.cn', port=80): Max retries exceeded with url: /zixun/20160720/f191549.shtml (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000048523C8>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))

By the way, when I first started testing my code, everything worked fine! Thanks in advance!
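For reference, if a proxy does turn out to be necessary, `requests` accepts one through its `proxies` parameter. This is only a minimal sketch: the proxy address below is a placeholder, not a working proxy, and the function simply returns `None` on any request failure.

```python
import requests
from requests.exceptions import RequestException


def fetch_via_proxy(url, proxy):
    """Fetch url through the given HTTP proxy; return the body or None."""
    # The same proxy is used for both schemes here for simplicity.
    proxies = {"http": proxy, "https": proxy}
    try:
        response = requests.get(url, proxies=proxies, timeout=5)
        return response.text if response.status_code == 200 else None
    except RequestException as e:
        print(e)
        return None


# Example call with a placeholder proxy address:
# fetch_via_proxy("http://www.toutiao.com/", "http://127.0.0.1:8888")
```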


2 Answers

When I faced this problem, I had the following symptoms:

  - The requests Python module could not fetch anything from any URL, even though I could browse the same site in a browser and download the page with wget or curl.
  - pip install did not work either; it failed with the following error:

Failed to establish a new connection: [Errno 11004] getaddrinfo failed

Some site had blocked me, so I tried forcebindip to bind my Python modules to another network interface, and later removed it. That probably messed up my networking: my requests module, and even the plain socket module, got stuck and could not fetch any URL.

So I followed the network configuration reset described at the URL below, and now I am fine.

network configuration reset
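Since [Errno 11004] is a DNS-resolution failure rather than an HTTP error, one quick way to tell whether the network stack itself is broken is to call `socket.getaddrinfo` directly, bypassing requests entirely. A minimal sketch (the host name is just an example):

```python
import socket


def can_resolve(host):
    """Return True if DNS resolution succeeds for the given host.

    On Windows a failing lookup raises socket.gaierror with
    [Errno 11004]; on other platforms the errno differs but the
    meaning is the same: the name could not be resolved.
    """
    try:
        socket.getaddrinfo(host, 80)
        return True
    except socket.gaierror as e:
        print(f"DNS lookup failed for {host}: {e}")
        return False


# Example: can_resolve("tech.jinghua.cn")
```

If this returns False for hosts that a browser can reach, the problem is in the OS network configuration, not in requests.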

In my opinion, you are hitting the connection limit of the HTTPConnectionPool, because you start 10 downloads at the same time through the process pool.

Try one of the following:

  1. Increase the request timeout (in seconds): requests.get('url', timeout=5)
  2. Close the response with Response.close(). Instead of returning response.text directly, assign the response to a variable, close the response, and then return that variable.
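The two suggestions above can be combined into one helper. This is a sketch, not the asker's original function: it adds a timeout and explicitly closes the response before returning the body.

```python
import requests
from requests.exceptions import RequestException


def get_page(url):
    """Fetch a page with a timeout, close the response, return the body."""
    try:
        response = requests.get(url, timeout=5)  # timeout in seconds
        # Read the body into a variable before closing, as suggested above.
        text = response.text if response.status_code == 200 else None
        response.close()  # release the underlying connection
        return text
    except RequestException as e:
        print(e)
        return None
```

Using a single requests.Session shared per worker (instead of a new connection per requests.get call) would also reduce pressure on the connection pool.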
