A simple web crawler



I have built a very simple web crawler to fetch about 100 small JSON files from the URL below. The problem is that the crawler takes more than an hour to finish. Given how small the JSON files are, I find that hard to understand. Am I doing something fundamentally wrong?

import json

import requests
from lxml import html

def get_senate_vote(vote):
    # Fetch a single Senate vote as JSON.
    URL = 'https://www.govtrack.us/data/congress/113/votes/2013/s%d/data.json' % vote
    response = requests.get(URL)
    json_data = json.loads(response.text)
    return json_data

def get_all_votes():
    all_senate_votes = []
    URL = "http://www.govtrack.us/data/congress/113/votes/2013"
    response = requests.get(URL)
    root = html.fromstring(response.content)
    # The directory listing is a <pre> block of links such as "s1/", "s2/", ...
    for a in root.xpath('/html/body/pre/a'):
        link = a.xpath('text()')[0].strip()
        if link[0] == 's':
            vote = int(link[1:-1])  # strip the leading 's' and the trailing '/'
            try:
                vote_json = get_senate_vote(vote)
            except Exception:
                # Bail out with whatever has been collected so far.
                return all_senate_votes
            all_senate_votes.append(vote_json)

    return all_senate_votes

vote_data = get_all_votes()

2 Answers

Here is a fairly simple version of your code in which I timed each call. On my machine it averages about 2 seconds per request, and there are 582 pages to visit, so that works out to roughly 2 × 582 ≈ 1164 seconds (about 19 minutes) in total without printing the JSON to the console; printing it would likely add even more time on top of the network wait.

#!/usr/bin/python

import re
import time

import requests

def find_votes():
    # Scrape the directory listing and pull out the Senate vote ids (s1, s2, ...).
    r = requests.get("https://www.govtrack.us/data/congress/113/votes/2013/")
    data = r.text
    votes = re.findall(r's\d+', data)
    return votes

def crawl_data(votes):
    print("Total pages: " + str(len(votes)))
    for x in votes:
        url = 'https://www.govtrack.us/data/congress/113/votes/2013/' + x + '/data.json'
        t1 = time.time()
        r = requests.get(url)
        vote_json = r.json()  # parse, but do not print, the JSON payload
        print(time.time() - t1)  # per-request time in seconds

crawl_data(find_votes())
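
Nearly all of that time is spent waiting on the network, one request after another, so one lever while staying with requests is to run the downloads concurrently. Below is a minimal sketch (not part of the original answer) using the standard-library ThreadPoolExecutor; the helper names fetch_vote/crawl_concurrently and the worker count of 8 are hypothetical choices for illustration, and it reuses find_votes() from the snippet above.

# Hypothetical variant, not part of the original answer: the same data.json URLs,
# fetched from a small thread pool instead of a strictly serial loop.
from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_vote(vote_id):
    # vote_id is a string such as 's1' (as returned by find_votes above).
    url = 'https://www.govtrack.us/data/congress/113/votes/2013/' + vote_id + '/data.json'
    return requests.get(url).json()

def crawl_concurrently(votes, workers=8):  # the worker count is an arbitrary choice
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map() preserves the input order and blocks until all downloads finish.
        return list(pool.map(fetch_vote, votes))

# Example usage, reusing find_votes() from the snippet above:
# all_votes = crawl_concurrently(find_votes())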

If you are using Python 3.x and you are crawling multiple sites, then for better performance I warmly suggest using the aiohttp module, which implements the asynchronous approach. For example:

import asyncio

import aiohttp

sites = ['url_1', 'url_2']
results = []

def save_response(task):
    # Done-callback: stash the finished task's result.
    site_content = task.result()
    results.append(site_content)

async def crawl_site(site):
    # Download one site and return its body as text.
    async with aiohttp.ClientSession() as session:
        async with session.get(site) as resp:
            return await resp.text()

async def main():
    tasks = []
    for site in sites:
        task = asyncio.ensure_future(crawl_site(site))
        task.add_done_callback(save_response)
        tasks.append(task)
    await asyncio.gather(*tasks)

asyncio.run(main())
print(results)

More information about aiohttp can be found in its documentation.
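
Applied to the question's govtrack vote URLs, a sketch of the same asynchronous approach might look like the following. It is an illustration under stated assumptions rather than the answer's own code: the fetch_vote/crawl_votes helpers, the semaphore cap of 10, and the example vote range are all made up for demonstration.

import asyncio
import json

import aiohttp

# Hypothetical adaptation of the aiohttp answer to the question's vote URLs.
# The concurrency cap (10) and the example vote range are assumptions.

async def fetch_vote(session, semaphore, vote):
    url = 'https://www.govtrack.us/data/congress/113/votes/2013/s%d/data.json' % vote
    async with semaphore:  # keep at most 10 requests in flight at once
        async with session.get(url) as resp:
            return json.loads(await resp.text())

async def crawl_votes(votes):
    semaphore = asyncio.Semaphore(10)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_vote(session, semaphore, v) for v in votes]
        # return_exceptions=True keeps one failed vote from cancelling the rest.
        return await asyncio.gather(*tasks, return_exceptions=True)

# Example usage: fetch votes s1 .. s20 concurrently.
# results = asyncio.run(crawl_votes(range(1, 21)))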
