How do I scrape images from a website?

Posted 2024-06-06 13:16:45


How can I get all the images from this site: http://www.theft-alerts.com? We need the images from all 19 pages. We already have this code, but it doesn't work yet. We want the images saved in a new folder.

#!/usr/bin/python

import urllib2
from bs4 import BeautifulSoup
from urlparse  import urljoin

url = "http://www.theft-alerts.com/index-%d.html"
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")

base = "http://www.theft-alerts.com"

images = [urljoin(base,a["href"]) for a in soup.select("td a[href^=images/]")]

for url in images:
    img = BeautifulSoup(urllib2.urlopen(url).read(), "lxml").find("img")["src"]
    # Write in binary mode so the image bytes are not mangled.
    with open("myimages/{}".format(img), "wb") as f:
        f.write(urllib2.urlopen("{}/{}".format(url.rsplit("/", 1)[0], img)).read())

1 Answer
Answered by a user, 2024-06-06 13:16:45

You need to visit each page in turn and extract its images. You can keep looping as long as the last anchor inside the code tag with class resultnav has the text "Next":

import requests

from bs4 import BeautifulSoup
from urlparse import urljoin  # urllib.parse in Python 3

def get_pages(start):
    soup = BeautifulSoup(requests.get(start).content, "html.parser")
    yield [img["src"] for img in soup.select("div.itemspacingmodified a img")]
    # The last anchor inside the code.resultnav tag reads "Next" on every
    # page except the final one, so follow it until it disappears.
    nxt = soup.select("code.resultnav a")[-1]
    while nxt.text == "Next":
        soup = BeautifulSoup(requests.get(urljoin(start, nxt["href"])).content,
                             "html.parser")
        yield [img["src"] for img in soup.select("div.itemspacingmodified a img")]
        nxt = soup.select("code.resultnav a")[-1]

url = "http://www.theft-alerts.com/"

for images in get_pages(url):
    print(images)

This gives you the images from all 19 pages.
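The generator yields relative src paths (e.g. images/...); to save the files you still need to resolve each one against the site root and download it. A minimal Python 3 sketch, where the download_images helper, the myimages folder name, and the use of requests are assumptions, not part of the original answer:

```python
import os
from urllib.parse import urljoin  # "from urlparse import urljoin" on Python 2

def download_images(image_srcs, base="http://www.theft-alerts.com/",
                    out_dir="myimages"):
    """Resolve each relative src against the site root and save the file.

    The helper name, out_dir, and requests usage are assumptions for
    illustration; adapt them to your own layout.
    """
    import requests  # third-party: pip install requests
    os.makedirs(out_dir, exist_ok=True)   # create the output folder if missing
    for src in image_srcs:
        url = urljoin(base, src)          # "images/123.jpg" -> absolute URL
        name = os.path.basename(src)      # keep only the file name part
        with open(os.path.join(out_dir, name), "wb") as f:  # binary mode
            f.write(requests.get(url).content)
```

You would call it as `download_images(images)` inside the `for images in get_pages(url)` loop.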
