刮下的图像是正确的

# os
import os
# http request
import requests
# pretty printing
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    """Save every URL in ``img_urls`` as ``img/<index>.jpg``, counting from 0."""
    total = len(img_urls)

    for position, link in enumerate(img_urls):
        target = 'img/%s.jpg' % (position)
        # Make sure the output directory exists before the file is opened.
        os.makedirs(os.path.dirname(target), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % target)
        print('Downloading: %s out of %s' % (position, total))

        # The file is opened first and the image is fetched while it is open.
        with open(target, 'wb') as out_file:
            payload = requests.get(link).content
            out_file.write(payload)


def get_page_number(num):
    """Fetch http://digg.com, download the thumbnails it lists, return their URLs.

    Only the ``src`` attribute of each thumbnail ``<img>`` is read.
    ``num`` is accepted but not used by this function.
    """
    page_bytes = requests.get('http://digg.com').content
    tree = html.fromstring(page_bytes)

    thumbnails = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    # Story descriptions are extracted as well, but nothing consumes them yet.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumbnails)
    return thumbnails


if __name__ == '__main__':
    # The page number is hard-coded rather than read from user input.
    page_number = 4
    get_page_number(page_number)

1条回答

网友

1楼 · 发布于 2024-05-19 01:37:53

图像“损坏”的原因是页面的结构发生了变化：真实的图像地址开始“隐藏”在data-src属性中，而不再位于代码所读取的src属性中。请参见以下抓取页面的源代码示例，其中同时带有这两个属性：

<img
class="digg-story__image-img js digg-story__image-img lazy-image-img need-offset"
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
width="312"
height="170"
alt=""
/>

换句话说，在创建图像URL列表时，必须同时检查src和data-src这两个属性，并赋予data-src优先权。

下面的代码使用了这一“技巧”，可以下载到正确的图像：

# os
import os
# http request
import requests
#
import pprint

import time

# import html from lxml
from lxml import html

# Module-level helpers: a pretty-printer configured with a 4-space indent,
# and a page-number slot that starts at 0.
pp = pprint.PrettyPrinter(indent=4)
global_page_num = 0

# write to file
def download_image(img_urls):
    """Download every URL in ``img_urls`` into the ``img/`` directory.

    Files are named ``img/<index>.jpg`` where ``index`` is the 0-based
    position of the URL in ``img_urls``. Responses with a non-success HTTP
    status are skipped so that an error page is never saved under a
    ``.jpg`` name.

    Args:
        img_urls: Sequence of image URLs to fetch.

    Returns:
        None. Nothing is created on disk when ``img_urls`` is empty.
    """
    amount = len(img_urls)
    if not amount:
        return

    # Every file goes to the same directory, so create it once up front.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        # Progress is reported 1-based so the last line reads "N out of N".
        print('Downloading: %s out of %s' % (index + 1, amount))

        # A timeout keeps one stalled server from hanging the whole run.
        response = requests.get(value, timeout=30)
        if not response.ok:
            print('Skipping %s: HTTP %s' % (value, response.status_code))
            continue

        # Open the file only after a good response, so failures leave no
        # empty or bogus image behind.
        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape the Digg front page, download its thumbnails, return their URLs.

    For each thumbnail ``<img>`` the ``data-src`` attribute is preferred,
    because lazy-loaded images keep the real address there and only a
    placeholder in ``src``. ``src`` is used when ``data-src`` is absent.
    The placeholder image (``x_455x248.png``) is never returned.

    Args:
        num: Page number requested by the caller. Currently unused: the
            front page URL is always fetched.

    Returns:
        List of image URLs in document order, one per thumbnail at most.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    url = 'http://digg.com'
    page = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page that has no thumbnails.
    page.raise_for_status()
    selector = html.fromstring(page.content)

    # The class name contains a double hyphen; @class= is an exact match.
    images = selector.xpath("//div[@class='digg-story__image--thumb']/a/img")

    img_urls = []
    for image in images:
        # data-src wins over src so each image contributes a single URL.
        img_url = image.get('data-src') or image.get('src')
        if img_url and 'x_455x248.png' not in img_url:
            img_urls.append(img_url)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # Script entry point. The page number is fixed here rather than being
    # read interactively from the user.
    page_number = 4
    get_page_number(num=page_number)

相关问题更多 >

编程相关推荐

热门问题

热门文章