扫描图像的URL模式？

import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from example.items import ExampleItem


class exampleSpider(CrawlSpider):
    """Crawl example.com and build one ExampleItem per followed page."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://a.example.com/2vZBkE']
    #rules = [Rule(LinkExtractor(allow=['/.*']),'parse_example')]
    # Use the LinkExtractor that is actually imported above; the previous
    # SgmlLinkExtractor name was never imported and raised NameError as soon
    # as the class body was evaluated.
    # NOTE(review): the pattern contains a literal "%s" that is never
    # substituted -- confirm which path segment was intended here.
    rules = (
        Rule(LinkExtractor(allow=(r'\/%s\/.*',)), callback='parse_example'),
    )

    def parse_example(self, response):
        """Extract the image title and the first image URL from a page.

        Returns an ExampleItem with ``title`` and ``image_urls`` set, or
        None when the page contains no ``<img>`` element at all.
        """
        image = ExampleItem()
        image['title'] = response.xpath(
            "//h5[@id='image-title']/text()").extract()
        rel = response.xpath("//img/@src").extract()
        if not rel:
            # No image on this page: skip it instead of failing on rel[0].
            return None
        # Presumably the src attribute is protocol-relative ("//host/path"),
        # hence the hard-coded scheme prefix -- verify against the real pages.
        image['image_urls'] = ['http:' + rel[0]]
        return image

2条回答

网友

1楼 · 编辑于 2024-04-19 23:11:19

提取链接，例如 href="a.example.com/123456.jpg"

使用以下正则表达式：

"=\"(\S+/[\w\d]{6}.jpg)"

网友

2楼 · 编辑于 2024-04-19 23:11:19

我不熟悉 Scrapy，但你可以用 requests 和 itertools 来实现。

from string import ascii_letters, digits
from itertools import product
import requests

# You need to implement this function to download images
# check this http://stackoverflow.com/questions/13137817
def download_image(url):
    """Placeholder download hook: currently only prints *url*.

    The call form ``print(url)`` is valid on both Python 2 and Python 3,
    unlike the Python 2-only ``print url`` statement.
    """
    print(url)

def check_link(url, timeout=10):
    """Return True if a HEAD request to *url* answers with HTTP 200.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests may block indefinitely on a stalled host.

    Returns:
        True when the response status code is exactly 200; False for any
        other status or when the request itself fails (connection error,
        timeout, invalid URL, ...).
    """
    try:
        r = requests.head(url, timeout=timeout)
    except requests.RequestException:
        # One unreachable URL must not abort a scan over many candidates.
        return False
    return r.status_code == 200

# Check all possible combinations and check them
def generate_links(base_url="http://a.example.com/{}.jpg", length=5,
                   alphabet=ascii_letters + digits):
    """Try every candidate image ID and download the ones that exist.

    Args:
        base_url: Format string with one ``{}`` placeholder for the ID.
        length: Number of characters in each generated ID.
        alphabet: Characters the ID may be composed of.

    Every string of *length* characters drawn from *alphabet* is formatted
    into *base_url*; URLs accepted by ``check_link`` are handed to
    ``download_image``.
    """
    # NOTE(review): the sample ID in the question ("2vZBkE") has 6
    # characters, while the default here is 5 -- confirm the real ID length.
    # The search space is len(alphabet) ** length requests, so it grows
    # very quickly with *length*.
    for combination in product(alphabet, repeat=length):
        url = base_url.format("".join(combination))
        if check_link(url):
            download_image(url)

相关问题更多 >

编程相关推荐

热门问题

热门文章