Running Scrapy across multiple CPU cores

Posted 2024-04-20 09:26:03


Hi there. I am currently building a web scraper that is not running very fast. Is there a way to make my spider use other CPU cores, or to run several copies of the same spider in parallel?

Here is my Bricomarche spider:

# -*- coding: utf-8 -*-
import scrapy
import csv
from scrapy import FormRequest
from scrapy import Request
from scrapy.loader import ItemLoader
from bricomarche.items import Product
from datetime import date
from scrapy.loader.processors import TakeFirst

CATEGORIES = ['http://www.bricomarche.com/l/nos-produits/bricolage/outillage-et-equipement-de-l-atelier/outillage-electroportatif/perceuse-sans-fil-visseuse-accessoire-87.html?limit=90&p=1&solr_is_local=1', 'http://www.bricomarche.com/l/nos-produits/bricolage/outillage-et-equipement-de-l-atelier/outillage-electroportatif/perceuse-perforateur-et-marteau-piqueur-88.html?limit=90&p=1&solr_is_local=1', 'http://www.bricomarche.com/l/nos-produits/bricolage/outillage-et-equipement-de-l-atelier/outillage-electroportatif/meuleuse-rainureuse-accessoire-85.html?limit=90&p=1&solr_is_local=1']

class BricoMarcheSpider(scrapy.Spider):
    name = 'brico_marche'

    def start_requests(self):
        # full path
        with open('file.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for i, row in enumerate(reader):
                # strip a leading zero from the store id if present
                magasin_id = row['Id']
                if row['Id'][0] == '0':
                    magasin_id = row['Id'][1:]
                formdata = {'city': row['City'], 'market': row['Brand'], 'idPdv': magasin_id}
                # one POST per store to select it; each store gets its own cookiejar
                yield FormRequest(url='http://www.bricomarche.com/bma_popin/Geolocalisation/choisirMagasin', formdata=formdata, dont_filter=True, callback=self.parse, meta={'cookiejar': i})

    def parse(self, response):
        for url in CATEGORIES:
            yield Request(url=url, dont_filter=True, callback=self.parse_category, meta={'cookiejar': response.meta['cookiejar']})

    def parse_category(self, response):
        pos = response.xpath('//div[@class="store-details"]/p/strong/text()').extract_first()
        if pos:
            for url in response.xpath('//a[@class="view-product"]/@href').extract():
                yield Request(url=url, dont_filter=True, callback=self.parse_product, meta={'cookiejar': response.meta['cookiejar'], 'pos': pos.strip()})
            next_page = response.xpath('//a[@title="Suivant"]/@href').extract_first()
            if next_page is not None:
                yield Request(url=next_page, callback=self.parse_category, dont_filter=True, meta={'cookiejar': response.meta['cookiejar'], 'pos': pos.strip()})

    def parse_product(self, response):
        l = ItemLoader(item=Product(), response=response)
        l.default_output_processor = TakeFirst()

        l.add_value('id_source', 'BRMRCH_FR')
        l.add_value('extract_date', str(date.today()))
        l.add_value('pos_name', response.meta['pos'])
        l.add_xpath('brand_seller', '//td[@itemprop="brand"]/text()')
        l.add_xpath('price_vat', '//span[contains(@class,"new-price")]/text()')
        categories = response.xpath('//li[@itemprop="itemListElement"]//span[@itemprop="name"]/text()').extract()
        # setting categories and family
        # some products have fewer category levels, so missing indexes are simply skipped
        try:
            l.add_value('prod_name', categories[-1])
            l.add_value('prod_family', categories[-2])
            l.add_value('prod_category1', categories[0])
            l.add_value('prod_category2', categories[1])
            l.add_value('prod_category3', categories[2])
            l.add_value('prod_category4', categories[3])
        except IndexError:
            pass
        l.add_xpath('sku_seller', '//div[@class="content-fiche-produit"]/ul/li/p/text()')
        # "Réserver en magasin" (reserve in store)
        existing_stock = response.xpath('//script[contains(text(),"STOCK_PDV")]').extract()
        # "Produit disponible en magasin" (product available in store) text
        product_available = response.xpath('//span[@class="product_avaliable"]').extract()
        if existing_stock:
            l.add_value('inventory', existing_stock)
            l.add_value('available_yn', '1')
        if product_available:
            l.add_value('available_yn', '1')
            l.add_value('inventory', response.xpath('//div[@class="bg-white"]/p/text()').extract_first())
        else:
            l.add_value('available_yn', '0')
            l.add_xpath('available_pos_status', '//div[@class="fiche-items"]/div/p/text()')
            l.add_xpath('available_pos_date', '//div[@class="fiche-items"]/div/p/text()')

        return l.load_item()

Basically this is my spider. file.csv has approximately 450 rows. If I have to scrape 100 products, that is roughly 450 x 100 = 45,000 GET requests; the POST requests are only used to obtain the store cookies. Every item is added to my database. In my settings.py I set DOWNLOAD_DELAY = 0.5 and leave the other parameters at their defaults. When I tried it with AutoThrottle enabled, the run time roughly tripled. Some numbers from my tests (the relevant settings are sketched after this list):

  • 8 concurrent requests with AutoThrottle: 82 minutes for 1000 products
  • 16 concurrent requests with AutoThrottle: 73.5 minutes for 1000 products
  • 16 concurrent requests without AutoThrottle: 22.4 minutes for 1000 products
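For reference, these runs correspond to the throttling knobs in settings.py. A minimal sketch, assuming nothing beyond Scrapy's stock setting names (DOWNLOAD_DELAY, CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN, AUTOTHROTTLE_ENABLED); the values are simply the ones quoted above, not a recommendation:

# settings.py (throttling-related settings only; values taken from the tests above)
DOWNLOAD_DELAY = 0.5                 # fixed delay between requests
CONCURRENT_REQUESTS = 16             # global cap on in-flight requests
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # per-domain cap; all requests here go to one domain
AUTOTHROTTLE_ENABLED = False         # AutoThrottle roughly tripled run time in these tests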

1 Answer

The best approach is to use scrapyd.

Most of the advice in the documentation on Distributed crawls also applies to running on a single machine, except that you will be running the spider multiple times on the same scrapyd server.
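How many of those runs actually execute in parallel on one box is governed by scrapyd's own process limits. A minimal scrapyd.conf sketch, using the stock max_proc / max_proc_per_cpu options (the values shown are only illustrative):

# scrapyd.conf
[scrapyd]
max_proc = 0          # 0 = derive the limit from max_proc_per_cpu and the core count
max_proc_per_cpu = 4  # allow up to 4 spider processes per CPU core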

If you instead want to run a single (big) spider through many machines, what you usually do is partition the urls to crawl and send them to each separate spider. Here is a concrete example:

First, you prepare the list of urls to crawl and put them into separate files/urls:

http://somedomain.com/urls-to-crawl/spider1/part1.list
http://somedomain.com/urls-to-crawl/spider1/part2.list
http://somedomain.com/urls-to-crawl/spider1/part3.list

Then you fire a spider run on 3 different Scrapyd servers. The spider would receive a (spider) argument part with the number of the partition to crawl:

curl http://scrapy1.mycompany.com:6800/schedule.json -d project=myproject -d spider=spider1 -d part=1
curl http://scrapy2.mycompany.com:6800/schedule.json -d project=myproject -d spider=spider1 -d part=2
curl http://scrapy3.mycompany.com:6800/schedule.json -d project=myproject -d spider=spider1 -d part=3
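On the spider side, the part parameter passed with -d arrives as a spider argument, i.e. a constructor keyword argument. A minimal sketch of how spider1 might consume it, assuming each partN.list file contains one URL per line (the actual page-parsing logic is elided):

import scrapy

class Spider1(scrapy.Spider):
    name = 'spider1'

    def __init__(self, part=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.part = part  # e.g. '1' when scheduled with -d part=1

    def start_requests(self):
        # fetch this server's partition and crawl every URL listed in it
        url = 'http://somedomain.com/urls-to-crawl/spider1/part%s.list' % self.part
        yield scrapy.Request(url, callback=self.parse_url_list)

    def parse_url_list(self, response):
        for line in response.text.splitlines():
            if line.strip():
                yield scrapy.Request(line.strip(), callback=self.parse)

    def parse(self, response):
        # page parsing goes here
        pass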
