I think what you're looking for is something like this:

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider1(scrapy.Spider):
    # Your first spider definition
    ...

class MySpider2(scrapy.Spider):
    # Your second spider definition
    ...

process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start()  # the script will block here until all crawling jobs are finished

You can read more about it in the Scrapy docs under running-multiple-spiders-in-the-same-process.
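To make the stubs concrete: a minimal spider that could stand in for MySpider1 might look like the sketch below. The spider name and start URL are made-up placeholders, not part of the original answer.

import scrapy

class MySpider1(scrapy.Spider):
    name = "spider1"                      # hypothetical spider name
    start_urls = ["https://example.com"]  # hypothetical start URL

    def parse(self, response):
        # yield one item with the page title
        yield {"title": response.css("title::text").get()}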
Or you can run it like this; you need to save this code in the same directory as scrapy.cfg (my Scrapy version is 1.3.3):

from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

setting = get_project_settings()
process = CrawlerProcess(setting)

for spider_name in process.spiders.list():
    print("Running spider %s" % (spider_name))
    process.crawl(spider_name, query="dvh")  # query="dvh" is a custom argument passed to your spider
process.start()
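As for the custom query argument: keyword arguments passed to process.crawl() are forwarded to the spider's constructor, and scrapy.Spider's default __init__ stores them as instance attributes. A sketch of a spider consuming it (the spider name and URL below are hypothetical):

import scrapy

class SearchSpider(scrapy.Spider):
    name = "search"  # hypothetical spider name

    def start_requests(self):
        # self.query was set from process.crawl(..., query="dvh")
        url = "https://example.com/search?q=%s" % self.query
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        yield {"query": self.query, "url": response.url}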
A better solution (if you have many spiders) is to fetch the spiders dynamically and run them. Also, since spiders.list() was deprecated in Scrapy 1.4, the second solution (yuda's) should be converted to something like this:

from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from scrapy import spiderloader
from scrapy.crawler import CrawlerRunner
from scrapy.utils import project

settings = project.get_project_settings()
runner = CrawlerRunner(settings)

@inlineCallbacks
def crawl():
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [spider_loader.load(name) for name in spiders]
    for my_spider in classes:
        yield runner.crawl(my_spider)  # run each spider one after another
    reactor.stop()

crawl()
reactor.run()  # the script will block here until all crawling jobs are finished
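If the spiders don't need to run one after another, the Scrapy docs also describe a concurrent variant: start every crawl up front and use runner.join() to get a Deferred that fires when all of them finish. A minimal sketch under the same project-layout assumptions:

from twisted.internet import reactor
from scrapy import spiderloader
from scrapy.crawler import CrawlerRunner
from scrapy.utils import project

settings = project.get_project_settings()
runner = CrawlerRunner(settings)
spider_loader = spiderloader.SpiderLoader.from_settings(settings)

# start every spider immediately; they crawl concurrently
for spider_name in spider_loader.list():
    runner.crawl(spider_loader.load(spider_name))

d = runner.join()  # fires once all crawls have finished
d.addBoth(lambda _: reactor.stop())
reactor.run()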