How to dynamically set the "start URLs" in a CrawlSpider


I am using a CrawlSpider with a Rule and a LinkExtractor. I want to set the start URLs dynamically from external files with the code below, but it does not work.

Below are the error message and my code. This may be a simple/silly question, but I would really appreciate any hint that helps me solve it. Thanks in advance.

Traceback (most recent call last):
  File "/home/ec2-user/venv/lib64/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
    result = g.send(result)
  File "/home/ec2-user/venv/lib64/python3.7/site-packages/scrapy/crawler.py", line 88, in crawl
    start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable

class StorelistSpider(CrawlSpider):
    name = "crawler"

    allowed_domains = ["example.com"]

    def start_requests(self):
        #Target Category
        with open('CategoryList.txt') as f1:
            for q1 in f1:
                targetCategory = q1

                #Target Prefecture
                with open('prefectureList.txt') as f2:
                    for q2 in f2:
                        prefectureName = q2


                        start_urls=("https://example.com/" + q2 + "/")

                        #rules to follow links:
                        rules = (
                            #follow area link first, then category link next, check list pages and go to the details
                            Rule(LinkExtractor(
                                allow=r"/\w+/A\d{4}/$",
                                restrict_xpaths = "//*[@id='js-leftnavi-area-scroll']",
                                unique = True,)),
                            Rule(LinkExtractor(
                                allow=r"/\w+/A\d{4}/rstLst/" + "{}".format(targetCategory) + r"/$",
                                restrict_xpaths = "//*[@id='js-leftnavi-genre-balloon']",
                                unique = True,)),
                            Rule(LinkExtractor(
                                allow=r"/\w+/A\d{4}/rstLst/" + "{}".format(targetCategory) + r"/\d*/$",
                                restrict_xpaths = "//*[@id='container']/div[15]/div[4]/div/div[7]/div/ul",
                                unique = True,)),
                            Rule(LinkExtractor(
                                allow=r"/\w+/A\d{4}/A\d{6}/\d+/$",
                                restrict_xpaths = "//*[@id='container']/div[15]/div[4]/div/div[6]",
                                unique = True,
                            ), callback="page_parse"),
                        )

    def page_parse(self, response):
        yield Page.from_response(response)

1 Answer

The start_requests method must return an iterable of requests. Your version never yields or returns anything, so it returns None, which is exactly what the TypeError above complains about. See https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests
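
For reference, a minimal sketch of the required shape (the URL is just a placeholder):

import scrapy
from scrapy.spiders import CrawlSpider


class MinimalSpider(CrawlSpider):
    name = "minimal"

    def start_requests(self):
        #the yield turns this method into a generator, so Scrapy can
        #iterate over it; the original method yields nothing and
        #therefore returns None
        yield scrapy.Request("https://example.com/")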

Also, rules must be declared as a class attribute. See https://docs.scrapy.org/en/latest/topics/spiders.html#crawlspider-example

So your example should look like this:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class StorelistSpider(CrawlSpider):
    name = "crawler"

    allowed_domains = ["example.com"]

    #rules to follow links:
    rules = (
        #follow area link first, then category link next, check list pages and go to the details
        Rule(LinkExtractor(
            allow=r"/\w+/A\d{4}/$",
            restrict_xpaths = "//*[@id='js-leftnavi-area-scroll']",
            unique = True,)),
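        #NOTE: targetCategory is not defined at class scope; see the
        #note below the code for how to rewrite these rules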
        Rule(LinkExtractor(
            allow=r"/\w+/A\d{4}/rstLst/" + "{}".format(targetCategory) + r"/$",
            restrict_xpaths = "//*[@id='js-leftnavi-genre-balloon']",
            unique = True,)),
        Rule(LinkExtractor(
            allow=r"/\w+/A\d{4}/rstLst/" + "{}".format(targetCategory) + r"/\d*/$",
            restrict_xpaths = "//*[@id='container']/div[15]/div[4]/div/div[7]/div/ul",
            unique = True,)),
        Rule(LinkExtractor(
            allow=r"/\w+/A\d{4}/A\d{6}/\d+/$",
            restrict_xpaths = "//*[@id='container']/div[15]/div[4]/div/div[6]",
            unique = True,
        ), callback="page_parse"),
    )

    def start_requests(self):
        #Target Category
        with open('CategoryList.txt') as f1:
            for q1 in f1:
                targetCategory = q1.strip()

                #Target Prefecture
                with open('prefectureList.txt') as f2:
                    for q2 in f2:
                        prefectureName = q2.strip()
                        url = "https://example.com/" + prefectureName + "/"
                        #no explicit callback, so CrawlSpider's default
                        #parse applies the rules declared above
                        yield scrapy.Request(url)


    def page_parse(self, response):
        yield Page.from_response(response)

I haven't tested this, but it should work. (Edit: you also need to update the rules so they no longer reference the targetCategory variable. You can either write those rules statically, or build them in the __init__ method, as sketched below.)
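
For the __init__ route, here is a minimal sketch of one way to do it, assuming CategoryList.txt holds one category name per line; folding every category into a single regex alternation is just one option, and start_requests stays the same as above:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class StorelistSpider(CrawlSpider):
    name = "crawler"

    allowed_domains = ["example.com"]

    def __init__(self, *args, **kwargs):
        #rules apply to the whole spider, so build one pattern that
        #matches every category listed in the file
        with open('CategoryList.txt') as f:
            categories = "|".join(line.strip() for line in f if line.strip())

        self.rules = (
            Rule(LinkExtractor(
                allow=r"/\w+/A\d{4}/$",
                restrict_xpaths="//*[@id='js-leftnavi-area-scroll']",
                unique=True)),
            Rule(LinkExtractor(
                allow=r"/\w+/A\d{4}/rstLst/(?:" + categories + r")/$",
                restrict_xpaths="//*[@id='js-leftnavi-genre-balloon']",
                unique=True)),
            #... the remaining two rules from the example above, with
            #targetCategory replaced by the same (?:...) pattern ...
        )
        #CrawlSpider.__init__ compiles self.rules, so set them first
        super().__init__(*args, **kwargs)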
