How do I stop my crawler from recording duplicates?

Posted 2024-04-26 22:15:59


I would like to know how to stop my crawler from recording the same URL more than once.

Here is my current code:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            item = MyItem()
            item['url'] = link.url
            self.f.write(item['url'] + "\n")

Right now it records thousands of duplicates of the same link, for example on a vBulletin forum with 250,000 posts.

Edit: Note that the crawler will collect millions of links, so the duplicate check needs to be very fast.


1 Answer
User
#1 · Posted 2024-04-26 22:15:59

Keep a list of the URLs you have already visited and check every URL against it. After parsing a particular URL, add it to the list. Before visiting the page behind a newly found URL, check whether that URL is already in the list: if not, parse it and add it; otherwise skip it.

That is:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    items = []  # list of URLs that have already been written
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:  # skip URLs that were already handled
                self.items.append(link.url)  # remember the URL so it is not written again
                # write it to the file as before
                item = MyItem()
                item['url'] = link.url
                self.f.write(item['url'] + "\n")

Dictionary version:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    items = {}  # dictionary with the seen URLs as keys (lookup is much faster than a list)
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:  # skip URLs that were already handled
                self.items[link.url] = 1  # the stored value does not matter, only the key
                # write it to the file as before
                item = MyItem()
                item['url'] = link.url
                self.f.write(item['url'] + "\n")

Alternatively, you could collect the items first and only write them to the file afterwards.
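A minimal sketch of that idea, assuming the same spider as above (the seen_urls name is only illustrative): collect the URLs in a Python set, which keeps membership checks fast even with millions of entries, and write them all out once in the spider's closed() callback instead of writing line by line.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    seen_urls = set()  # a set gives O(1) average-time membership checks

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            # adding to a set silently ignores URLs that are already present
            self.seen_urls.add(link.url)

    def closed(self, reason):
        # called once when the spider finishes; write everything in one go
        # (the order of the URLs is not preserved)
        with open("items.txt", "w") as f:
            f.write("\n".join(self.seen_urls) + "\n")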

There are many other improvements that could be made to this code, but I'll leave those for you to explore.
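One such improvement, sketched here only as a possibility (the DedupPipeline name and the module path are placeholders): move the duplicate check out of the spider into a Scrapy item pipeline that raises scrapy.exceptions.DropItem for URLs it has already seen, and have parse_obj yield the item instead of writing to the file directly.

from scrapy.exceptions import DropItem

class DedupPipeline(object):
    """Item pipeline that drops items whose URL has been seen before."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['url'] in self.seen:
            # discarding the item; Scrapy logs dropped items for you
            raise DropItem("Duplicate url: %s" % item['url'])
        self.seen.add(item['url'])
        return item

It would then be enabled in settings.py with something like ITEM_PIPELINES = {'myproject.pipelines.DedupPipeline': 300}, so the spider callback stays free of bookkeeping.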
