擅长:python、mysql、java
<p>解析每个页面后,还可以使用链接提取器来提取所有链接。</p>
<p>链接提取器会为您过滤链接。在本例中,链接提取器会排除属于允许域(allowed_domains)的链接,因此只会提取指向外部站点的链接。</p>
<pre><code>from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
    """Crawl ``someurl.com`` and collect only the *external* links it points to.

    The CrawlSpider rule follows every link within ``allowed_domains``;
    each downloaded page is handed to :meth:`parse_obj`, which extracts
    links whose domain is NOT in ``allowed_domains`` and yields one item
    per external URL.

    NOTE(review): the ``scrapy.contrib`` import paths used above are
    deprecated (removed in Scrapy 1.x) — modern code imports from
    ``scrapy.spiders`` and ``scrapy.linkextractors``.
    """

    name = 'crawltest'
    allowed_domains = ['someurl.com']
    start_urls = ['http://www.someurl.com/']
    # Follow every in-domain link; each fetched page goes to parse_obj.
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        """Yield one ``someItem`` per external link found on *response*.

        ``deny_domains`` (not ``deny``, which takes regex patterns) is the
        documented parameter for excluding whole domains, so only links
        pointing outside ``allowed_domains`` survive the filter.
        """
        for link in LxmlLinkExtractor(allow=(), deny_domains=self.allowed_domains).extract_links(response):
            item = someItem()
            item['url'] = link.url
            # BUG FIX: the original built the item but never yielded it,
            # so no items ever reached the item pipelines.
            yield item
</code></pre>