from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain
class WebsiteSpider(Spider):
name = 'my_spider'
allowed_domains = ['example1.com','example2.com']
start_urls = ['https://www.example1.com','https://www.example2.com']
# refresh_urls = True # For debug. If efresh_urls = True, start_urls will be crawled again.
def extract(self, url, html, models, modelNames):
doc = SimplifiedDoc(html)
lstA = doc.listA(
url=url["url"]) # Get link data for subsequent crawling
data = [{"title": doc.title.text}] # Get target data
return {"Urls": lstA, "Data": data} # Return data to framework
SimplifiedMain.startThread(WebsiteSpider())
试试这个
相关问题 更多 >
编程相关推荐