Extracting data from AJAX

I am trying to extract data (the title, price, and description) from an AJAX endpoint, but I cannot get it to work even after changing the User-Agent.

Link: https://scrapingclub.com/exercise/detail_header/
AJAX endpoint (the data to extract): https://scrapingclub.com/exercise/ajaxdetail_header/

import scrapy


class UseragentSpider(scrapy.Spider):
    name = 'useragent'
    allowed_domains = ['scrapingclub.com']  # domain only, without the URL path
    start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"


    def parse(self, response):
        cards = response.xpath("//div[@class='card-body']")
        for card in cards:
            # extract the heading text, not the selector object
            title = card.xpath(".//h3/text()").get()
            yield {'title': title}

Error log:

2020-09-07 20:34:39 [scrapy.core.engine] INFO: Spider opened
2020-09-07 20:34:39 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-07 20:34:39 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://scrapingclub.com/exercise/ajaxdetail_header/> (referer: None)
2020-09-07 20:34:40 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://scrapingclub.com/exercise/ajaxdetail_header/>: HTTP status code is not handled or not allowed

1 Answer

An AJAX request should send the header

 'X-Requested-With': 'XMLHttpRequest'

Not all servers check for it, but this server does. It does not, however, check the User-Agent.

The server sends the data as JSON, so XPath is useless here.
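
To see the difference quickly, here is a minimal sketch; it only assumes what the log above already shows (the request is rejected with 403 without the header):

import requests

url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'

# without the AJAX header the server rejects the request (403, as in the log above)
print(requests.get(url).status_code)

# with the header the server accepts the request and returns JSON
print(requests.get(url, headers={'X-Requested-With': 'XMLHttpRequest'}).status_code)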


I tested it with requests instead of Scrapy because that is simpler for me:

import requests

# the server checks this header, but not the User-Agent
headers = {
    #'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
}

url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'

response = requests.get(url, headers=headers)
data = response.json()

print(data)
print('type:', type(data))
print('keys:', data.keys())
print(' - manually  -')
print('price:', data['price'])
print('title:', data['title'])
print(' - for-loop  -')
for key, value in data.items():
    print('{}: {}'.format(key, value))

Result:

{'img_path': '/static/img/00959-A.jpg', 'price': '$24.99', 'description': 'Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.', 'title': 'Crinkled Flounced Blouse'}
type: <class 'dict'>
keys: dict_keys(['img_path', 'price', 'description', 'title'])
 - manually  -
price: $24.99
title: Crinkled Flounced Blouse
 - for-loop  -
img_path: /static/img/00959-A.jpg
price: $24.99
description: Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.
title: Crinkled Flounced Blouse

EDIT:

The same with Scrapy: I use the start_requests() method to create a Request() with the 'X-Requested-With' header.

You can put all the code in a single file and run it with python script.py, without creating a project.

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    def start_requests(self):
        url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'

        headers = {
            #'User-Agent': 'Mozilla/5.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

        yield scrapy.http.Request(url, headers=headers)
        
    def parse(self, response):
        print('url:', response.url)

        data = response.json()

        print(data)
        print('type:', type(data))
        print('keys:', data.keys())
        print(' - manually  -')
        print('price:', data['price'])
        print('title:', data['title'])
        print(' - for-loop  -')
        for key, value in data.items():
            print('{}: {}'.format(key, value))

#  - run without project and save in `output.csv`  -

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    #'FEED_FORMAT': 'csv',     # csv, json, xml
    #'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
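
A note on the commented-out FEED settings: they only write a file if parse() yields items instead of printing them. A minimal variant of the callback (a sketch; the same spider with only the yield added):

    def parse(self, response):
        data = response.json()
        # yielding the dict lets Scrapy's feed exporter write it to CSV/JSON/XML
        yield data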

EDIT:

The same using the DEFAULT_REQUEST_HEADERS setting:

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']

    def parse(self, response):
        print('url:', response.url)
        #print('headers:', response.request.headers)
        
        data = response.json()

        print(data)
        print('type:', type(data))
        print('keys:', data.keys())
        print(' - manually  -')
        print('price:', data['price'])
        print('title:', data['title'])
        print(' - for-loop  -')
        for key, value in data.items():
            print('{}: {}'.format(key, value))

#  - run without project and save in `output.csv`  -

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    'DEFAULT_REQUEST_HEADERS': {
        #'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
    },
    # save in file CSV, JSON or XML
    #'FEED_FORMAT': 'csv',     # csv, json, xml
    #'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
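
In newer Scrapy releases (2.1+) the FEED_FORMAT/FEED_URI pair is deprecated in favor of the FEEDS setting; an equivalent configuration (a sketch, assuming such a version) would be:

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'DEFAULT_REQUEST_HEADERS': {
        'X-Requested-With': 'XMLHttpRequest',
    },
    # FEEDS replaces FEED_FORMAT/FEED_URI in Scrapy 2.1+
    'FEEDS': {
        'output.csv': {'format': 'csv'},
    },
})
c.crawl(MySpider)
c.start()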
