如何在使用deathbycaptcha服务处理Google reCAPTCHA v2时控制Scrapy中的请求流？

# Scrapy project settings that throttle the crawl to one request at a time.
# Each setting is on its own line: when fused onto a single line, the inline
# comments swallowed the AUTOTHROTTLE_START_DELAY / AUTOTHROTTLE_MAX_DELAY
# assignments that followed them.
CONCURRENT_REQUESTS = 1
DEPTH_LIMIT = 1
DOWNLOAD_DELAY = 30
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
DOWNLOAD_TIMEOUT = 240
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60

import sys
import os
sys.path.append(r'F:\Documents\ScrapyDirectory\scrapername\scrapername\spiders')
import deathbycaptcha
import json
import scrapy
import requests
from datetime import datetime
import math
import urllib
import time
from scrapy_splash import SplashRequest
from threading import Timer
# Aliased: a plain `from timeit import Timer` rebinds the name `Timer` and
# shadows threading.Timer, so `Timer(240.0, callback)` in parse() built a
# timeit.Timer and raised "ValueError: stmt is neither a string nor callable".
from timeit import Timer as TimeitTimer

# Module-level state shared through the `global` statements below.  Both names
# are read by the visible code before any visible assignment, so they are
# initialised here.
# NOTE(review): the elided URL-building code may also assign them -- confirm.
scrapeUrlList = []
captchaIsRunning = False


class scrapername(scrapy.Spider):
    """Spider that crawls generated result URLs and solves the site's reCAPTCHA.

    When a response lands on an ``InternalCaptcha`` URL, ``parse`` asks the
    deathbycaptcha service for a token and submits it with a ``FormRequest``.
    The module-level ``captchaIsRunning`` flag keeps later captcha responses
    from starting another solve while one is in flight.
    """

    name = "scrapername"
    start_urls = []
    global scrapeUrlList
    global charCompStorage
    global captchaIsRunning

    # Runs once, at class-definition (import) time: fetch the JSON feed that
    # the start URLs are built from.
    r = requests.get('http://example.com/examplejsonfeed.php')
    myObject = json.loads(r.text)
    #print("Loading names...")
    for o in myObject['objects']:
        # a huge function for creating basically a lot of objects and appending
        # links created from these objects to the scrapeUrlList function
        ...  # body elided in the original post

    print(len(scrapeUrlList))
    for url in scrapeUrlList:
        # add all those urls that just got created to the start_urls list
        start_urls.append(url[1])

    link_collection = []

    def resetCaptchaInformation(self):
        """Clear the captcha-in-progress flag; used as the Timer callback.

        Takes ``self`` because it is passed as the bound method
        ``self.resetCaptchaInformation``; without the parameter the call made
        by the timer would fail with a TypeError.
        """
        global captchaIsRunning
        if captchaIsRunning:
            captchaIsRunning = False

    def afterCaptchaSubmit(self, response):
        """Callback for the captcha form submission; clears the flag."""
        global captchaIsRunning
        print("Captcha submitted: " + response.request.url)
        captchaIsRunning = False

    def parse(self, response):
        """Handle a crawled page, solving the captcha when redirected to it.

        Yields a ``scrapy.FormRequest`` carrying the solved token when the
        response URL contains ``InternalCaptcha`` and no solve is in progress.
        Otherwise yields nothing.
        """
        global captchaIsRunning
        self.logger.info("got response %s for %r" % (response.status, response.url))

        if "InternalCaptcha" in response.request.url:
            # Captcha page: start the solver unless one is already running, so
            # that every new captcha response does not start it again.
            if not captchaIsRunning:
                if "captchasubmit" in response.request.url:
                    print("Found captcha submit in url")
                else:
                    print("Internal Captcha is activated")
                    captchaIsRunning = True
                    # threading.Timer: clear the flag after 240 s even if the
                    # submit callback never runs.
                    t = Timer(240.0, self.resetCaptchaInformation)
                    t.start()

                    username = "username"
                    password = "password"
                    print("Set username and password")
                    Captcha_dict = {
                        'googlekey': '6LcMUhgUAAAAAPn2MfvqN9KYxj7KVut-oCG2oCoK',
                        'pageurl': response.request.url}
                    print("Created catpcha dict")
                    json_Captcha = json.dumps(Captcha_dict)
                    print("json.dumps on captcha dict:")
                    print(json_Captcha)
                    client = deathbycaptcha.SocketClient(username, password)
                    print("Set up client with deathbycaptcha socket client")
                    try:
                        print("Trying to solve captcha")
                        balance = client.get_balance()
                        print("Remaining Balance: " + str(balance))
                        # Put your CAPTCHA type and Json payload here:
                        captcha = client.decode(type=4, token_params=json_Captcha)
                        if captcha:
                            # The CAPTCHA was solved; captcha["captcha"] item holds its
                            # numeric ID, and captcha["text"] item its a text token".
                            print("CAPTCHA %s solved: %s" % (captcha["captcha"], captcha["text"]))
                            data = {
                                'g-recaptcha-response': captcha["text"],
                            }
                            try:
                                dest = response.xpath("/html/body/form/@action").extract_first()
                                # Raises TypeError when no form action was found
                                # (dest is None); handled below.
                                print("Form URL: " + dest)
                                submitURL = "https://exampleaddress.com" + dest
                                yield scrapy.FormRequest(url=submitURL, formdata=data,
                                                         callback=self.afterCaptchaSubmit,
                                                         dont_filter=True)
                                print("Yielded form request")
                                # NOTE(review): '' is always falsy, so the report
                                # below never runs; replace with a real check.
                                if '':
                                    # check if the CAPTCHA was incorrectly solved
                                    client.report(captcha["captcha"])
                            except TypeError:
                                sys.exit()
                    except deathbycaptcha.AccessDeniedException:
                        # Access to DBC API denied, check your credentials and/or balance
                        print("error: Access to DBC API denied, check your credentials and/or balance")
            else:
                pass
        else:
            print("no Captcha")
            # this will run if no captcha is on the page that the redirect landed on
            # and basically parses all the information on the page

2018-07-19 14:10:35 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dThomas%2520Garrett%26citystatezip%3dLas%2520Vegas%2c%2520Nv> from <GET https://www.exampleaddress.com/results?name=Thomas%20Garrett&citystatezip=Las%20Vegas,%20Nv> 2018-07-19 14:10:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dThomas%2520Garrett%26citystatezip%3dLas%2520Vegas%2c%2520Nv> (referer: None) 2018-07-19 14:10:49 [scrapername] INFO: got response 200 for 'https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dThomas%2520Garrett%26citystatezip%3dLas%2520Vegas%2c%2520Nv' Internal Captcha is activated 2018-07-19 14:10:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dThomas%2520Garrett%26citystatezip%3dLas%2520Vegas%2c%2520Nv> (referer: None) Traceback (most recent call last): File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback yield next(it) File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy_splash\middleware.py", line 156, in process_spider_output for el in result: File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output for x in result: File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr> return (_set_referer(r) for r in result or ()) File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr> return (r for r in result or () if _filter(r)) File "F:\Program Files (x86)\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr> return (r for r in result or () if _filter(r)) File 
"F:\Documents\ScrapyDirectory\scraperName\scraperName\spiders\scraperName- Copy.py", line 232, in parse t = Timer(240.0, self.resetCaptchaInformation) File "F:\Program Files (x86)\Anaconda3\lib\timeit.py", line 130, in __init__ raise ValueError("stmt is neither a string nor callable") ValueError: stmt is neither a string nor callable 2018-07-19 14:10:53 [scrapy.extensions.logstats] INFO: Crawled 63 pages (at 2 pages/min), scraped 13 items (at 0 items/min) 2018-07-19 14:11:02 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dSamuel%2520Van%2520Cleave%26citystatezip%3dLas%2520Vegas%2c%2520Nv> from <GET https://www.exampleaddress.com/results?name=Samuel%20Van%20Cleave&citystatezip=Las%20Vegas,%20Nv> 2018-07-19 14:11:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dSamuel%2520Van%2520Cleave%26citystatezip%3dLas%2520Vegas%2c%2520Nv> (referer: None) 2018-07-19 14:11:13 [scrapername] INFO: got response 200 for 'https://www.exampleaddress.com/InternalCaptcha?returnUrl=%2fresults%3fname%3dSamuel%2520Van%2520Cleave%26citystatezip%3dLas%2520Vegas%2c%2520Nv' #and then an endless supply of 302 redirects, and 200 response for their crawl #nothing happens, because the Timer failed, the captcha never solved? #I'm not sure what is going wrong with it, hence the issues I am having

1条回答

网友
1楼 · 发布于 2024-04-26 12:38:31

我的经验还不足以完全解决您的问题，但我会先尝试使用 RetryMiddleware：
也许只要把 302 加入需要重试的 HTTP 状态码列表就足够了：
RETRY_HTTP_CODES（默认值：[500, 502, 503, 504, 408]）
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#retry-http-codes

相关问题更多 >

编程相关推荐

热门问题

热门文章