我按下 End 键,使页面滚动到末尾,然后一个接一个地按 Up 键,直到找到目标元素(见原帖截图)。
这段代码以前运行良好,但现在似乎不再起作用了。
# Fragment quoted from the question: open a listing page, jump to the
# bottom with END, then press UP once per second until a lazily loaded
# details <div> has text.
# NOTE(review): `options`, `features`, `webdriver`, `Keys` and `time`
# come from the surrounding script; `features["ad_url"]` is never
# populated before this point in the full code below — TODO confirm.
options.add_argument('window-size=1200x600')
# Block geolocation (2 = deny) and notification prompts so no popup
# steals focus from the <body> element receiving the key presses.
prefs = {}
prefs = {"profile.default_content_setting_values.geolocation": 2, "profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
d = webdriver.Chrome(chrome_options=options,
executable_path='./chromedriver')
d.get(features["ad_url"])
# Use send_keys(Keys.HOME) to scroll up to the top of page
d.find_element_by_tag_name('body').send_keys(
Keys.END)
# Keep scrolling up until the details container is rendered with text;
# find_element_by_xpath raises if the node is absent, which ends the loop
# with an exception rather than a break — presumably the reported failure.
while True:
d.find_element_by_tag_name('body').send_keys(
Keys.UP)
time.sleep(1)
e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
if e.text:
break
下面是一段功能完整的代码供您尝试:
import json
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from datetime import datetime
from selenium.webdriver.common.keys import Keys
import pickle
import time
class Myspider(SitemapSpider):
    """Crawl arabam.com car-listing sitemaps and scrape each listing
    newer than 2010 with a Selenium-driven Chrome instance.

    Results are exported to a dated CSV via the FEED settings.
    """
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml','https://www.arabam.com/sitemap/otomobil_2.xml',
                    'https://www.arabam.com/sitemap/otomobil_3.xml','https://www.arabam.com/sitemap/otomobil_4.xml',
                    'https://www.arabam.com/sitemap/otomobil_5.xml','https://www.arabam.com/sitemap/otomobil_6.xml',
                    'https://www.arabam.com/sitemap/otomobil_7.xml','https://www.arabam.com/sitemap/otomobil_8.xml',
                    'https://www.arabam.com/sitemap/otomobil_9.xml','https://www.arabam.com/sitemap/otomobil_10.xml',
                    'https://www.arabam.com/sitemap/otomobil_11.xml','https://www.arabam.com/sitemap/otomobil_12.xml',
                    'https://www.arabam.com/sitemap/otomobil_13.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    custom_settings = {'FEED_FORMAT':'csv','FEED_URI': "arabam_"+str(datetime.today().strftime('%d%m%y'))+'.csv'
    }
    # Bug fix: `parse` used self.crawled / self.new_links without ever
    # initializing them, which raised AttributeError on the first item.
    crawled = []
    new_links = 0

    def parse(self, response):
        """Extract listing links from a sitemap result page and request
        the detail page for every car newer than 2010."""
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            link = td.xpath("@href").extract()
            year = td.xpath("text()").extract()
            self.crawled.append(link[0])
            self.new_links += 1
            if int(year[0]) > 2010:
                url = "https://www.arabam.com/" + link[0]
                yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Open the listing in Chrome, scroll until the lazily rendered
        details <div> has text, and yield the scraped feature dict."""
        features = {}
        # Bug fix: `features` was empty here, so features["ad_url"]
        # raised KeyError — the page to load is simply the response URL.
        features["ad_url"] = response.url
        options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        options.add_argument('window-size=1200x600')
        # Deny geolocation (2) and notification prompts so no popup
        # steals focus from the <body> that receives the key presses.
        prefs = {"profile.default_content_setting_values.geolocation": 2, "profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        d = webdriver.Chrome(chrome_options=options,
                             executable_path='./chromedriver')
        try:
            d.get(features["ad_url"])
            # Jump to the bottom, then press UP until the details div
            # is rendered with text (it only loads inside the viewport).
            d.find_element_by_tag_name('body').send_keys(Keys.END)
            while True:
                d.find_element_by_tag_name('body').send_keys(Keys.UP)
                time.sleep(1)
                try:
                    e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
                except Exception:
                    # Not rendered yet — keep scrolling instead of
                    # crashing the whole request (the original did).
                    continue
                if e.text:
                    break
            # Bug fix: the split result was computed and discarded;
            # store it so the yielded item actually carries the data.
            features["overview"] = e.text.split("\n")
            yield features
        finally:
            # Bug fix: the driver was never closed, leaking one Chrome
            # process per scraped listing.
            d.quit()
# Run the spider in-process (no `scrapy crawl` CLI invocation needed);
# an empty settings dict means Myspider.custom_settings still applies.
process = CrawlerProcess({
})
process.crawl(Myspider)
process.start() # the script will block here until crawling is finished
编辑: 我给代码加了日志后重新运行,结果显示按键确实被发送了。问题出在查找那个特定的 div 上。我试着给它加上 try/except,但似乎也不起作用。
即使这样:
d.find_element_by_tag_name('body').send_keys(
Keys.UP)
time.sleep(1)
try:
e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
if e.text:
break
except:
pass
编辑:
这就是我向上滚动所做的。但不幸的是,这在大多数情况下并不适用
# Fragment: press UP a fixed 37 times (one per second) to force the lazy
# content into the viewport, then read the details table.
# NOTE(review): the hard-coded 37 is fragile — it assumes a particular
# page height; the author reports it fails in most cases. TODO confirm.
for i in range(0,37):
d.find_element_by_tag_name('body').send_keys(
Keys.UP)
time.sleep(1)
e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div[2]/div")
overview1 = e.text.split("\n")
编辑: 也试过下面这种写法。它能把元素滚动到视口内,但还是取不到元素的文本。
# Fragment: use an explicit wait for the technical-properties table.
# Bug fix (matches the advice in the answer below): the original located
# the element and scrolled to it BEFORE waiting for visibility, so `e`
# could be a stale/empty node. Wait first, and use the element that
# `wait.until` returns.
wait = WebDriverWait(d, 20)
e = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id = 'js-hook-appendable-technicalPropertiesWrapper' and @class = 'cf' ]")))
# Scroll the now-visible table into the viewport so its values render.
actions = ActionChains(d)
actions.move_to_element(e).perform()
overview1 = e.text.split("\n")
编辑: 相关 HTML 结构见原帖截图。
添加作为答案,因为这是一个有点冗长的评论
首先,您需要等待元素出现。然后找到元素并提取值。从代码中,元素查找在可见性检查之前完成。 您可以尝试的另一件事是在提取值之前滚动到特定元素。此特定表似乎仅在视口中加载值
相关问题 更多 >
编程相关推荐