How can I get Selenium to click a link in a new tab, parse the HTML, and return a specific span tag?


I'm building a Craigslist scraper that scrapes the title, price, date, and URL of each post and exports that information to a CSV. Now I want Selenium to click each post URL, navigate to the actual listing page, parse that page for the "odometer" span tag (to get the mileage), and write it back to the CSV file.

Here is the code I have so far:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
#import schedule

from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd




class CraigslistScraper(object):
    def __init__(self,query,location,max_price,transmission): 
        self.query = query
#        self.sort=sort
        self.location = location
#        self.postal = postal
        self.max_price = max_price
        self.transmission = transmission


#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
        self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
        self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
        self.delay = 5

    def load_craigslist_url(self): 
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID,"searchform")))              
            print("page is ready")
        except TimeoutException:
            print('Loading took too much time')

#extracting the post information such as titles, dates, and prices    
    def extract_post_information(self): 
        all_posts = self.driver.find_elements_by_class_name('result-row')
        titles = []
        dates = []
        prices = []

        post_info_list = []

        for i in range(len(all_posts)): 
            post = all_posts[i]
            title = post.text.split('$')

            if title[0] == '':
                title = title[1]
            else:
                title = title[0]

            title = title.split("\n")
            price = title[0]
            title = title[-1]

            title = title.split(' ')
            month = title[0]
            day = title[1]
            date = month + " " + day
            title = ' '.join(title[2:])

            #print('PRICE: ' + (price))

            #print('TITLE: ' + (title))
            #print('DATE: ' + date)  

            lst = [price, title, date]
            post_info_list.append(lst)

        #f=open("miata_prices.csv", "a+")
        #f.write(post_info_list)

        #print(post_info_list)

        #df = pd.DataFrame(post_info_list)
        #df.to_csv('miata_prices.csv', index=False, header=False)
        print(post_info_list)
        return post_info_list        

    def save_post_info_and_urls_to_csv(self, post_info, post_urls):
        for i in range(len(post_info)):
            post_info[i].append(post_urls[i])
        #print(post_info)
        df = pd.DataFrame(post_info)
        df.to_csv('miata_prices.csv', index=False, header=False)
        return post_info

#extracting post URLs    
    def extract_post_urls(self):
        url_list = []
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        # earlier experiments that clicked links directly; they navigate the driver
        # away from the results page, so they are left commented out
        #aTagsInLi = self.driver.find_elements_by_css_selector('li a')
        #self.driver.find_elements_by_css_selector('li a')[0].click()
        #for a in aTagsInLi:
        #    print(a.get_attribute('href'))
        #link = self.driver.find_element_by_link_text('Miata')
        #link.click()
        # collect the href of every post-title link on the results page
        for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
            url_list.append(link.get('href'))
        return url_list
    #to click on URL Links and parse the HTML

    def click_next_page(self):
        # note: find_element_by_partial_link_text matches visible link text, not a
        # CSS class, so the original "result-title hdrlink" lookup never matched
        href = self.driver.find_element_by_css_selector('a.result-title.hdrlnk')
        href.click()




    def quit(self): 
        self.driver.close()

location = "sfbay" 
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"

scraper = CraigslistScraper(query, location, max_price, auto_transmission)

scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()

I managed to get everything into the CSV file, but I'm stuck on how to have Selenium open each link in a new tab, grab the odometer information, and then close the tab.

I'm using this to build a dataset and eventually do some analysis with it!


1 Answer

Here is an example of how to have Selenium open each link and grab the odometer information. I used a wrapper around Selenium (SeElements, from the elementium package) to keep the code shorter. I hope it shows how it works. So:

I open your link and collect all the links from the result titles into a list. Then I open each link and try to get the odometer information.

from elementium.drivers.se import SeElements
from selenium import webdriver


browser = webdriver.Chrome()

url = 'https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[@class="result-info"]/a', wait=True, ttl=5)
try:
    links = []
    for link in titles:
        links.append(link.attribute('href'))
    for link in links:
        print(link)
        browser.get(link)
        try:
            odometer = se.xpath('//span[contains(text(), "odometer")]',wait=True, ttl=2).text()
        except Exception:
            continue
        print(odometer)
except Exception as e:
    browser.quit()
    raise e
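
The example above loads each listing in the same browser window. If you specifically want the new-tab workflow from the question (open the post in its own tab, read the odometer, close the tab), below is a minimal sketch using plain Selenium window handles plus BeautifulSoup instead of the elementium wrapper. The chromedriver path, the empty post_urls list, and the commented CSV step are placeholders for whatever your scraper already produces, and it assumes the listing page shows the mileage in a span whose text contains "odometer":

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
post_urls = []            # e.g. the list returned by extract_post_urls()
odometers = []

for url in post_urls:
    # open the listing in a new tab and switch the driver to it
    driver.execute_script("window.open(arguments[0]);", url)
    driver.switch_to.window(driver.window_handles[-1])
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete")

    # look for a span whose text mentions "odometer" in the attributes block
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    odometer = None
    for span in soup.find_all('span'):
        if 'odometer' in span.get_text():
            odometer = span.get_text(strip=True)
            break
    odometers.append(odometer)

    # close the tab and switch back to the search-results window
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

# the odometer values can then be added to the rows you already export with pandas, e.g.
#df = pd.DataFrame(post_info)
#df['odometer'] = odometers
#df.to_csv('miata_prices.csv', index=False, header=False)

Whether you open a new tab or simply call driver.get() on each URL as the answer does is mostly a matter of preference; the window-handle bookkeeping is only needed if you want to keep the search-results tab open while visiting each post.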
