使用Python抓取Trulia网页数据

from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
from selenium.webdriver import ActionChains

# One Firefox session, created at import time and shared by every lookup.
driver = webdriver.Firefox(executable_path='C:\\Users\\Downloads\\geckodriver-v0.24.0-win64\\geckodriver.exe')


def get_trulia_estimate(address):
    """Search trulia.com for *address* and print the matched estimate element.

    Opens the Trulia home page, types the address into the search box,
    clicks the search button, waits three seconds, then parses the page
    source with BeautifulSoup and prints the first ``<div>`` carrying the
    expected class (``None`` is printed when no such element exists).

    Args:
        address: Free-form street address typed into the search box.
    """
    driver.get('https://www.trulia.com/')
    print(address)

    # Locator for the home page search input.
    element = (By.ID, 'homepageSearchBoxTextInput')
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).send_keys(address)

    search_button = (By.CSS_SELECTOR, "button[data-auto-test-id='searchButton']")
    WebDriverWait(driver, 50).until(EC.element_to_be_clickable(search_button)).click()

    # Fixed pause before the page source is read.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # attrs must be a dict; the original ``{'class', '...'}`` was a set literal.
    # NOTE(review): this class name looks auto-generated and may change
    # between site builds -- confirm before relying on it.
    results = soup.find('div', {'class': 'Text__TextBase-sc-1cait9d-0 OmRik'})
    print(results)


get_trulia_estimate('693 Bluebird Canyon Drive, Laguna Beach, CA 92651')

3条回答

网友

1楼 · 编辑于 2024-05-15 11:53:30

使用beautifulsoup的版本：

from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Search endpoint; the placeholder receives the percent-encoded address.
url = 'https://www.trulia.com/json/search/location/?query={}&searchType=for_sale'
search_string = '693 Bluebird Canyon Drive, Laguna Beach, CA 92651'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}

# Percent-encode the address so characters such as '&', '#' or '/' inside it
# cannot change the structure of the query string.
d = requests.get(url.format(quote(search_string, safe='')), headers=headers).json()

# The JSON response carries the site-relative URL of the matched property.
property_url = 'https://www.trulia.com' + d['url']

soup = BeautifulSoup(requests.get(property_url, headers=headers).text, 'lxml')

# <h3> whose immediately following sibling <div> contains a <span> with the
# text "Trulia Estimate".
estimate_tag = soup.select_one('h3:has(+div span:contains("Trulia Estimate"))')

# select_one() returns None when nothing matches; report that instead of
# failing with AttributeError on ``.text``.
if estimate_tag is None:
    print('Trulia Estimate not found')
else:
    print(estimate_tag.text)

输出：

$1,735,031

CSS选择器h3:has(+div span:contains("Trulia Estimate"))会查找这样的<h3>标签：紧跟其后的同级<div>中包含一个带有字符串“Trulia Estimate”的<span>。

进一步阅读：

CSS Selectors Reference

网友

2楼 · 编辑于 2024-05-15 11:53:30

看起来CSS类名每次都是重新生成的，因此不适合用它来定位元素

我建议对此使用XPATH

使用.text获取文本

您可能希望改为定位包含价格的父元素……因此，请使用(//div[@aria-label="Price trends are based on the Trulia Estimate"])[1]//../h3/div作为XPath

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
from selenium.webdriver import ActionChains

# One Firefox session, created at import time and shared by every lookup.
driver = webdriver.Firefox(executable_path = 'geckodriver.exe')


def get_trulia_estimate(address):
    """Search trulia.com for *address* and print the located element's text.

    Opens the Trulia home page, types the address into the search box,
    clicks the search button, waits three seconds, then prints the text of
    the first element whose ``aria-label`` is
    "Price trends are based on the Trulia Estimate".

    Args:
        address: Free-form street address typed into the search box.
    """
    driver.get('https://www.trulia.com/')
    print(address)

    # Locator for the home page search input.
    element = (By.ID, 'homepageSearchBoxTextInput')
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).send_keys(address)

    search_button = (By.CSS_SELECTOR, "button[data-auto-test-id='searchButton']")
    WebDriverWait(driver, 50).until(EC.element_to_be_clickable(search_button)).click()

    # Fixed pause before the element is looked up.
    time.sleep(3)

    # The aria-label is used as the anchor instead of a CSS class name.
    find_trulia_estimate_text = driver.find_element(
        By.XPATH,
        '(//div[@aria-label="Price trends are based on the Trulia Estimate"])[1]',
    ).text
    print(find_trulia_estimate_text)


get_trulia_estimate('693 Bluebird Canyon Drive, Laguna Beach, CA 92651')

693 Bluebird Canyon Drive, Laguna Beach, CA 92651 Trulia Estimate

如果使用价格的xpath，则输出为：

693 Bluebird Canyon Drive, Laguna Beach, CA 92651 $1,735,031

网友

3楼 · 编辑于 2024-05-15 11:53:30

如果你想尝试不使用BeautifulSoup的方式

     # Fragment: expects a Selenium ``driver`` that has already loaded the
     # property page.
     estimate = None  # remains None when the page shows no estimate

     # find_elements_* returns an empty list when nothing matches, so a
     # missing label does not raise NoSuchElementException.
     labels = driver.find_elements_by_xpath("//span[contains(text(),'Trulia Estimate')]")
     if labels and labels[0].is_displayed():
         # The original expression ended in a stray "')]", which made the
         # XPath invalid.
         estimate = driver.find_element_by_xpath("//div[@data-testid='home-details-summary']//h3/div").text
     else:
         print("Estimate is not found")

     print(estimate)

相关问题更多 >

编程相关推荐

热门问题

热门文章