我正在制作一个Craigslist刮板来刮取标题、价格、日期和网址,并将这些信息导出到CSV。现在,我希望Selenium单击post URL导航到实际页面,解析页面以获得span标记“Odometer”(获取里程数),并将其返回到CSV文件。在
以下是我目前为止的代码:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import schedule
from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd
class CraigslistScaper(object):
def __init__(self,query,location,max_price,transmission):
self.query = query
# self.sort=sort
self.location = location
# self.postal = postal
self.max_price = max_price
self.transmission = auto_transmission
#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
self.delay = 5
def load_craigslist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID,"searchform")))
print("page is ready")
except TimeoutError:
print('Loading took too much time')
#extracting the post information such as titles, dates, and prices
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name('result-row')
titles = []
dates = []
prices = []
post_info_list = []
for i in range(len(all_posts)):
post = all_posts[i]
title = post.text.split('$')
if title[0] == '':
title = title[1]
else:
title = title[0]
title = title.split("\n")
price = title[0]
title = title[-1]
title = title.split(' ')
month = title[0]
day = title[1]
date = month + " " + day
title = ' '.join(title[2:])
#print('PRICE: ' + (price))
#print('TITLE: ' + (title))
#print('DATE: ' + date)
lst = [price, title, date]
post_info_list.append(lst)
#f=open("miata_prices.csv", "a+")
#f.write(post_info_list)
#print(post_info_list)
#df = pd.DataFrame(post_info_list)
#df.to_csv('miata_prices.csv', index=False, header=False)
print(post_info_list)
return post_info_list
def save_post_info_and_urls_to_csv(self, post_info, post_urls):
for i in range(len(post_info)):
post_info[i].append(post_urls[i])
#print(post_info)
df = pd.DataFrame(post_info)
df.to_csv('miata_prices.csv', index=False, header=False)
return post_info
#extracting post URLs
def extract_post_urls(self):
url_list = []
soup = BeautifulSoup(self.driver.page_source,'html.parser')
aTagsInLi = self.driver.find_elements_by_css_selector('li a')
self.driver.find_elements_by_css_selector('li a')[0].click()
for a in aTagsInLi:
link = a.get_attribute('href')
print(link)
link = self.driver.find_element_by_link_text('Miata')
print(link)
link.click()
for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
#print(link.get('href'))
url_list.append(link.get('href'))
return url_list
#to click on URL Links and parse the HTML
def click_next_page(self):
href = driver.find_element_by_partial_link_text("result-title hdrlink")
extract_post_urls(url_list).click(href)
def quit(self):
self.driver.close()
location = "sfbay"
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"
scraper = CraigslistScaper(query,location,max_price,auto_transmission)
scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()
我设法把所有的东西都放到CSV文件中,但我却被困在如何让Selenium打开新选项卡中的每个链接,获取里程表信息,然后关闭选项卡。在
我用这个来构建一个数据集,并最终用它做一些分析!在
我有一个示例,如何让Selenium打开每个链接并获取里程表信息。我使用了Selenium的包装器(SeElements)来实现更少的代码。我希望你能知道它是怎么工作的。所以:
我打开你的链接,取消从标题到列表的所有链接。然后打开每个链接,尝试获取里程表信息。
相关问题 更多 >
编程相关推荐