如何使我的网页爬取脚本更健壮？

def hw_santander_scrap(Amount, Duration):
    """Scrape the Santander loan simulator for each amount/duration pair.

    Opens the simulator in headless Chrome, types every in-range amount and,
    for each of them, every in-range duration, reads the quoted figures from
    the page and finally writes them to ``Santander_<day>_<month>_<year>.csv``.

    Args:
        Amount: Sequence of numbers. Values above 90.0 or below 3.0 are
            skipped. Each value is typed with three decimals
            (13.0 -> "13.000") and stored with the dot removed (13000.0).
        Duration: Sequence of values convertible with ``int``. Values above
            96 or below 12 (after conversion) are skipped.

    Returns:
        None. The result is the CSV file and a screenshot written to the
        current directory.

    Note:
        ``DT`` and ``pd`` are expected to be available at module level.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage',
                 '--start-maximized', 'window-size=10000x5000'):
        chrome_options.add_argument(flag)
    # Bound to ``driver`` so the imported ``webdriver`` module is not shadowed.
    # ``options=`` replaces the removed ``chrome_options=`` keyword, and the
    # driver executable is resolved from PATH instead of a positional path.
    driver = webdriver.Chrome(options=chrome_options)

    maintenant = DT.now()
    period = '{}_{}_{}'.format(maintenant.day, maintenant.month, maintenant.year)
    print('Start Scraping')

    # Santander accumulates all rows; project is the per-amount work frame.
    # Both start with one all-zero placeholder row, which is dropped at the
    # end by the ``TIN != 0`` filter if it was never filled in.
    placeholder = {
        'Project': "reforma vivienda",
        'Period': period,
        'Monthly repayment': [0],
        'TIN': [0],
        'TAE': [0],
        'Total repayment': [0],
        'Initial amount': [0],
        'Duration': [0]
    }
    Santander = pd.DataFrame(placeholder)
    project = pd.DataFrame(placeholder)

    url = 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0'
    Max_amount = 90.000
    Min_amount = 3.000
    Max_duration = 96
    Min_duration = 12

    def wait_for_ajax(timeout=30):
        # Block until the page reports no pending jQuery requests.
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script('return jQuery.active') == 0)

    def field(css):
        # Wait up to 10 s for an input to be present in the DOM.
        return WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css)))

    def span_text(css):
        return driver.find_element(By.CSS_SELECTOR, css).text

    try:
        driver.get(url)
        for Simulated_amount in Amount:
            if Simulated_amount > Max_amount or Simulated_amount < Min_amount:
                continue
            amount = field("#amount")
            amount.clear()
            amount.send_keys("{:.3f}".format(Simulated_amount))
            wait_for_ajax()

            for j, raw_duration in enumerate(Duration):
                Simulated_duration = int(raw_duration)
                if Simulated_duration > Max_duration or Simulated_duration < Min_duration:
                    continue
                term = field("#term")
                term.clear()
                term.send_keys("{}".format(Simulated_duration))
                term.send_keys(Keys.TAB)
                # NOTE(review): the original read the figures right after TAB.
                # The same jQuery-idle wait used after the amount field is
                # applied here so values are not read mid-refresh; confirm
                # the page refreshes through jQuery AJAX.
                wait_for_ajax()
                driver.save_screenshot('screenshot_santander.png')

                # TIN and TAE are cut from fixed character offsets of one label.
                rates = span_text('.r3 span')
                project.loc[j, 'Project'] = "reforma vivienda"
                project.loc[j, 'Initial amount'] = float("{:.3f}".format(Simulated_amount).replace('.', ''))
                project.loc[j, 'Duration'] = Simulated_duration
                project.loc[j, 'Period'] = str(maintenant.day) + '/' + str(maintenant.month) + '/' + str(maintenant.year)
                # Kept as text, as before: euro sign stripped, comma -> dot.
                project.loc[j, 'Monthly repayment'] = span_text('.r1 span').replace(' €', '').replace(',', '.')
                project.loc[j, 'TIN'] = float(rates[6: 10].replace(',', '.'))
                project.loc[j, 'TAE'] = float(rates[13: 17].replace(',', '.'))
                project.loc[j, 'Total repayment'] = float(span_text('.r7 span').replace(' €', '').replace('.', '').replace(',', '.'))

            # NOTE(review): the original formatting was lost; the rows are
            # appended once per amount, after all durations were simulated.
            # ``pd.concat`` replaces ``DataFrame.append`` (removed in pandas 2).
            Santander = pd.concat([Santander, project])
    finally:
        # Always release the browser, even when a wait times out.
        driver.quit()

    Santander = Santander.loc[Santander.TIN != 0, :]
    Santander.to_csv('Santander_{}.csv'.format(period), index=False)
    print('End Scraping')

2条回答

网友

1楼 · 编辑于 2024-04-26 21:02:20

这是我闪耀的时刻！

信息：

我目前正在开发一个金融数据聚合器，它也面临着同样的问题。

它从十几个网站收集数据，并将其组织成一个JSON对象，然后由Flask站点使用该对象来显示数据。

这些数据是从具有多个子目录的网站中获取的，这些子目录内容相似，但选择器各不相同。

您可以想象，对于selenium这样的框架，这会变得非常复杂，因此唯一的解决方案就是把它简化。

解决方案：

Simplicity is key（简单是关键），所以我删除了除BeautifulSoup和requests库之外的所有依赖项。

然后我为每个filter^[1]创建了三个类和一个函数

from bs4 import BeautifulSoup

class GET:
  """Namespace for element-extraction helpers."""

  @staticmethod
  def text(soup, selector, index = 0):
    """Return the stripped text of the ``index``-th match of ``selector``.

    Args:
      soup: Object exposing ``select(selector)`` that returns a sequence of
        elements with a ``text`` attribute.
      selector: Selector string passed to ``soup.select``.
      index: Position of the wanted match among the results.

    Returns:
      The match's text with surrounding whitespace removed, or ``None`` when
      there are not more than ``index`` matches.
    """
    selected = soup.select(selector)
    if len(selected) > index:
      return selected[index].text.strip()
    # Too few matches: report "missing" instead of raising IndexError.
    return None

class Parse:
  """Namespace for parsing shortcuts built on ``GET``."""

  @staticmethod
  def common(soup, selector):
    """Return the stripped text of the sixth match (index 5) of ``selector``.

    Delegates to ``GET.text``; the result is ``None`` when ``GET.text`` finds
    too few matches.
    """
    return GET.text(soup, selector, index = 5)

class Routes:
  """Route functions, one per filter type, looked up by name."""

  def main(self):
    """Collect ``name`` and ``title-data`` for the detected sub-directory.

    Reads ``self.is_dir_1``, ``self.is_dir_2`` and ``self.soup``. Returns a
    dict with both keys, or an empty dict when neither flag is set.
    """
    if self.is_dir_1:
      name = GET.text(self.soup, "div")
      title = Parse.common(self.soup, "p > div:nth-child(1)")
    elif self.is_dir_2:
      name = GET.text(self.soup, "p", index = 2)
      title = Parse.common(self.soup, "p > div:nth-child(5)")
    else:
      # Unknown sub-directory: nothing to extract.
      return {}
    return {"name": name, "title-data": title}

def filter_name(url: str, response: str, filter_type: str):
  """Run the ``Routes`` function named ``filter_type`` on a fetched page.

  Args:
    url: Address the page was fetched from; used to detect the sub-directory.
    response: HTML text of the page.
    filter_type: Name of the ``Routes`` attribute to call.

  Returns:
    Whatever the selected route returns, or ``{}`` when ``Routes`` has no
    attribute called ``filter_type``.
  """
  if hasattr(Routes, filter_type):
    return getattr(Routes, filter_type)(to_object({
      "is_dir_1": "/sub_dir_1/" in url,
      # Was a copy of the "/sub_dir_1/" test, which made the second branch
      # of the route unreachable.
      "is_dir_2": "/sub_dir_2/" in url,
      # Parse the ``response`` argument; the original referenced an
      # undefined name ``html``.
      "soup": BeautifulSoup(response, "lxml")
    }))
  return {}

我使用requests库发出获取数据的请求，然后将URL、响应文本和filter类型传递给filter_name函数。

然后在filter_name函数中，我使用filter_type参数将“soup”传递给目标route函数，然后选择每个元素并在那里获取数据。

然后在目标route函数中，我使用if条件来确定子目录，并将文本赋值给data对象。

完成所有这些之后，我返回data对象。

这个方法非常简单，并且让代码保持DRY（不重复），它甚至允许可选的key: value对。

以下是to_object助手类的代码：

class to_object:
  """Expose a dictionary's keys as attributes (``obj.key`` for ``d["key"]``)."""

  def __init__(self, dictionary):
    # Adopt the given dict itself as the instance namespace. No copy is made,
    # so later changes to either one are visible through the other.
    setattr(self, "__dict__", dictionary)

这会将字典转换为对象，因此不必总是编写：

self["soup"]

你会写：

self.soup

修复错误：

您确实需要标准化所使用的缩进类型，因为脚本会引发以下错误：

Traceback (most recent call last):
  File "", line 84
    Amount =   [13.000, 14.000, 15.000, 30.000, 45.000, 60.000]
    ^
IndentationError: unindent does not match any outer indentation level

注：

过滤器（filter）是指抓取不同网站的脚本；我的项目需要抓取多个网站才能获得所需的数据。
尽量多整理你的代码，整洁的代码更容易阅读和编写。

我希望这有帮助，祝你好运。

网友

2楼 · 编辑于 2024-04-26 21:02:20

这些数据来自XHR。因此，只需使用requests库POST您的值，并用json.loads解析响应即可。

使用浏览器的“网络”选项卡查看请求的具体内容。

信息：

解决方案：

修复错误：

注：

相关问题更多 >

编程相关推荐

热门问题

热门文章