Comparing the records of the last created file with the next file: Python Scraping BS4

Posted 2024-04-19 17:00:48


I am scraping this platform with Selenium and BS4, and I am able to retrieve all the information I want. The problem is that the platform contains sensor readings, and a few of the sensors are not updated every day, so I want to compare the records of the last scraped file with the next file to be created.

For example:

Today I created the file xyz.csv with 200 records. Tomorrow I will run the code again to collect the information, but before creating the new file I want to check it for duplicates against the last created file. The job will be scheduled to run daily, so checking only the most recently created file seems good enough to me, but I am open to suggestions.
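My rough idea is something along these lines (an untested sketch: it assumes every run writes its CSV into the same directory, that all files share the same header, and drop_rows_seen_last_time / download_dir are names I made up):

import glob
import os
import pandas as pd

def drop_rows_seen_last_time(new_df, download_dir="."):
    # Find the most recently modified CSV left over from earlier runs
    previous = glob.glob(os.path.join(download_dir, "*.csv"))
    if not previous:
        return new_df  # first run: nothing to compare against
    latest = max(previous, key=os.path.getmtime)
    old_df = pd.read_csv(latest, dtype=str)  # read as strings to match scraped values
    # Left merge on all columns; the indicator marks rows also present last time
    merged = new_df.merge(old_df, how="left", indicator=True)
    return merged[merged["_merge"] == "left_only"].drop(columns="_merge")

The indicator column marks each new row as left_only (not in the last file) or both (a duplicate), so only genuinely new records would survive.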

I use the following code to scrape the information:

# -*- coding: utf-8 -*-

from selenium.webdriver.firefox.options import Options
from selenium import webdriver
import time
import os
import shutil
from bs4 import BeautifulSoup
import uuid
import csv
import dateutil.parser as parser
import pandas as pd
import re
from datetime import datetime as dt
from time import gmtime, strftime, localtime
from selenium.webdriver.support.select import Select

class crawlHydro():
   
    def __init__(self):
        print("hurray33")
        # Globals shared with crawl(): output directory and a unique
        # per-run CSV file name
        global downloadDir
        global uFileName
        global filname
        downloadDir = ""
        uFileName = str(uuid.uuid4())
        filname = downloadDir + uFileName + ".csv"
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)
        # Set Firefox preferences for headless crawling
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", downloadDir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "attachment/csv")
        options = Options()
        options.add_argument("--headless")
        # Initialize webdriver and target URL
        self.driver = webdriver.Firefox(firefox_profile=fp, firefox_options=options)
        #self.driver = webdriver.Firefox()
        print("hurray")
        self.driver.implicitly_wait(15)
        self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
        self.verificationErrors = []
        self.accept_next_alert = True

    def crawl(self):
        print("see")
        driver = self.driver
        # Finds elements available on the target page for interaction/action
        driver.execute_script("window.scrollTo(0, 800)")
        driver.find_element_by_id("dijit_MenuItem_3_text").click()
        driver.find_element_by_xpath('//td[.="All"]').click()
        driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()
        soup = BeautifulSoup(driver.page_source, 'html.parser') #Extract page content using BS4
        headers = []
        valueArr = 'LastReadTime'
        for m in soup.find_all("th"):
            headers.append(m.get_text())
        headers.append(valueArr)
        #print(headers)
        # Extract each table's cell text as one row; note .rstrip(" km²") strips
        # trailing characters from the set " km²", not only the literal suffix
        new_data = [[c.text.rstrip(" km²") for c in i.find_all('td')] for i in soup.find_all('table')[5::]]
        new_data = [[dt.strptime(i, '%d-%m-%Y %H:%M').strftime('%d-%m-%YT%H:%M+00') if re.match(r"\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}", i) else i for i in m] for m in new_data]  # Convert timestamps to ISO 8601
        timerecorded = strftime("%Y-%m-%dT%H:%M+00", gmtime())
        value = timerecorded
        # Drop the trailing four rows, which are not needed
        finalDataList = new_data[:-4]
        print(finalDataList)

        with open(filname, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(finalDataList)
        driver.close()
        driver.quit()
        # os.system('pkill firefox')
        # os.system('pkill plugin-container')
        # os.system('pkill geckodriver')
        
if __name__ == '__main__':
    obj = crawlHydro()
    obj.crawl()
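
Inside crawl() I would then replace the csv.writer block with something like this (again hypothetical, and assuming each scraped row has exactly as many fields as headers):

        new_df = pd.DataFrame(finalDataList, columns=headers)
        new_df = drop_rows_seen_last_time(new_df, downloadDir or ".")
        new_df.to_csv(filname, index=False)

But I am not sure this is the best approach.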

Can someone point me in the right direction?

