Python脚本调度程序

2024-06-08 21:50:23 发布

您现在位置:Python中文网/ 问答频道 /正文

各位!! 现在我尝试每小时运行一次python解析器(web scraper)(最好每10分钟运行一次)。 我试过:

Windows scheduler (task scheduler)
pythonanywhere
heroku

Heroku: 我遇到了一个错误:

2020-11-23T09:05:02.707582+00:00 app[web.1]:   File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 73, in __init__
2020-11-23T09:05:02.707740+00:00 app[web.1]:     self.service.start()
2020-11-23T09:05:02.707741+00:00 app[web.1]:   File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/common/service.py", line 83, in start
2020-11-23T09:05:02.707894+00:00 app[web.1]:     os.path.basename(self.path), self.start_error_message)
2020-11-23T09:05:02.707952+00:00 app[web.1]: selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
2020-11-23T09:05:02.707953+00:00 app[web.1]: 
2020-11-23T09:05:02.811239+00:00 heroku[web.1]: Process exited with status 1
2020-11-23T09:05:02.851583+00:00 heroku[web.1]: State changed from starting to crashed
2020-11-23T11:01:39.422624+00:00 heroku[router]: at=error code=H10 desc="App crashed" method=GET path="/" host=zoho-parser.herokuapp.com request_id=2135d635-fa47-47e5-900f-71d19afe1032 fwd="188.64.166.28" dyno= connect= service= status=503 bytes= protocol=https
2020-11-23T11:01:40.985460+00:00 heroku[router]: at=error code=H10 desc="App crashed" method=GET path="/favicon.ico" host=zoho-parser.herokuapp.com request_id=806a1d26-3541-4cd5-ab46-d73288e3e589 fwd="188.64.166.28" dyno= connect= service= status=503 bytes= protocol=https

(我成功部署应用程序)

我的构建包: heroku/python, https://github.com/heroku/heroku-buildpack-google-chrome, https://buildpack-registry.s3.amazonaws.com/buildpacks/heroku/google-chrome.tgz, https://github.com/heroku/heroku-buildpack-chromedriver

配置变量: CHROMEDRIVER_PATH=/app/.chromedriver/bin/chromedriver GOOGLE_CHROME_BIN=/app/.apt/usr/bin/google-chrome

PythonAnywhere:

Traceback (most recent call last):
  File "/home/AndrewCreator/Parser.py", line 27, in <module>
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options = chromeOptions)
TypeError: __init__() got an unexpected keyword argument 'options'

在windows任务计划程序中,我成功创建了一个任务,脚本工作!:)

但只有当我在使用电脑(电脑开着)时才行 :(

我的平台:Windows 10

我的脚本:

Parser.py:

#Parser Zoho
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import yagmail
import os

from CheckLetter import CheckLetter
from SetValue import SetValue


# Zoho community scraper: opens the Zoho CRM community page in headless
# Chrome, checks every listed question against the "Zoho" trigger table and
# e-mails a notification for each question that SetValue reports as new.
url = 'https://help.zoho.com/portal/en/community/zoho-crm'
# Local chromedriver location, used as a fallback when the CHROMEDRIVER_PATH
# environment variable is not set.
CHROMEDRIVER_PATH = r'C:\Users\admin\AppData\Roaming\Python\Python38\site-packages\ChromeDriver\chromedriver.exe'
# NOTE(review): the mailbox password is hard-coded here; consider reading it
# from an environment variable instead of committing it with the script.
yag = yagmail.SMTP("arshostak@gmail.com", 'MyPassword')
to = "arshostak@gmail.com"
subject = "New question"
body = ["Someone asked about ", ". Here is the link:"]

# driver
chromeOptions = webdriver.ChromeOptions()
# The attribute is `binary_location`; the previous spelling silently created
# an unrelated attribute, so the configured Chrome binary was never used.
chrome_binary = os.environ.get("GOOGLE_CHROME_BIN")
if chrome_binary:
    chromeOptions.binary_location = chrome_binary
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--disable-dev-shm-usage")
chromeOptions.add_argument("--no-sandbox")
# Prefer the environment variable and fall back to the local path rather than
# passing the literal string 'None' when the variable is missing.
driver = webdriver.Chrome(
    executable_path=os.environ.get('CHROMEDRIVER_PATH', CHROMEDRIVER_PATH),
    options=chromeOptions,
)

isFind = False

try:
    driver.get(url)

    # Block (up to 10 s) until the topic list container is present in the DOM.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.TopicListLeftContainer__section")
        )
    )

    for q in driver.find_elements(By.CSS_SELECTOR, 'div.CommunityListItem__wrapper'):
        question = q.find_element(By.TAG_NAME, 'a')
        shortDiscription = q.find_element(By.CSS_SELECTOR, 'div.CommunityListItem__description')
        link = question.get_attribute("href")

        shortDiscriptionText = shortDiscription.text
        questionText = question.text
        # Title and description are matched together with all spaces removed.
        letter = str(questionText) + ". " + str(shortDiscriptionText)
        letter = letter.replace(" ", "")

        # SetValue returns the matched platform label, or False when nothing
        # matched or the link is already recorded.
        FountProgramm = SetValue(CheckLetter(letter, "Zoho"), questionText, link)
        if FountProgramm is not False:
            yag.send(to, subject, body[0] + FountProgramm + body[1] + "\n" + str(link))
            print(link)
            isFind = True
finally:
    # Always shut the browser down, even if scraping or mailing failed.
    driver.quit()

#Parser StackOverflow
import requests
from bs4 import BeautifulSoup

# StackOverflow scraper: downloads the public question list, checks every
# question title against the "StackOverflow" trigger table and e-mails a
# notification for each question that SetValue reports as new.
url = "https://stackoverflow.com/questions"

res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
questions = soup.select(".summary")

for que in questions:
    QustionName = que.select_one('.question-hyperlink').getText()
    QuestionLink = "https://stackoverflow.com/" + que.find('a').get('href')
    letter = QustionName.replace(" ", "").lower()

    # SetValue returns the matched platform label, or False when nothing
    # matched or the link is already recorded.
    FountProgramm = SetValue(CheckLetter(letter, "StackOverflow"), QustionName, QuestionLink)
    if FountProgramm is not False:
        # Send this question's own link; the earlier code reused the `link`
        # variable left over from the Zoho loop.
        yag.send(to, subject, body[0] + FountProgramm + body[1] + "\n" + str(QuestionLink))
        print(QuestionLink)
        isFind = True

# Report when neither scraper produced a notification.
if not isFind:
    print("False")

CheckLetter.py:

def CheckLetter(Letter, MainPlatform):
    """Return the label of the first trigger group fully contained in Letter.

    The trigger table comes from ``GetTriggers(MainPlatform)``. Each row is a
    list whose non-list entries set the current label and whose list entries
    are keyword groups. A group matches when every keyword in it occurs in
    ``Letter`` (case-insensitive substring test).

    Args:
        Letter: Text to search.
        MainPlatform: Platform name passed to ``GetTriggers``.

    Returns:
        The label (``str``) of the first matching group, or ``False`` when no
        group matches or the platform has no trigger table.
    """
    triggers = GetTriggers(MainPlatform)
    if not triggers:
        # GetTriggers returns False for an unknown platform; iterating over
        # that would raise TypeError.
        return False

    haystack = Letter.lower()
    for platform in triggers:
        for trigs in platform:
            if not isinstance(trigs, list):
                # A non-list entry is the label for the groups that follow.
                name = str(trigs)
                continue
            if all(trig.lower() in haystack for trig in trigs):
                return name
    return False

def GetTriggers(Platform):
    """Return the trigger table for ``Platform``, or ``False`` if unknown.

    Each row is ``[label, group, group, ...]`` where every group is a list of
    keywords. For "Zoho" each group holds a single keyword; for
    "StackOverflow" each group additionally contains "Zoho".
    """
    if Platform not in ("Zoho", "StackOverflow"):
        return False

    keywords_by_label = (
        ("Twilio", ("twillio", "twilio", "tvilio", "tvillio")),
        ("Quickbooks", ("Quickbooks", "Quick-books", "QBO")),
        ("Xero", ("Xero",)),
    )
    # StackOverflow groups carry one extra keyword on top of the Zoho ones.
    suffix = ["Zoho"] if Platform == "StackOverflow" else []

    return [
        [label] + [[keyword] + suffix for keyword in keywords]
        for label, keywords in keywords_by_label
    ]

SetValue.py:

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from pprint import pprint
from datetime import datetime

# Google Sheets connection, established once when this module is imported.
spreadsheetName = "Questions"
scope = [
    "https://spreadsheets.google.com/feeds",
    'https://www.googleapis.com/auth/spreadsheets',
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]
# Service-account credentials are read from creds.json in the working directory.
creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
client = gspread.authorize(creds)
sheet = client.open(spreadsheetName).sheet1

# Values read once at import time: all records, and the contents of column 4.
data = sheet.get_all_records()
column = sheet.col_values(4)

def SetValue(platform, name, link):
    """Record a matched question in the sheet unless its link is already known.

    Args:
        platform: Matched platform label, or a falsy value (``False``/``None``)
            when nothing matched.
        name: Question title written to the third column.
        link: Question URL written to the fourth column and used for
            duplicate detection against the module-level ``column`` values.

    Returns:
        ``platform`` when a new row was inserted, otherwise ``False``.
    """
    if not platform:
        return False

    link_text = str(link)
    if link_text in column:
        return False

    # strftime keeps the full time even when microseconds happen to be 0,
    # where slicing str(datetime.now()) would cut into the time itself.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    sheet.insert_row([timestamp, platform, name, link], 2)
    # Keep the in-memory link list current so the same link seen again during
    # this run is not inserted a second time.
    column.append(link_text)
    return platform

此外,我还有 creds.json(里面是我的私钥……)

另外,我不想在我的电脑上运行python脚本(有时谷歌会阻止我的搜索)

如果有人能解决其中一个错误,或者向我推荐另一个好的平台(可以付费,但最好是免费的),我会非常高兴 (我尝试运行脚本大约1.5周:(


Tags: infromhttpsimportcomfalseappheroku