各位!! 现在我尝试每小时运行一次python解析器(web scraper)(最好每10分钟运行一次)。 我试过:
Windows scheduler (task scheduler)
pythonanywhere
heroku
Heroku: 我遇到了一个错误:
2020-11-23T09:05:02.707582+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 73, in __init__
2020-11-23T09:05:02.707740+00:00 app[web.1]: self.service.start()
2020-11-23T09:05:02.707741+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/common/service.py", line 83, in start
2020-11-23T09:05:02.707894+00:00 app[web.1]: os.path.basename(self.path), self.start_error_message)
2020-11-23T09:05:02.707952+00:00 app[web.1]: selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
2020-11-23T09:05:02.707953+00:00 app[web.1]:
2020-11-23T09:05:02.811239+00:00 heroku[web.1]: Process exited with status 1
2020-11-23T09:05:02.851583+00:00 heroku[web.1]: State changed from starting to crashed
2020-11-23T11:01:39.422624+00:00 heroku[router]: at=error code=H10 desc="App crashed" method=GET path="/" host=zoho-parser.herokuapp.com request_id=2135d635-fa47-47e5-900f-71d19afe1032 fwd="188.64.166.28" dyno= connect= service= status=503 bytes= protocol=https
2020-11-23T11:01:40.985460+00:00 heroku[router]: at=error code=H10 desc="App crashed" method=GET path="/favicon.ico" host=zoho-parser.herokuapp.com request_id=806a1d26-3541-4cd5-ab46-d73288e3e589 fwd="188.64.166.28" dyno= connect= service= status=503 bytes= protocol=https
(我成功部署应用程序)
我的构建包: heroku/python,https://github.com/heroku/heroku-buildpack-google-chrome,https://buildpack-registry.s3.amazonaws.com/buildpacks/heroku/google-chrome.tgz,https://github.com/heroku/heroku-buildpack-chromedriver
配置变量: CHROMEDRIVER_PATH=/app/.chromedriver/bin/chromedriver GOOGLE_CHROME_BIN=/app/.apt/usr/bin/google-chrome
PythonAnywhere: 我遇到了另一个错误:
Traceback (most recent call last):
File "/home/AndrewCreator/Parser.py", line 27, in <module>
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options = chromeOptions)
TypeError: __init__() got an unexpected keyword argument 'options'
在windows任务计划程序中,我成功创建了一个任务,脚本工作!:)
但只有当我的电脑开着并且我在使用它时才会运行 :(
我的平台: Windows 10
我的脚本:
Parser.py:
#Parser Zoho
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import yagmail
import os
from CheckLetter import CheckLetter
from SetValue import SetValue
# --- Zoho community parser ---------------------------------------------------
# Loads the Zoho CRM community page in headless Chrome, checks every listed
# question against the "Zoho" trigger keywords and e-mails each new match.
url = 'https://help.zoho.com/portal/en/community/zoho-crm'
# Local chromedriver used as a fallback when the CHROMEDRIVER_PATH environment
# variable is not set.
CHROMEDRIVER_PATH = r'C:\Users\admin\AppData\Roaming\Python\Python38\site-packages\ChromeDriver\chromedriver.exe'
# NOTE(review): the mail credentials are hard-coded in the script; consider
# loading them from the environment instead.
yag = yagmail.SMTP("arshostak@gmail.com", 'MyPassword')
to = "arshostak@gmail.com"
subject = "New question"
body = ["Someone asked about ", ". Here is the link:"]

# driver
chromeOptions = webdriver.ChromeOptions()
# Only override the browser binary when the environment provides one; the
# original assigned to a misspelled attribute, so the value was never applied.
chromeBinary = os.environ.get("GOOGLE_CHROME_BIN")
if chromeBinary:
    chromeOptions.binary_location = chromeBinary
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--disable-dev-shm-usage")
chromeOptions.add_argument("--no-sandbox")
driver = webdriver.Chrome(
    # Without the fallback an unset variable became the literal string "None".
    executable_path=os.environ.get('CHROMEDRIVER_PATH', CHROMEDRIVER_PATH),
    options=chromeOptions,
)

isFind = False
try:
    driver.get(url)
    # Wait (up to 10 s) until the topic list container is present in the DOM.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.TopicListLeftContainer__section")
        )
    )
    for q in driver.find_elements(By.CSS_SELECTOR, 'div.CommunityListItem__wrapper'):
        question = q.find_element(By.TAG_NAME, 'a')
        shortDiscription = q.find_element(By.CSS_SELECTOR, 'div.CommunityListItem__description')
        link = question.get_attribute("href")
        shortDiscriptionText = shortDiscription.text
        questionText = question.text
        # Title and description are joined and stripped of spaces before the
        # keyword check.
        letter = str(questionText) + ". " + str(shortDiscriptionText)
        letter = letter.replace(" ", "")
        FountProgramm = SetValue(CheckLetter(letter, "Zoho"), questionText, link)
        if FountProgramm is not False:
            yag.send(to, subject, body[0] + FountProgramm + body[1] + "\n" + str(link))
            print(link)
            isFind = True
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()
#Parser StackOverflow
import requests
from bs4 import BeautifulSoup
# --- Stack Overflow parser ---------------------------------------------------
# Fetches the newest-questions page, checks every question title against the
# "StackOverflow" trigger keywords and e-mails each new match.
url = "https://stackoverflow.com/questions"
# A timeout keeps a scheduled run from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
soup = BeautifulSoup(res.text, "html.parser")
questions = soup.select(".summary")
for que in questions:
    titleAnchor = que.select_one('.question-hyperlink')
    if titleAnchor is None:
        # Summary without a title link: nothing to match, skip it.
        continue
    QustionName = titleAnchor.getText()
    # NOTE(review): if the href starts with "/", this yields a "//" in the URL.
    # Left as-is so links stay comparable with those SetValue has already stored.
    QuestionLink = "https://stackoverflow.com/" + que.find('a').get('href')
    letter = QustionName.replace(" ", "").lower()
    FountProgramm = SetValue(CheckLetter(letter, "StackOverflow"), QustionName, QuestionLink)
    if FountProgramm is not False:
        # Send this question's own link (the original sent the stale `link`
        # variable left over from the Zoho loop).
        yag.send(to, subject, body[0] + FountProgramm + body[1] + "\n" + str(QuestionLink))
        print(QuestionLink)
        isFind = True

# Report when neither parser found a new matching question.
if not isFind:
    print("False")
CheckLetter.py:
def CheckLetter(Letter, MainPlatform):
    """Return the name of the first trigger entry matched by *Letter*.

    The trigger table comes from ``GetTriggers(MainPlatform)``. Each entry is
    a list holding a name (a non-list item) followed by keyword groups
    (lists). A group matches when every keyword in it occurs in *Letter*,
    compared case-insensitively.

    Args:
        Letter: Text to search.
        MainPlatform: Platform key passed to ``GetTriggers``.

    Returns:
        The matching entry's name, or ``False`` when nothing matches or
        ``GetTriggers`` has no table for *MainPlatform*.
    """
    triggers = GetTriggers(MainPlatform)
    if not triggers:
        # GetTriggers returns False for unknown platforms; iterating that
        # would raise TypeError.
        return False

    haystack = Letter.lower()
    for platform in triggers:
        for trigs in platform:
            if not isinstance(trigs, list):
                # Non-list item: the name that applies to the groups after it.
                name = str(trigs)
                continue
            if all(trig.lower() in haystack for trig in trigs):
                return name
    return False
def GetTriggers(Platform):
    """Build the trigger table for *Platform*.

    Each entry of the returned list has the form
    ``[name, [keyword, ...], [keyword, ...], ...]``. For ``"Zoho"`` every
    keyword group holds a single spelling; for ``"StackOverflow"`` each group
    additionally contains ``"Zoho"``.

    Returns:
        A freshly built list of entries, or ``False`` for any other platform.
    """
    if Platform == "Zoho":
        extra = []
    elif Platform == "StackOverflow":
        extra = ["Zoho"]
    else:
        return False

    # Name -> accepted spellings; shared by both platforms.
    spellings = (
        ("Twilio", ("twillio", "twilio", "tvilio", "tvillio")),
        ("Quickbooks", ("Quickbooks", "Quick-books", "QBO")),
        ("Xero", ("Xero",)),
    )
    return [
        [product] + [[word] + extra for word in words]
        for product, words in spellings
    ]
SetValue.py:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from pprint import pprint
from datetime import datetime
# Module-level setup: authorises a service account from creds.json and opens
# the first worksheet of the spreadsheet. This runs once, at import time.
spreadsheetName = "Questions"
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]
creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
client = gspread.authorize(creds)
spreadsheet = client.open(spreadsheetName)
sheet = spreadsheet.sheet1
# Snapshot of all records taken at import time.
data = sheet.get_all_records()
# Values of column 4, which SetValue compares incoming links against.
column = sheet.col_values(4)
def SetValue(platform, name, link):
    """Record a newly matched question in the sheet unless it is already there.

    Args:
        platform: Name of the matched platform, or a falsy value (``False``)
            when nothing matched.
        name: Question title written to the third column.
        link: Question URL written to the fourth column; its string form is
            the de-duplication key.

    Returns:
        *platform* when a new row was inserted; ``False`` when there was no
        match or the link is already present in ``column``.
    """
    if not platform:
        return False

    link_text = str(link)
    if link_text in column:
        return False

    # Explicit format: slicing str(datetime.now()) cuts into the seconds
    # whenever the microsecond part happens to be zero.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    sheet.insert_row([timestamp, platform, name, link], 2)
    # Mirror the insert (row 2 -> index 1) in the cached column so the same
    # link is not inserted twice within one run.
    column.insert(1, link_text)
    return platform
此外,我还有 creds.json(其中包含我的私钥(private key)…)
另外,我不想在我的电脑上运行python脚本(有时谷歌会阻止我的搜索)
如果有人能解决其中一个错误,或者向我推荐另一个好的平台(可以付费,但最好是免费的),我会非常高兴 (我尝试运行脚本大约1.5周:(
目前没有回答
相关问题 更多 >
编程相关推荐