Python value reshape issue

Posted 2024-04-26 07:41:33


I'm using this scraper:

https://github.com/PHMark/Web-Scraping

I modified it to fit my needs; the code is as follows:

from bs4 import BeautifulSoup as bs
from selenium import webdriver
import urllib.request, urllib.error, urllib.parse
import re
import ssl
import pandas as pd
import numpy as np
import os

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
#chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--no-sandbox')
prefs = {'download.default_directory' : os.getcwd()}
chrome_options.add_experimental_option('prefs', prefs)

class SoupMaker():
    """
    A class that scrapes indeed's Job ads
    """
    def __init__(self, _url, _driver):
        self.base_url = "https://www.indeed.com"
        self.home_url = self.base_url + _url
        self.job_links = []
        self.driver = _driver
        self.job_datas = []
        self.job_table = []

    def read_page(self):        
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        print("Parsing: ", self.home_url)
        self.url = urllib.request.urlopen(self.home_url,
                              context = self.ctx).read()
        _soup1 = bs(self.url, "html.parser")
        self.a_tags = _soup1('a')

    def get_job_url(self):
        for link in self.a_tags:
            link = link.get("href", None)
            if link != None:
                cmp_url = re.search("^/.+/.+/jobs/.+", link)
                rc_url = re.search("^/rc.+", link)
                if cmp_url or rc_url:
                    self.job_links.append(self.base_url + link.strip())

    def get_job_info(self):
        for link in self.job_links:
            print("    Scraping: ", link)
            self.driver.get(link)
            self.driver.implicitly_wait(2750)
            _soup2 = bs(self.driver.page_source, "lxml")
            self.title = _soup2.find("title").get_text()
            self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
            self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')

            self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
            self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]
            self.description = ''
            for d in self.job_descs:
                self.description += d.get_text("|", strip = True) 
            self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]    
            self.job_datas.append(self.job_title)
            self.job_datas.append(self.job_location)
            self.job_datas.append(self.description)
            self.job_datas.append(self.origin)

        self.x = np.array(self.job_datas).reshape((10,4))
        df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',
                                    'Job Description', 'Job Origin'])
        return df

if __name__ == '__main__':
    n = int(input("Enter no. of pages to scrape: "))
    n = n*10
    file_name = input("Enter CSV filename: ")
    #    driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")
    #driver = webdriver.Chrome('/usr/local/bin/chromedrive') 
    driver = webdriver.Chrome('/usr/local/bin/chromedriver',chrome_options=chrome_options)  
    chrome_options=chrome_options
    writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')
    df = []

    for i in range(10, n+10, 10):
        #ext = "/jobs?q=&l=United+States&start={0}".format(i-10)
        ext = "/jobs?l=United+States&start={0}".format(i-10)
        if n == 10:
            #ext = "/jobs-in-United+States"
            ext ="/l-United+States-jobs.html"
        s = SoupMaker(ext, driver)
        s.read_page()
        s.get_job_url()
        df.append(s.get_job_info())

    result = pd.concat(df)
    result.to_excel(writer, index=False)
    writer.save()
    driver.close()

The script works fine if I only scrape 1 page, but if I try to scrape more than 10 pages I get the following error:

Traceback (most recent call last):
  File "file.py", line 96, in <module>
    df.append(s.get_job_info())
  File "file.py", line 71, in get_job_info
    self.x = np.array(self.job_datas).reshape((10,4))
ValueError: cannot reshape array of size 0 into shape (10,4)

And if the page input is greater than 100 or 50, it gives this error:

Traceback (most recent call last):
  File "file.py", line 100, in <module>
    df.append(s.get_job_info())
  File "file.py", line 64, in get_job_info
    self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
IndexError: list index out of range

I would really appreciate it if anyone could help me! Thanks in advance!


1 Answer

#1 · Posted 2024-04-26 07:41:33

Looking at this more closely, I think the problem is that it isn't actually retrieving any data. If the get_job_url method doesn't parse any links, the loop in get_job_info never runs, job_datas stays empty, and the resulting array has size zero, which makes the reshape fail.
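One defensive option (a minimal sketch, not tested against the live site, reusing the pd/np imports already in your script) is to replace the last three lines of get_job_info so an empty job_datas returns an empty DataFrame, and numpy infers the row count instead of assuming exactly 10 ads per page:

        columns = ['Job Title', 'Job Location', 'Job Description', 'Job Origin']
        # No links parsed: return an empty frame rather than reshaping
        if not self.job_datas:
            return pd.DataFrame(columns=columns)
        # Four fields are appended per job, so let numpy infer the row
        # count with -1 instead of hard-coding (10, 4), which breaks
        # whenever a page yields fewer (or more) than 10 ads.
        self.x = np.array(self.job_datas).reshape((-1, 4))
        return pd.DataFrame(data=self.x, columns=columns)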

To understand what's happening, try inspecting the state with a debugger, or just add some prints, as sketched below. It may be that the URL for page 10 is bad and returns a 404 page that contains no job links at all.
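For example (a rough sketch; fetch_page is a hypothetical helper, not part of your script), catching urllib's HTTPError makes a bad page URL visible immediately, since urlopen raises on a 404 rather than returning an empty page:

import urllib.request, urllib.error

def fetch_page(url, ctx):
    # urlopen raises HTTPError on a 404, so a broken page URL is
    # reported here instead of silently yielding a soup with no links.
    try:
        return urllib.request.urlopen(url, context=ctx).read()
    except urllib.error.HTTPError as e:
        print("HTTP", e.code, "while fetching", url)
        return None

# In the main loop, print the state after each step, e.g.:
#     s.read_page()
#     s.get_job_url()
#     print(len(s.job_links), "job links parsed from", s.home_url)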
