No schema supplied and other errors when using requests.get()

Posted 2024-04-28 00:05:51


I'm learning Python by working through Automate the Boring Stuff. This program is supposed to go to http://xkcd.com/ and download all of the comic images for offline viewing.

I'm on Python 2.7 and on a Mac.

For some reason I'm getting errors like "No schema supplied" and other errors around the requests.get() call itself.

Here's my code:

# Saves the XKCD comic page for offline read

import requests, os, bs4, shutil

url = 'http://xkcd.com/'

if os.path.isdir('xkcd') == True: # If xkcd folder already exists
    shutil.rmtree('xkcd') # delete it
else: # otherwise
    os.makedirs('xkcd') # Creates the xkcd folder.


while not url.endswith('#'): # When there are no more posts, the url will end with '#'; exit the while loop
    # Download the page
    print 'Downloading %s page...' % url
    res = requests.get(url) # Get the page
    res.raise_for_status() # Check for errors

    soup = bs4.BeautifulSoup(res.text) # Parse the page
    # Find the URL of the comic image
    comicElem = soup.select('#comic img') # Any #comic img it finds will be saved as a list in comicElem
    if comicElem == []: # if the list is empty
        print 'Couldn\'t find the image!'
    else:
        comicUrl = comicElem[0].get('src') # Get the first index in comicElem (the image) and save to
        # comicUrl

        # Download the image
        print 'Downloading the %s image...' % (comicUrl)
        res = requests.get(comicUrl) # Get the image. Getting something will always use requests.get()
        res.raise_for_status() # Check for errors

        # Save image to ./xkcd
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(10000):
            imageFile.write(chunk)
        imageFile.close()
    # Get the Prev btn's URL
    prevLink = soup.select('a[rel="prev"]')[0]
    # The Previous button is first <a rel="prev" href="/1535/" accesskey="p">&lt; Prev</a>
    url = 'http://xkcd.com/' + prevLink.get('href')
    # adds /1535/ to http://xkcd.com/

print 'Done!'

Here's the error:

Traceback (most recent call last):
  File "/Users/XKCD.py", line 30, in <module>
    res = requests.get(comicUrl) # Get the image. Getting something will always use requests.get()
  File "/Library/Python/2.7/site-packages/requests/api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "/Library/Python/2.7/site-packages/requests/api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "/Library/Python/2.7/site-packages/requests/sessions.py", line 451, in request
    prep = self.prepare_request(req)
  File "/Library/Python/2.7/site-packages/requests/sessions.py", line 382, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "/Library/Python/2.7/site-packages/requests/models.py", line 304, in prepare
    self.prepare_url(url, params)
  File "/Library/Python/2.7/site-packages/requests/models.py", line 362, in prepare_url
    to_native_string(url, 'utf8')))
requests.exceptions.MissingSchema: Invalid URL '//imgs.xkcd.com/comics/the_martian.png': No schema supplied. Perhaps you meant http:////imgs.xkcd.com/comics/the_martian.png?

The thing is, I've read the section of the book about this program several times, read the requests documentation, and read other questions on here, and my syntax looks fine to me.

Thanks for any help!

Edit:

This didn't work:

comicUrl = ("http:"+comicElem[0].get('src')) 

I thought prepending http: would get rid of the "no schema supplied" error.


3 answers

Change comicUrl to

comicUrl = comicElem[0].get('src').strip("http://")
comicUrl="http://"+comicUrl
if 'xkcd' not in comicUrl:
    comicUrl=comicUrl[:7]+'xkcd.com/'+comicUrl[7:]

print "comic url",comicUrl

"No schema" means you haven't supplied http:// or https://. Supply one of those and it will do the trick.

Edit: take a look at the URL string! It is the following:

URL '//imgs.xkcd.com/comics/the_martian.png':
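
As a side note (this is my own sketch, not part of the answer): since the src attribute is protocol-relative, another option is to resolve it against the page URL with urljoin instead of editing the string by hand. Assuming Python 2.7 as in the question:

# Sketch: resolve a protocol-relative src against the page URL.
# (Python 2.7; on Python 3 this import becomes urllib.parse.urljoin.)
from urlparse import urljoin

page_url = 'http://xkcd.com/'
src = '//imgs.xkcd.com/comics/the_martian.png'    # the value shown in the traceback

comicUrl = urljoin(page_url, src)
print comicUrl                                    # http://imgs.xkcd.com/comics/the_martian.png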

Explanation:

Some XKCD pages have special content that isn't a plain image file. That's fine, you can just skip those pages. If the selector doesn't find any elements, soup.select('#comic img') will return an empty list (a pared-down sketch of just this check follows the working code below).

Working code:

import requests,os,bs4,shutil

url='http://xkcd.com'

#making a fresh xkcd folder
if os.path.isdir('xkcd'):
    shutil.rmtree('xkcd')               #remove any previous copy first
os.makedirs('xkcd')                     #then recreate it so the images have somewhere to go


#scraping information
while not url.endswith('#'):
    print('Downloading Page %s.....' %(url))
    res = requests.get(url)          #getting page
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text)

    comicElem = soup.select('#comic img')     #getting the img tag under the comic division
    if comicElem == []:                        #if not found print error
        print('could not find comic image')

    else:
        try:
            comicUrl = 'http:' + comicElem[0].get('src')             #getting comic url and then downloading its image
            print('Downloading image %s.....' %(comicUrl))
            res = requests.get(comicUrl)
            res.raise_for_status()

        except requests.exceptions.MissingSchema:
            #skip if not a normal image file
            prev = soup.select('a[rel="prev"]')[0]
            url = 'http://xkcd.com' + prev.get('href')
            continue

        imageFile = open(os.path.join('xkcd',os.path.basename(comicUrl)),'wb')     #write  downloaded image to hard disk
        for chunk in res.iter_content(10000):
            imageFile.write(chunk)
        imageFile.close()

        #get previous link and update url
        prev = soup.select('a[rel="prev"]')[0]
        url = "http://xkcd.com" + prev.get('href')


print('Done...')
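
To isolate just the skip check the explanation describes (this fragment is my own condensed sketch, not part of the answer above), the pattern looks like this:

import requests, bs4

res = requests.get('http://xkcd.com/')            # any xkcd page will do
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text)

comicElem = soup.select('#comic img')             # empty list on "special content" pages
if not comicElem:
    print('could not find comic image')           # nothing to download here, just skip
else:
    # src is protocol-relative (e.g. //imgs.xkcd.com/...), so a scheme must be added
    comicUrl = 'http:' + comicElem[0].get('src')
    print('comic image url: %s' % comicUrl)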
