我在使用python抓取标题URL时出错

2024-04-23 18:09:40 发布

您现在位置:Python中文网/ 问答频道 /正文

我写了一个代码来抓取标题URL,但我在提取标题URL时遇到了一个错误,请您指导我。 这是我的代码:

import requests
from bs4 import BeautifulSoup
# import pandas as pd
# import pandas as pd
import csv


def get_page(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    Returns:
        A ``BeautifulSoup`` document on success, or ``None`` when the
        server answers with a non-OK status code (the status is printed).

    Note: the original version only assigned ``soup`` in the success
    branch, so a failed request crashed with ``UnboundLocalError`` at the
    ``return`` — returning ``None`` explicitly fixes that.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        return None  # fix: previously fell through to an unbound `soup`
    # 1. html text to parse, 2. parser backend
    return BeautifulSoup(response.text, 'html.parser')


def get_index_data(soup):
    """Print every ``<a class="body_link_11">`` tag found in *soup*.

    Args:
        soup: a BeautifulSoup document, or ``None`` if the page fetch
            failed upstream in ``get_page``.
    """
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # soup is None when the download failed -- degrade to an empty
        # result instead of crashing (was a bare `except:` that hid
        # every other error too).
        titles_link = []
    print(titles_link)


def main():
    """Entry point: fetch page 1 of the collection search and print its title links."""
    search_url = (
        "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/"
        "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    )
    page_soup = get_page(search_url)
    get_index_data(page_soup)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

1条回答
网友
1楼 · 发布于 2024-04-23 18:09:40

如果要获取所有链接,请尝试以下操作:

def get_page(url):
    """Fetch *url* and return the parsed BeautifulSoup document.

    Returns:
        A ``BeautifulSoup`` object on success, or ``None`` when the
        response status is not OK (the status code is printed).

    Fix: ``soup`` was only assigned in the ``else`` branch, so a failed
    request raised ``UnboundLocalError`` on ``return soup``.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        return None  # fix: avoid returning an unbound local
    return BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser

def get_index_data(soup):
    """Print absolute URLs for every title link on a search-result page.

    Only ``<a class="body_link_11">`` tags that carry an ``item_id``
    attribute point at actual items, so other anchors are skipped.

    Args:
        soup: a BeautifulSoup document, or ``None`` if the fetch failed.
    """
    base = "http://cgsc.cdmhost.com"
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # soup is None when the page failed to download (was a bare
        # `except:` that masked all other errors).
        titles_link = []
    # The inner bare try/except around plain dict .get() calls could
    # never fire and hid real bugs; a filtered comprehension is enough.
    titles_link_output = [
        "{}{}".format(base, link.attrs.get('href', None))
        for link in titles_link
        if link.attrs.get('item_id')  # valid title links carry an item_id
    ]
    print(titles_link_output)

def main():
    """Entry point: fetch the first result page and print its title URLs."""
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))


# Guard the call so merely importing this module does not trigger a
# network request; the question's own script already uses this guard.
if __name__ == '__main__':
    main()

输出:

['http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3309/rec/3', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2425/rec/4', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/150/rec/5', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3672/rec/8', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3407/rec/9', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4393/rec/10', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3445/rec/11', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3668/rec/12', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3703/rec/13', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2952/rec/14', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2898/rec/15', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3502/rec/16', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3553/rec/17', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4052/rec/18', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3440/rec/19', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3583/rec/20']

相关问题 更多 >