如何通过 BeautifulSoup 搜索特定的 CSS 类？

# Scrape post titles from the r/WritingPrompts front page and a user-chosen
# number of "next" pages, printing each title to stdout.
import os
import re

import httplib2
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

# Dots are escaped so that e.g. ".in" only matches a literal ".in" and not
# "any character followed by 'in'" (which would accept almost every string).
_URL_HINT_RE = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')
_HTTP_SCHEME_RE = re.compile(r'http://|https://')
_PROMPT_COMMENTS_RE = re.compile(r'/r/WritingPrompts/comments/')
_MODPOST_RE = re.compile(r'modpost')

START_URL = 'https://www.reddit.com/r/WritingPrompts/'


def make_soup(s):
    """Fetch *s* over HTTP and return a soup containing only its <a> tags.

    Args:
        s: The URL to download.

    Returns:
        A ``BeautifulSoup`` object restricted to anchor tags, or ``None``
        when *s* does not look like a URL.
    """
    if not _URL_HINT_RE.search(s):
        return None
    http = httplib2.Http()
    # httplib2 returns (response headers, body); only the body is parsed.
    _headers, content = http.request(s)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    return BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a'))


def is_a_valid_link(href):
    """Return True if *href* is an http(s) link to a non-mod prompt thread.

    Args:
        href: The link target to test.

    Returns:
        ``True`` when *href* contains an http/https scheme and the
        ``/r/WritingPrompts/comments/`` path and does not contain
        ``modpost``; ``False`` otherwise.
    """
    return bool(
        _HTTP_SCHEME_RE.search(href)
        and _PROMPT_COMMENTS_RE.search(href)
        and not _MODPOST_RE.search(href)
    )


def parse(s):
    """Print the text of every title link found on the page at *s*.

    Args:
        s: URL of the listing page to scan.
    """
    soup = make_soup(s)
    if soup is None:
        return
    # A CSS selector matches tags carrying all three classes regardless of
    # their order in the attribute, unlike an exact class-string comparison.
    for tag in soup.select('a.title.may-blank.loggedin'):
        print(tag.string)


def count_next_of_current(s):
    """Return the href of the first ``rel="next"`` link on the page at *s*.

    Args:
        s: URL of the current listing page.

    Returns:
        The href of the next-page link, or ``None`` when the page could not
        be loaded or has no such link.
    """
    soup = make_soup(s)
    if soup is None:
        return None
    # href=True skips anchors without an href instead of raising KeyError.
    link = soup.find('a', attrs={'rel': ['next']}, href=True)
    return link['href'] if link is not None else None


def read_reddit_images():
    """Prompt for a page count, then print titles from that many pages.

    The front page is always parsed; afterwards up to the requested number
    of "next" pages are followed, stopping early when no next link exists.
    """
    global f
    # NOTE(review): nothing in the visible code writes to this file; it is
    # still created/truncated because that is an existing side effect.
    with open('spaceporn.txt', 'w') as f:
        i = int(input('Enter the number of NEXT pages from the front WritingPrompts page that you want to scrape\n'))
        s = START_URL
        parse(s)
        for _ in range(i):
            s = count_next_of_current(s)
            if s is None:
                break
            parse(s)


if __name__ == '__main__':
    read_reddit_images()

2条回答

网友

1楼 · 编辑于 2024-04-24 05:35:42

使用 find_all() 按类搜索的语法是：

# Filter tags by CSS class; the keyword is spelled `class_` because `class`
# is a reserved word in Python.
soup.find_all(class_="className")

注意 class 后面的下划线。如果省略它，Python 会抛出语法错误（SyntaxError），因为 class 是保留关键字，Python 会认为您正在尝试定义一个新类。

网友

2楼 · 编辑于 2024-04-24 05:35:42

在 BS4 中，class 属性里的每个类名都是作为单独的值存储的。通过 select() 方法使用 CSS 选择器，可以更容易地同时匹配多个 CSS 类。例如，可以使用以下 CSS 选择器来匹配 <a class="title may-blank loggedin">：

# Iterate over <a> tags that carry all three CSS classes: title, may-blank
# and loggedin. The dotted lines below are placeholders for the loop body.
for tag in soup.select('a.title.may-blank.loggedin'):
    .....
    .....

相关问题更多 >

编程相关推荐

热门问题

热门文章