在进行网页抓取（web scraping）时，如何从 HTML 标签得到的 <class 'generator'> 对象中提取文本？

import requests from bs4 import BeautifulSoup import operator from collections import Counter def start(url): wordlist=[] source_code=requests.get(url).text soup=BeautifulSoup(source_code,'html.parser') for each_text in soup.findAll('div',{'class':'entry-content'}): content=each_text.strings words=content.lower().split() for each_word in words: wordlist.append(each_word) clean_wordlist(wordlist) def clean_wordlist(wordlist): clean_list=[] for word in wordlist: symbols='!@#$%^&*()_-+={[}]|\;:"<>?/.,' for i in range (0,len(symbols)): word=word.replace(symbols[i],'') if len(word)>0: clean_list.append(word) create_dictionary(clean_list) def create_dictionary(clean_list): word_count={} for word in clean_list: if word in word_count: word_count[word]+=1 else: word_count[word]=1 for key,value in sorted(word_count.items(),key=operator.itemgetter(1)): print ("%s : %s " % (key,value)) c=Counter(word_count) top=c.most_common(3) print(top) start("https://www.geeksforgeeks.org/programming-language-choose/")</code>

1条回答

网友

1楼 · 发布于 2024-04-18 00:59:06

与其创建生成器对象，不如直接使用 .text；如果确实想使用 .strings，可以对它进行解包（例如 print(*strings_object)）。

如您所知，解包对象时需要在对象前加星号（*）。这里不再详细说明，您可以在这里（HERE）找到更多相关信息。

import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter


def start(url):
    """Download *url* and run the word-frequency pipeline on its article text.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. The text of every ``<div class="entry-content">`` is
    lower-cased, split on whitespace and gathered into a single list, which
    is then passed to ``clean_wordlist`` exactly once.

    Args:
        url: Address of the page to download.

    Returns:
        None. The result is reported by the functions further down the
        pipeline.
    """
    wordlist = []
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    for each_text in soup.find_all('div', {'class': 'entry-content'}):
        # ``.text`` yields one plain string; ``.strings`` is a generator and
        # has no ``.lower()``, which is what the question tripped over.
        wordlist.extend(each_text.text.lower().split())
    # Hand the words over once, after every matching div has been read.
    # Calling this inside the loop re-processed the cumulative list (and
    # repeated the whole report) once per div. A page with no matching div
    # now yields an empty report instead of no output at all.
    clean_wordlist(wordlist)


def clean_wordlist(wordlist):
    """Strip punctuation from each word and pass the remaining words on.

    Every character listed in ``symbols`` is removed from each word. Words
    that end up empty are dropped. The cleaned list is handed to
    ``create_dictionary``.

    Args:
        wordlist: Iterable of strings to clean.

    Returns:
        None.
    """
    # Raw string so the backslash is a literal character. The non-raw form
    # '\;' is an invalid escape sequence and triggers a warning on recent
    # Python versions. The set of stripped characters is unchanged.
    symbols = r'!@#$%^&*()_-+={[}]|\;:"<>?/.,'
    # One translation table, built once, deletes all symbols in a single pass
    # per word instead of one ``replace`` call per symbol.
    strip_table = str.maketrans('', '', symbols)
    stripped_words = (word.translate(strip_table) for word in wordlist)
    clean_list = [word for word in stripped_words if word]
    create_dictionary(clean_list)


def create_dictionary(clean_list):
    """Print how often each word occurs, followed by the three most common.

    One ``word : count `` line is printed per distinct word, ordered by
    ascending count; words with equal counts keep their first-seen order.
    A final line prints the list of the three most frequent
    ``(word, count)`` pairs.

    Args:
        clean_list: Iterable of hashable words to count.

    Returns:
        None. All output goes to stdout.
    """
    tally = Counter(clean_list)
    # ``sorted`` is stable, so ties stay in first-occurrence order.
    by_rising_count = sorted(tally.items(), key=operator.itemgetter(1))
    for pair in by_rising_count:
        print("%s : %s " % pair)
    print(tally.most_common(3))

# Script entry point: download the sample article and print its word counts.
start("https://www.geeksforgeeks.org/programming-language-choose/")

相关问题更多 >

编程相关推荐

热门问题

热门文章