如何使用Python、Selenium和BeautifulSoup将html保存到文本文件

from pathlib import Path

from selenium import webdriver
from bs4 import BeautifulSoup


def test_html_save():
    """Load a YouTube playlist page in Firefox and dump its prettified HTML.

    The output file ``html_save_test.txt`` is created in the directory that
    contains this script. The file is opened without an explicit ``encoding``,
    so the platform's default text encoding is used.
    """
    playlist_url = 'https://www.youtube.com/watch?v=IdneKLhsWOQ&list=PLMEZyDHJojxNYSVgRCPt589DI5H7WT1ZK'

    # Grab the page source from a live Firefox session, then close the window.
    driver = webdriver.Firefox()
    driver.get(playlist_url)
    page_html = driver.page_source
    driver.close()

    # Parse the markup with the built-in html.parser backend.
    soup = BeautifulSoup(page_html, 'html.parser')

    html_save_path = Path(__file__).parent / ".//html_save_test.txt"
    with open(html_save_path, 'wt') as out_file:
        # Iterating a str yields single characters, so this writes the
        # prettified document one character at a time.
        for char in soup.prettify():
            out_file.write(char)


test_html_save()

1条回答

网友

1楼 · 发布于 2024-04-25 07:14:11

将encoding参数设置为utf-8：

# Pass the encoding explicitly so the page's non-ASCII characters are written
# as UTF-8 instead of relying on the platform's default text encoding.
with open(html_save_path, 'wt', encoding='utf-8') as html_file:
    # prettify() returns the whole document as one str; write it in a single
    # call rather than looping over it character by character.
    html_file.write(soup.prettify())

您的目的是从视频页面中抓取视频标题和频道名称。以下是执行此操作的完整代码：

from pathlib import Path
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def test_html_save():
    """Save a YouTube page's HTML to disk and print its title and channel.

    Opens the playlist URL in Chrome, pauses for 4 seconds, and reads the page
    source. The prettified markup is written to ``html_save_path`` as UTF-8,
    then the video title and channel name found in the markup are printed.
    If either element is missing from the page, ``<not found>`` is printed in
    its place instead of raising ``AttributeError``.
    """
    playlist_url = 'https://www.youtube.com/watch?v=IdneKLhsWOQ&list=PLMEZyDHJojxNYSVgRCPt589DI5H7WT1ZK'
    browser = webdriver.Chrome()
    try:
        browser.get(playlist_url)
        time.sleep(4)  # Fixed 4-second pause to let the page load
        html_content = browser.page_source  # Getting the html from the webpage
    finally:
        # Always end the WebDriver session, even if loading the page failed,
        # so no browser/driver process is left behind.
        browser.quit()
    soup = BeautifulSoup(html_content, 'html.parser')  # creates a beautiful soup object 'soup'.

    html_save_path = "D:\\bs4_html.txt"

    with open(html_save_path, 'wt', encoding='utf-8') as html_file:
        # prettify() returns a single str; write it in one call instead of
        # one character at a time.
        html_file.write(soup.prettify())

    title_tag = soup.find('yt-formatted-string', class_='style-scope ytd-video-primary-info-renderer')
    channel_tag = soup.find('a', class_='yt-simple-endpoint style-scope yt-formatted-string')

    # find() returns None when no element matches (e.g. the page markup
    # changed or had not finished rendering), so guard before reading .text.
    title = title_tag.text if title_tag is not None else "<not found>"
    channel_name = channel_tag.text if channel_tag is not None else "<not found>"

    print(f"Video Title: {title}")
    print(f"Channel Name: {channel_name}")

# Module-level call: runs the scrape whenever this file is executed or imported.
test_html_save()

输出：

Video Title: Taylor Swift - Wildest Dreams
Channel Name: Taylor Swift

相关问题更多 >

编程相关推荐

热门问题

热门文章