BeautifulSoup prints the data to the screen, but it doesn't show up in the CSV

Published 2024-06-16 08:27:23


My code looks like this:

import requests #grabs HTML from remote pages (unused here; urlopen below does the fetching)
from bs4 import BeautifulSoup #core scraping module
import csv #writes results to a file
import time #adds a delay to avoid overloading the host server
from urllib.request import urlopen

# create file to hold scraped data.
artist_csv_file = open('artist_data.csv', 'w')
csv_writer = csv.writer(artist_csv_file)
# nominate header columns for the CSV
csv_writer.writerow(['date_text', 'artist', 'track', 'url'])

# read in a list of URLs to scrape (testing with 3 of 500+ pages)
contents = []
with open('Rage-links.csv','r') as csvf: # Open file in read mode
    urls = csv.reader(csvf)
    for url in urls:
        page = urlopen(url[0]).read()
        rage = BeautifulSoup(page, 'html.parser')

        # identify page section with artist and track names
        playlist = rage.find('div', class_='comp-rich-text article-text clearfix')   

        # open a CSV file to collect results and set the file's column headers
        artist_csv_file = open('artist_data.csv', 'w')
        csv_writer = csv.writer(artist_csv_file)
        csv_writer.writerow(['date_text', 'artist', 'track', 'url'])

        # capture date from the page
        rage_date = rage.find('div', class_='view-comp-publish-date')
        date_text = rage_date.time.text

        # define the artist and define song title as its sibling
        artist = playlist.find('strong')
        # track = playlist.strong.next_sibling

        # loop through the HTML, write date, artist and track data.
        for artist in playlist.find_all('strong'):
            print(date_text)
            print(artist)
            print(artist.next_sibling)
            print(url)
            time.sleep(.1) # delay scrape for .1 seconds
            csv_writer.writerow([date_text, artist, artist.next_sibling, url])

# close the CSV file
artist_csv_file.close()

All of the data prints to the screen, but only some of it reaches my CSV file. I'm a newbie, so any help is appreciated.

I wondered whether I was trying to capture too much data at once, as in this post (Pandas prints to screen corrently but saves only some data to csv), but the explanation there is beyond my skill level.


1 Answer

#1 · Posted on 2024-06-16 08:27:23
artist_csv_file = open('artist_data.csv', 'w')

Because of the 'w', this line overwrites your file on every pass through the loop. Try 'a' for append instead; then every iteration appends its results to the end of the file.
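As a quick illustration of the difference (demo.txt is just a hypothetical throwaway file):

with open('demo.txt', 'w') as f:  # 'w' truncates: anything already in the file is lost
    f.write('first run\n')

with open('demo.txt', 'w') as f:  # opening with 'w' again wipes the file
    f.write('second run\n')       # demo.txt now holds only "second run"

with open('demo.txt', 'a') as f:  # 'a' keeps existing content and appends
    f.write('third run\n')        # demo.txt now holds "second run" then "third run"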

You should probably also initialize the file before the loop to add the column headers; otherwise a new header row may be written on every iteration.

[...]
urls = csv.reader(csvf)

#create/clean artist_data.csv and insert column headers 
with open('artist_data.csv', 'w+') as artist_csv_file:
    csv_writer = csv.writer(artist_csv_file)
    csv_writer.writerow(['date_text', 'artist', 'track', 'url'])

# this opens the file once to write the column headers; making this first
# opening 'w+' ensures any previous content gets cleared.
# if the file is always empty when you run the program, you could do it all
# in one context manager with 'a+'.


#now open the csv in append mode and do the scraping
with open('artist_data.csv', 'a') as artist_csv_file:

    csv_writer = csv.writer(artist_csv_file)

    for url in urls:

        [...]
        # these lines should be removed from the loop body
        artist_csv_file = open('artist_data.csv', 'w')
        csv_writer = csv.writer(artist_csv_file)
        csv_writer.writerow(['date_text', 'artist', 'track', 'url'])
        [...]


# no close() needed at the end; the 'with' block closes the file automatically.

Hope this helps. Have fun!
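For reference, here is a minimal corrected sketch of the full script with both fixes applied. The file names (Rage-links.csv, artist_data.csv) and the CSS class names are taken from the question and assumed to still match the target pages; the unused requests import is dropped, newline='' is added because the csv module documentation recommends it to avoid blank rows on Windows, and artist.text is written so the cell holds the name rather than the raw <strong> tag:

import csv
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup

# write the header row once, before the loop; 'w' clears any previous contents
with open('artist_data.csv', 'w', newline='') as artist_csv_file:
    csv_writer = csv.writer(artist_csv_file)
    csv_writer.writerow(['date_text', 'artist', 'track', 'url'])

# re-open in append mode and scrape each page into the file
with open('Rage-links.csv', 'r') as csvf, \
        open('artist_data.csv', 'a', newline='') as artist_csv_file:
    csv_writer = csv.writer(artist_csv_file)
    for url in csv.reader(csvf):
        page = urlopen(url[0]).read()
        rage = BeautifulSoup(page, 'html.parser')

        # page sections, using the class names from the question
        playlist = rage.find('div', class_='comp-rich-text article-text clearfix')
        rage_date = rage.find('div', class_='view-comp-publish-date')
        date_text = rage_date.time.text

        # one row per <strong> tag: the artist, with the track as its next sibling
        for artist in playlist.find_all('strong'):
            csv_writer.writerow([date_text, artist.text, artist.next_sibling, url[0]])

        time.sleep(.1)  # pause between pages, not rows: the delay only matters between requests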
