我试图从一个网站（https://www.cs.cmu.edu/~mleone/gdead/setlists.html）上提取大量的数据。我正在进行一个数据可视化项目，以展示有关 Grateful Dead（感恩而死）乐队音乐会的信息。我已经成功地提取并格式化了我想要的数据，但是这个过程非常缓慢：我使用 urllib 库逐个打开并读取每个 URL（有成百上千个网址）。有没有更好的办法？
class Song:
    """A single performed song: its title and the two-digit year string it was played."""

    def __init__(self, name, year):
        # name: song title as scraped from the setlist page
        # year: two-digit year string (e.g. "72") extracted from the concert date
        self.name = name
        self.year = year

    def printName(self):
        """Print the song title to stdout."""
        print(self.getName())

    def getName(self):
        """Return the song title."""
        return self.name
class Year:
    """Counts how many times each song was played in a given (two-digit) year.

    Attributes:
        year: the year this bucket represents (int or str as supplied by caller).
        dct:  mapping of song name -> play count.
    """

    def __init__(self, year, dct):
        self.year = year
        # Bug fix: the original ignored the `dct` argument and always started
        # empty. Honor a non-empty initial mapping (copied so the caller's
        # dict is not shared); an empty/None dct behaves exactly as before.
        self.dct = dict(dct) if dct else {}

    def addSong(self, song):
        """Increment the play count for `song` (a song-name string)."""
        if song in self.dct:
            self.dct[song] += 1
        else:
            self.dct[song] = 1

    def printDict(self):
        """Print the song -> count mapping."""
        # Bug fix: original printed bare `dct`, which is a NameError here.
        print(self.dct)

    def printYear(self):
        """Print the year to stdout."""
        print(self.year)

    def getYear(self):
        """Return the year."""
        return self.year
import re
import urllib
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

from bs4 import BeautifulSoup
songlist = []  # NOTE(review): never read or written anywhere visible in this file — verify no other module uses it before removing
def hasNumbers(inputString):
    """Return True if inputString contains at least one decimal digit, else False."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def remove_values_from_list(the_list, val):
    """Return a new list with every element equal to `val` filtered out."""
    return list(filter(lambda item: item != val, the_list))
# Read the concert-index URLs, one per line.
# Fix: the original opened the file and never closed it; `with` guarantees closure.
with open("concert_list.txt") as list_open:
    line_in_list = list_open.read().split("\n")

# Map two-digit year -> Year counter, covering 1972-1995 (keys 72..95).
yeardict = {}
for yr in range(72, 96):
    yeardict[yr] = Year(yr, {})
# --- Scrape every setlist page and tally songs per year. ---
# The original fetched each of the hundreds of URLs serially with urlopen(),
# which is why it was slow: almost all the time is spent waiting on network
# I/O. Fetching with a thread pool overlaps those waits; parsing and the
# resulting yeardict contents are unchanged.

def _fetch_html(url):
    """Download `url` and return its raw HTML bytes."""
    return urlopen(url).read()

def _setlist_urls(index_html):
    """Return the absolute setlist URLs linked from one index page."""
    soup = BeautifulSoup(index_html, 'html.parser')
    return ['https://www.cs.cmu.edu/~mleone/gdead/' + link.get('href')
            for link in soup.find_all('a')]

def _tally_setlist(setlist_html, year_counts):
    """Parse one setlist page and add its songs into `year_counts` (yeardict)."""
    # Consistency fix: the original passed 'html' here but 'html.parser' for
    # the index pages; use the stdlib parser everywhere.
    soup = BeautifulSoup(setlist_html, 'html.parser')
    lines = []
    for p in soup.find_all('p'):
        # Bug fix: the original did `x = p.getText().split('\n')` inside this
        # loop, overwriting x so only the LAST <p> was ever processed;
        # accumulate every paragraph instead.
        lines.extend(p.getText().split('\n'))
    lines = remove_values_from_list(lines, '')
    year = None
    for song in lines:
        if hasNumbers(song):
            # A dated header line like "Winterland 7/8/72": take the two
            # digits after the last '/' as the two-digit year.
            year = song[song.rfind("/") + 1:song.rfind("/") + 3]
        elif year is not None:
            # Robustness fix: the original raised NameError if a song line
            # appeared before any dated line; skip such lines instead.
            year_counts[int(year)].addSong(Song(song, year).getName())

# Robustness fix: drop blank lines (e.g. from a trailing newline in
# concert_list.txt) that would make urlopen fail.
index_urls = remove_values_from_list(line_in_list, '')

with ThreadPoolExecutor(max_workers=16) as pool:
    # Fetch all index pages concurrently, then collect every setlist link.
    setlist_urls = []
    for page in pool.map(_fetch_html, index_urls):
        setlist_urls.extend(_setlist_urls(page))
    # Fetch all setlist pages concurrently; tally serially (yeardict is a
    # plain dict, so mutation stays on the main thread).
    for html in pool.map(_fetch_html, setlist_urls):
        _tally_setlist(html, yeardict)

print(yeardict[72].dct["Truckin'"])  # play count for one song in 1972
目前没有回答
相关问题 更多 >
编程相关推荐