从网站(website)中提取大量数据

2024-04-20 11:40:45 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图从一个网站(https://www.cs.cmu.edu/~mleone/gdead/setlists.html)上提取大量的数据。我正在进行一个数据可视化项目,以显示有关感恩死(Grateful Dead)音乐会的信息。我已经成功地提取和格式化了我想要的数据,但是这个过程非常缓慢。我使用urllib库打开并读取每个url(有成百上千的网址)。有没有更好的办法?

class Song:
    """A single song occurrence, tagged with the year it was played."""

    def __init__(self, name, year):
        # Song title and the (two-digit string) year it was performed.
        self.name = name
        self.year = year

    def printName(self):
        """Print the song title to stdout."""
        print(self.name)

    def getName(self):
        """Return the song title."""
        return self.name

class Year:
    """Tallies how many times each song was played in one year.

    Attributes:
        year: two-digit year label (e.g. 72 for 1972).
        dct: mapping of song name -> play count.
    """

    def __init__(self, year, dct):
        self.year = year
        # Bug fix: the passed-in mapping was silently discarded before
        # (self.dct was always reset to a fresh {}); now it is used as the
        # initial tally, which is backward compatible with callers that
        # pass an empty dict.
        self.dct = dct if dct is not None else {}

    def addSong(self, song):
        """Increment the play count for *song* (a song-name string)."""
        self.dct[song] = self.dct.get(song, 0) + 1

    def printDict(self):
        """Print the song -> count mapping."""
        # Bug fix: previously printed the bare name `dct`, which raised
        # NameError at call time; the instance attribute is what's wanted.
        print(self.dct)

    def printYear(self):
        """Print the year label."""
        print(self.year)

    def getYear(self):
        """Return the year label."""
        return self.year




import re
import urllib
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

from bs4 import BeautifulSoup

songlist = []  # NOTE(review): never referenced anywhere in this script — candidate for removal

def hasNumbers(inputString):
    """Return True if *inputString* contains at least one digit character."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

def remove_values_from_list(the_list, val):
    """Return a new list: *the_list* with every occurrence of *val* removed."""
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item)
    return kept

# --- Build per-year song-count tallies by scraping each concert page. ---

# Read the index-page URLs (one per line); `with` guarantees the file
# handle is closed (the original opened it and never closed it).
with open("concert_list.txt") as list_open:
    line_in_list = list_open.read().split("\n")

# Map two-digit year (72..95) -> Year tally object.
yeardict = {}
for i in range(0, 24):
    yeardict[i + 72] = Year(i + 72, {})


def _collect_setlist_urls(index_url):
    """Return the full setlist URLs linked from one index page."""
    soup = BeautifulSoup(urlopen(index_url).read(), 'html.parser')
    return ['https://www.cs.cmu.edu/~mleone/gdead/' + link.get('href')
            for link in soup.find_all('a')]


def _fetch_setlist_lines(setlist_url):
    """Fetch one setlist page and return all its <p> text split into lines."""
    # Use the same explicit 'html.parser' everywhere (the original mixed
    # 'html' and 'html.parser').
    soup = BeautifulSoup(urlopen(setlist_url).read(), 'html.parser')
    lines = []
    # Accumulate every paragraph: the original overwrote `x` on each <p>,
    # silently dropping all but the last paragraph of the page.
    for p in soup.find_all('p'):
        lines.extend(p.getText().split('\n'))
    return lines


# Collect every setlist URL up front.  The original nested the "parse each
# full url" loop inside the link-collection loop, so the whole accumulated
# URL list was re-downloaded once per <a> tag — a quadratic number of
# network fetches, which is the real cause of the reported slowness.
all_urls = []
for url in line_in_list:
    if url:  # skip blank lines (e.g. from a trailing newline in the file)
        all_urls.extend(_collect_setlist_urls(url))

# The work is network-bound, so fetch pages concurrently: a thread pool
# overlaps the waiting and gives a large wall-clock speedup with no
# third-party dependencies (concurrent.futures is stdlib).
with ThreadPoolExecutor(max_workers=16) as pool:
    pages = pool.map(_fetch_setlist_lines, all_urls)

year = None  # guard: original crashed with NameError if a page's first
             # non-blank line contained no digits (no date seen yet)
for lines in pages:
    for song in remove_values_from_list(lines, ''):
        if hasNumbers(song):
            # Date line such as "6/10/73": take the two-digit year that
            # follows the last slash.
            year = song[song.rfind("/") + 1:song.rfind("/") + 3]
        elif year is not None:
            cursong = Song(song, year)
            yeardict[int(year)].addSong(cursong.getName())


print(yeardict[72].dct["Truckin'"])  # play count of one song in 1972

Tags: the, name, in, from, self, url, for, read