从网站抓取数据

# Extracting links
def linkextract(soup):
    """Open the first link of every matching <div> and hand each page to ``extract``.

    ``soup`` is a parsed page. For each ``<div>`` returned by
    ``soup.findAll('div', attrs={'class': ''})`` the first ``<a>`` inside it is
    looked up, its ``href`` is opened through ``mech``, and the downloaded page
    is parsed and passed to ``extract``. Divs that contain no ``<a>`` are
    skipped. Returns ``None``.
    """
    print("\n extracting links of next pages")
    print("\n\n page 2 \n")
    anchors = [div.find('a') for div in soup.findAll('div', attrs={'class': ''})]
    for anchor in anchors:
        if anchor is None:
            # find() yields None for a div without a link; subscripting None
            # would raise TypeError and abort the whole crawl.
            continue
        # NOTE(review): the prefix literal is empty in this listing --
        # presumably a base URL was stripped; confirm before running.
        suburl = "" + anchor['href']  # checking pages
        print(suburl)
        pages = mech.open(suburl)
        content = pages.read()
        anosoup = BeautifulSoup(content)
        extract(anosoup)


# NOTE(review): the original paste lost its line breaks and indentation; the
# statements below are assumed to be module-level code that follows
# linkextract() -- confirm against the original script.
# NOTE(review): app_url is an empty string in this listing -- presumably the
# real address was stripped; fill it in before running.
app_url = ""
print(app_url)
# print soup.prettify()
page1 = mech.open(app_url)
html1 = page1.read()
soup1 = BeautifulSoup(html1)
print("\n\n application page details \n")
extractinside(soup1)

1条回答

网友

1楼 · 发布于 2024-06-16 09:33:55

您应该从以下内容开始：

import urllib2
from bs4 import BeautifulSoup

# Listing page whose table rows link to the individual pages scraped below.
URL = 'http://www.pcwelt.de/download-neuzugaenge.html'

# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(urllib2.urlopen(URL), 'html.parser')

# Collect the href of the link in the first cell of every row of the table
# inside the "boxed" div. Rows without a <td>, or whose first cell holds no
# <a>, are skipped instead of raising on the subscript.
links = [
    tr.td.a['href']
    for tr in soup.find('div', {'class': 'boxed'}).table.find_all('tr')
    if tr.td is not None and tr.td.a is not None
]

for link in links:
    # The collected hrefs are appended to the site root to form the full URL.
    url = "http://www.pcwelt.de{0}".format(link)
    soup = BeautifulSoup(urllib2.urlopen(url), 'html.parser')

    name_tag = soup.find('span', {'itemprop': 'name'})
    version_tag = soup.find('td', {'itemprop': 'softwareVersion'})
    if name_tag is None or version_tag is None:
        # find() returns None when the element is absent; skip such pages
        # rather than failing with AttributeError on ``.text``.
        continue

    name = name_tag.text
    version = version_tag.text
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Name: %s; Version: %s" % (name, version))

输出：

Name: Ashampoo Clip Finder HD Free; Version: 2.3.6
Name: Many Cam; Version: 4.0.63
Name: Roboform; Version: 7.9.5.7
...

希望有帮助。

相关问题更多 >

编程相关推荐

热门问题

热门文章