How do I scrape a whole integer in Python with Beautiful Soup?

Posted 2024-06-02 08:59:22


I'm scraping wave heights from a few websites, and my code fails once a wave height reaches the double-digit range.

For example: the code currently extracts '12' from the site as a separate '1' and '2' rather than as '12'.

#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites

import csv
import requests
from bs4 import BeautifulSoup

NUM_SITES = 2

reportsFinal = []

###################### SURFLINE URL STRINGS AND TAG ###########################

slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'

slTagText = 'observed-wave-range'
slTag = 'id'

#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]

###############################################################################


#################### MAGICSEAWEED URL STRINGS AND TAG #########################

msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'

msTagText = 'rating-text'
msTag = 'li'

#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]

###############################################################################

'''
This class represents a surf break. It contains all wave, wind, & tide data 
associated with that break relevant to the website
'''
class surfBreak:
    def __init__(self, name, low, high, wind, tide):
        self.name = name
        self.low = low
        self.high = high
        self.wind = wind
        self.tide = tide    

    #toString method    
    def __str__(self):
        return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(self.name, 
            self.low, self.high, self.wind, self.tide)
#END CLASS

'''
This returns the proper attribute from the surf report sites
'''
def reportTagFilter(tag):
    return (tag.has_attr('class') and 'rating-text' in tag['class']) \
        or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
#END METHOD

'''
This method checks if the parameter is of type int
'''
def representsInt(s):
    try: 
        int(s)
        return True

    except ValueError:
        return False
#END METHOD

'''
This method extracts all ints from a list of reports

reports: The list of surf reports from a single website

returns: reportNums - A list of ints of the wave heights
'''
def extractInts(reports):
    print(reports)
    reportNums = []
    afterDash = False
    num = 0

    #extract all ints from the reports and ditch the rest 
    for report in reports:
        for char in report:
            if representsInt(char):

                num = int(char)                 
                reportNums.append(num)

            else:
                afterDash = True

    return reportNums
#END METHOD

'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location

rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each 
         break

tag:     the html tag where the actual report lives on the page

returns: a list of strings of each breaks surf report
'''
def extractReports(rootUrl, urlList, tag, tagText):
    #empty list to hold reports
    reports = []
    reportNums = []
    index = 0

    #loop thru URLs
    for url in urlList:
        try:
            index += 1
            #request page
            request = requests.get(rootUrl + url)

            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')

            #get the tag where surflines report lives
            reportTag = soup.findAll(reportTagFilter)[0]

            reports.append(reportTag.text.strip())      

        #notify if fail 
        except Exception:
            print('scrape failure at URL', index)

    reportNums = extractInts(reports)

    return reportNums
#END METHOD

'''
This method calculates the average of the wave heights
'''
def calcAverages(reportList):
    #empty list to hold averages
    finalAverages = []
    listIndex = 0
    waveIndex = 0

    #loop thru list of reports to calc each breaks ave low and high
    for x in range(0, 6):
        #get low ave
        average = (reportList[listIndex][waveIndex]
            + reportList[listIndex+1][waveIndex]) / NUM_SITES

        finalAverages.append(average)

        waveIndex += 1

    return finalAverages
#END METHOD

slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)

reportsFinal.append(slReports)
reportsFinal.append(msReports)

print('Surfline:     ', slReports)
print('Magicseaweed: ', msReports)

1 answer

User
#1 · Posted 2024-06-02 08:59:22

You're not actually extracting integers but floats, since the values in reports look like ['0.3-0.6 m']. Right now you iterate over every character and convert each one to an int on its own, or discard it, so it's no wonder you only ever get single digits.
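A quick sketch (using a made-up report string of the shape described above) shows why the character-by-character approach can only ever produce single digits:

```python
def represents_int(s):
    # same check as the original representsInt helper
    try:
        int(s)
        return True
    except ValueError:
        return False

report = '0.3-0.6 m'  # hypothetical report text for illustration
digits = [int(ch) for ch in report if represents_int(ch)]
print(digits)  # → [0, 3, 0, 6] — each digit lands in the list on its own
```

The digits of '12' would come out the same way, as a separate 1 and 2.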

Arguably the simplest way is to extract the numbers with a regexp:

import re

FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) {0,1}m")

def extractFloats(reports):
    reportNums = []
    for report in reports:
        groups = re.match(FLOATEXPR, report).groups()
        for group in groups:
            reportNums.append(float(group))
    return reportNums

This expression will match your floats and return them as a list.

Specifically, the expression matches anything with at least one digit before and after a '.', a '-' in between, another such float sequence, and an ending of 'm' or ' m'. It then groups the parts that represent the floats into a tuple. For example, '12.0-3.0m' would yield [12.0, 3.0]. If you expect more digits after the decimal point, add an extra '+' after the second '\d' in each group of the expression.
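For completeness, here is how the answer's extractFloats might be exercised; the reports list is invented for illustration, matching the '0.3-0.6 m' shape mentioned above:

```python
import re

FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) {0,1}m")

def extractFloats(reports):
    reportNums = []
    for report in reports:
        # match each "low-high m" pair and pull out the two groups
        groups = re.match(FLOATEXPR, report).groups()
        for group in groups:
            reportNums.append(float(group))
    return reportNums

print(extractFloats(['0.3-0.6 m', '1.2-1.8m']))  # → [0.3, 0.6, 1.2, 1.8]
```

Note that re.match returns None when a report doesn't fit the pattern (say, 'Flat'), so in real scraping you'd want to check the match object before calling .groups() to avoid an AttributeError.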
