Python 正则表达式在示例代码中有效，但在实际代码中不起作用

import re

# Sample input: a copyright sign (\xa9), newlines, tabs and a run of
# non-breaking spaces (\xa0) mixed in with ordinary text.
sample_text = '\xa9 Copyright 2009-10 \n\t\t\t\t All Rights Reserved. (Best viewed in 1024x768 \n\t\t\t\tresolution & IE 6.0) break\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 \nChief Engineer'

reSpace = re.compile(' +')  # one or more consecutive ordinary spaces
reUni = re.compile('(\\xa9|\\n|\\t|\\xa0)')  # characters to delete outright

# Collapse runs of spaces first, then delete the unwanted characters.
# Because the deletion happens second, spaces that were separated only by
# a removed character (e.g. " \n\t ") can end up adjacent in the result.
cleaned = reSpace.sub(' ', sample_text)
cleaned = reUni.sub('', cleaned)
print(cleaned)

import io
import os
import re

from bs4 import BeautifulSoup

# Characters deleted from every extracted string: copyright sign, newline,
# tab and non-breaking space.
reUni = re.compile('((\\xa9)|(\\n)|(\\t)|(\\xa0))')
reSpace = re.compile(' +')  # runs of ordinary spaces, collapsed to one

page = "filename.html"  # html file which needs to be parsed

with open(page, 'r') as fread:
    soup = BeautifulSoup(fread.read())

# Drop the ".htm"/".html" extension so the remainder can be used as a
# folder name.
# NOTE(review): `site` is not defined anywhere in this snippet; it has to be
# provided by the surrounding script before this line runs.
page = site + "_parsed/" + os.path.splitext(page)[0] + "_data"

if not os.path.exists(page):  # creates the output folder
    os.makedirs(page)

filehandle = {}  # maps each tag name seen so far to its open output file

try:
    for tag in soup.find_all():
        if not tag.string:  # the tag has no single child string
            continue

        # First time this tag name is seen: create the file that collects
        # its strings.
        if tag.name not in filehandle:
            tagStrFile = page + "/" + tag.name + "_str.txt"
            filehandle[tag.name] = io.open(tagStrFile, 'w', encoding='utf-8')

        # Work on the text itself. The earlier repr(tag.string)[2:-1] turned
        # real characters into escape text (a backslash followed by "xa0",
        # "n", ...), which the patterns above can never match.
        text = reUni.sub('', tag.string)
        text = reSpace.sub(' ', text).strip()
        if not text:  # nothing left after cleaning; skip blank entries
            continue

        filehandle[tag.name].write(text + u"\n")
finally:
    # Close every file that was opened, even if parsing failed part-way.
    for handle in filehandle.values():
        handle.close()

INTRODUCTION SETUP \xa0STRUCTURE \n OF THE ORGANISATION The Category wise position as on 31-03-2012 of the Sanctioned Strength \n and the Vacant Posts. Sr.No. Name \n of the Post/Designation Sanctioned \n Strength

1条回答

网友

1楼 · 发布于 2024-04-25 20:04:13

要将多个空格（包括不间断空格）折叠为一个，只需一个正则表达式：

re.sub(ur'[\s\xa0]+', u' ', samplestr)

演示：

>>> import re
>>> samplestr = u'\xa9 Copyright 2009-10 \n\t\t\t\t All Rights Reserved. (Best viewed in 1024x768 \n\t\t\t\tresolution & IE 6.0)                    break\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 \nChief Engineer'
>>> re.sub(ur'[\s\xa0]+', u' ', samplestr)
u'\xa9 Copyright 2009-10 All Rights Reserved. (Best viewed in 1024x768 resolution & IE 6.0) break Chief Engineer'

相关问题更多 >

编程相关推荐

热门问题

热门文章