提高多个string.replace语句的可读性
当我在Python中处理HTML代码时,由于有一些特殊字符,我必须使用以下代码。
# Replace HTML entities in `line` one at a time (Python 2 `string` module API).
# Each call rewrites every occurrence of one entity with its plain-text form.
line = string.replace(line, "&quot;", "\"")
line = string.replace(line, "&apos;", "'")
line = string.replace(line, "&amp;", "&")
line = string.replace(line, "&lt;", "<")
line = string.replace(line, "&gt;", ">")
line = string.replace(line, "&laquo;", "<<")
line = string.replace(line, "&raquo;", ">>")
line = string.replace(line, "&#039;", "'")
line = string.replace(line, "&#8220;", "\"")
line = string.replace(line, "&#8221;", "\"")
line = string.replace(line, "&#8216;", "\'")
line = string.replace(line, "&#8217;", "\'")
line = string.replace(line, "&#9632;", "")
line = string.replace(line, "&#8226;", "-")
看起来还有很多这样的特殊字符需要我去替换。你知道怎么让这段代码更简洁一些吗?
谢谢你
3 个回答
2
优化
import re

# (entity, replacement) pairs shared by the regex and the str.replace variants.
REPL_tu = (("&quot;", "\""), ("&apos;", "'"), ("&amp;", "&"),
           ("&lt;", "<"), ("&gt;", ">"),
           ("&laquo;", "<<"), ("&raquo;", ">>"),
           ("&#039;", "'"),
           ("&#8220;", "\""), ("&#8221;", "\""),
           ("&#8216;", "\'"), ("&#8217;", "\'"),
           ("&#9632;", ""), ("&#8226;", "-"))


def repl(mat, d=dict(REPL_tu)):
    """Return the replacement text for the entity matched by *mat*.

    *d* is the entity -> replacement lookup table. It is built once, when the
    function is defined, and is only read, never modified.
    """
    return d[mat.group()]


# One alternation over all entities; re.escape keeps each entity literal even
# if an entry containing regex metacharacters is added to the table later.
regx = re.compile('|'.join(re.escape(a) for a, b in REPL_tu))

line = 'A tag &lt;bidi&gt; has a &quot;weird&#8220;&#8226;&apos;content&apos;'
modline = regx.sub(repl, line)
# Single-argument parenthesised print gives the same output on Python 2 and 3.
print('Exemple:\n\n' + line + '\n' + modline)
# Benchmark: regex single pass vs. chained str.replace on a downloaded page.
# Written for Python 2 (urllib.urlopen, xrange, time.clock).
from time import clock
from urllib import urlopen

print('\n-----------------------------------------\nDownloading a web source:\n')
sock = urlopen('http://www.mythicalcreaturesworld.com/greek-mythology/monsters/python-the-serpent-of-delphi-%E2%80%93-python-the-guardian-dragon-and-apollo/')
try:
    html_source = sock.read()
finally:
    # Close the connection even if the read fails.
    sock.close()

n = 100

# Variant 1: one regex substitution per iteration.
te = clock()
for i in xrange(n):
    res1 = html_source
    res1 = regx.sub(repl, res1)
# '%s' formats like the Python 2 print statement did, so the output is unchanged.
print('%s %s %s' % ('with regex ', clock() - te, 'seconds'))

# Variant 2: one str.replace call per table entry, per iteration.
te = clock()
for i in xrange(n):
    res2 = html_source
    for entity, replacement in REPL_tu:
        res2 = res2.replace(entity, replacement)
print('%s %s %s' % ('with replace', clock() - te, 'seconds'))

# Check that both variants produced the same text for this page.
print(res1 == res2)
结果
Exemple:
A tag &lt;bidi&gt; has a &quot;weird&#8220;&#8226;&apos;content&apos;
A tag <bidi> has a "weird"-'content'
-----------------------------------------
Downloading a web source:
with regex 0.097578323502 seconds
with replace 0.213866846205 seconds
True
2
这是我之前写的一段代码,用来解码HTML实体。请注意,这段代码是针对Python 2.x的,所以它还会把 `str` 转换成 `unicode`:如果你使用的是现代的Python,可以去掉这部分。我觉得这段代码可以处理所有命名实体、十进制和十六进制实体。出于某种原因,'apos' 这个实体不在Python的命名实体字典里,所以我先把字典复制一份,再补上这个缺失的条目:
import re

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# Work on a copy so the standard-library table is not modified, then add the
# 'apos' entry that the table lacks.
name2codepoint = name2codepoint.copy()
name2codepoint['apos'] = ord("'")

# Group 1: decimal reference, group 2: hexadecimal reference, group 3: name.
# Names may contain digits after the first letter (e.g. frac12, sup2).
EntityPattern = re.compile(
    r'&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));')


def decodeEntities(s, encoding='utf-8'):
    """Return *s* as text with HTML entities replaced by their characters.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;``) and named
    (``&amp;``) references. Unknown names and numeric references outside the
    valid code point range are left in the text unchanged.

    s        -- text, or a byte string that is first decoded with *encoding*.
    encoding -- codec used when *s* is a byte string (default 'utf-8').
    """
    def unescape(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal:
                return _unichr(int(decimal, 10))
            if hexadecimal:
                return _unichr(int(hexadecimal, 16))
        except (ValueError, OverflowError):
            # Code point out of range: keep the reference as written.
            return match.group(0)
        if name in name2codepoint:
            return _unichr(name2codepoint[name])
        return match.group(0)

    # `bytes` is `str` on Python 2, so this is the original str -> unicode step.
    if isinstance(s, bytes):
        s = s.decode(encoding)
    return EntityPattern.sub(unescape, s)
4
# (entity, replacement) pairs, applied to `line` in order.
REPLACEMENTS = [
    ("&quot;", "\""),
    ("&apos;", "'"),
    # ... remaining (entity, replacement) pairs go here
]
for entity, replacement in REPLACEMENTS:
    line = line.replace(entity, replacement)
请注意,`string.replace` 的功能其实可以直接作为 `str` 或 `unicode` 对象的方法来调用(即 `line.replace(...)`)。
更好的是,可以看看 这个问题!
不过,你的问题标题问的其实是另外一回事:优化,也就是让它运行得更快。这是一个完全不同的问题,需要更多的工作。