python - 如何用<em>标签替换html中的所有链接标签?
我有一个程序,它可以把一篇长文章的内容缩短成一个简短的摘要,使用的是PHP。不过,我不想在页面加载时就生成这个摘要,而是想把它存储到数据库里。
我已经把其他的都搞定了,唯一的问题是如何把链接标签替换成一个<em>文本</em>
。我不想用beautiful soup来处理这个,尽量想用简单的正则表达式替换,如果可以的话,或者用HTMLParser的其他功能(我在网上搜索了一段时间也没找到合适的)。
这是我现在的代码:
import HTMLParser, string, re
tag_end_re = re.compile(r'(\w+)[^>]*>')
entity_end_re = re.compile(r'(\w+;)')
class StrippingParser(HTMLParser.HTMLParser):
# These are the HTML tags that we will leave intact
valid_tags = ('a')
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.result = ""
self.endTagList = []
def handle_data(self, data):
if data:
self.result = self.result + data
def handle_charref(self, name):
self.result = "%s&#%s;" % (self.result, name)
def handle_entityref(self, name):
if self.entitydefs.has_key(name):
x = ';'
else:
# this breaks unstandard entities that end with ';'
x = ''
self.result = "%s&%s%s" % (self.result, name, x)
def handle_starttag(self, tag, attrs):
""" Delete all tags except for legal ones """
if tag in self.valid_tags:
self.result = self.result + '<' + tag
for k, v in attrs:
if string.lower(k[0:2]) != 'on' and string.lower(v[0:10]) != 'javascript':
self.result = '%s %s="%s"' % (self.result, k, v)
endTag = '</%s>' % tag
self.endTagList.insert(0,endTag)
self.result = self.result + '>'
def handle_endtag(self, tag):
if tag in self.valid_tags:
self.result = "%s</%s>" % (self.result, tag)
remTag = '</%s>' % tag
self.endTagList.remove(remTag)
def cleanup(self):
""" Append missing closing tags """
for j in range(len(self.endTagList)):
self.result = self.result + self.endTagList[j]
def strip(s):
""" Strip illegal HTML tags from string s """
parser = StrippingParser()
parser.feed(s)
parser.close()
parser.cleanup()
return parser.result
def truncate_html(string, length, ellipsis='...'):
"""Truncate HTML string, preserving tag structure and character entities."""
length = int(length)
output_length = 0
i = 0
pending_close_tags = {}
while output_length < length and i < len(string):
c = string[i]
if c == '<':
# probably some kind of tag
if i in pending_close_tags:
# just pop and skip if it's closing tag we already knew about
i += len(pending_close_tags.pop(i))
else:
# else maybe add tag
i += 1
match = tag_end_re.match(string[i:])
if match:
tag = match.groups()[0]
i += match.end()
# save the end tag for possible later use if there is one
match = re.search(r'(</' + tag + '[^>]*>)', string[i:], re.IGNORECASE)
if match:
pending_close_tags[i + match.start()] = match.groups()[0]
else:
output_length += 1 # some kind of garbage, but count it in
elif c == '&':
# possible character entity, we need to skip it
i += 1
match = entity_end_re.match(string[i:])
if match:
i += match.end()
# this is either a weird character or just '&', both count as 1
output_length += 1
else:
# plain old characters
skip_to = string.find('<', i, i + length)
if skip_to == -1:
skip_to = string.find('&', i, i + length)
if skip_to == -1:
skip_to = i + length
# clamp
delta = min(skip_to - i,
length - output_length,
len(string) - i)
output_length += delta
i += delta
output = [string[:i]]
if output_length == length:
output.append(ellipsis)
for k in sorted(pending_close_tags.keys()):
output.append(pending_close_tags[k])
return "".join(output)
def summarize(contents,length):
summary = strip(contents)
summary = truncate_html(summary,length)
return summary
现在调用summarize('我的长文本,里面有<a href="" target="_blank">链接</a>',400)
会返回一个400个字符的摘要(不包括HTML标签),里面还包含链接。我需要帮助把这些链接替换成标签。
1 个回答
0
你可以用一个简单的正则表达式来解决这个问题:
import re
'<em>' + re.search(r'<a.+>(.+)</a>',r'my long text with <a href="" target="_blank">links</a>').group(1) + '</em>'