如何用 Beautiful Soup 4 获取 <td> 内 <font> 标签中的文本
当 Beautiful Soup 4 不识别 td.font.unwrap() 时,我该如何解析下面这段内容?
我应该替换掉 <font> 标签,还是直接用 unwrap 把它解包?
<td align="CENTER">
<font size="+2">横</font>(F
<font size="+2">橫</font>)
</td>
我想要获取的字符串是“横(F橫)”,但我现在得到的是“横(F�)”。
我可以正常调用
y = cols[1].text
其中 cols 是某一行 <tr> 中的 <td> 单元格列表,cols[1] 是该行的第二个单元格。
完整代码如下:
# coding: utf8
from pysqlite2 import dbapi2 as sqlite3
import urllib2
from bs4 import BeautifulSoup
from string import *
# Open a SQLite database that lives only in memory (':memory:'); nothing is
# written to disk. `sqlite3` here is the pysqlite2 dbapi2 module imported above.
conn = sqlite3.connect(':memory:')
# Module-level cursor shared by createTable() and insertChar().
cursor = conn.cursor()
def createTable():
    """Create the ``characters`` table (rank, word, definition).

    Executes the DDL on the module-level ``cursor``; it does not commit.
    """
    ddl = (
        "CREATE TABLE characters\n"
        "(rank INTEGER, word TEXT, definition TEXT)\n"
    )
    cursor.execute(ddl)
def insertChar(rank,word,definition):
    """Insert one row into the ``characters`` table.

    The three values are bound through ``?`` placeholders rather than being
    formatted into the SQL text. Uses the module-level ``cursor`` and does
    not commit; the caller is responsible for ``conn.commit()``.
    """
    statement = (
        "INSERT INTO characters (rank,word,definition)\n"
        "VALUES (?,?,?)"
    )
    row = (rank, word, definition)
    cursor.execute(statement, row)
def main():
    """Scrape the character table and load it into the ``characters`` table.

    Downloads the page, takes the first ``<table>`` inside the first
    ``<blockquote>``, skips the first ``<tr>`` and, for every remaining row,
    prints the text of the second ``<td>`` and inserts it as ``word``.
    ``rank`` and ``definition`` are still placeholders (``0`` and ``"11"``).

    Raises:
        AttributeError: if the page has no ``<blockquote>`` element.
        IndexError: if a row has fewer than two ``<td>`` cells.
    """
    createTable()

    url = 'http://www.zein.se/patrick/3000char.html'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the HTTP connection even if the read fails.
        response.close()

    # The page declares gb2312 but contains characters outside that set, which
    # then decode to U+FFFD. Override the declared charset so Beautiful Soup
    # decodes the raw bytes as gb18030 instead.
    soup = BeautifulSoup(html, from_encoding='gb18030')

    tables = soup.blockquote.table
    # The first <tr> is skipped (presumably a header row -- confirm).
    rows = tables.find_all('tr')[1:]
    for tr in rows:
        cols = tr.find_all('td')
        x = 0               # placeholder rank
        # .text concatenates the text of the cell and of the <font> tags
        # nested inside it, so no unwrap() is needed.
        y = cols[1].text
        z = "11"            # placeholder definition
        print(y)
        insertChar(x, y, z)

    # Commit once, after all rows have been inserted.
    conn.commit()
# Script entry point: runs the scrape as soon as this file is executed.
main()
非常感谢大家的帮助!谢谢!
1 个回答
1
那个网站说它使用的是gb2312编码,但其实并不是。你可以用下面的代码来解决这个问题:
url = 'http://www.zein.se/patrick/3000char.html'
# Override the encoding the page declares (gb2312) so that Beautiful Soup
# decodes the downloaded bytes as gb18030 instead.
soup = bs4.BeautifulSoup(urllib2.urlopen(url).read(), from_encoding='gb18030')
或者你也可以直接用:
# Same call as for gb18030, but decoding the downloaded bytes as gbk.
soup = bs4.BeautifulSoup(urllib2.urlopen(url).read(), from_encoding='gbk')
你的浏览器已经搞定了这个问题,但BeautifulSoup需要一些提示才能正确处理。