从mbox写入html文件

import html
import mailbox
import os

# Convert every message of an mbox file into a standalone HTML page.
the_dir = "/path/to/file"
# os.path.join inserts the separator that plain "+" concatenation left out.
mbox = mailbox.mbox(os.path.join(the_dir, "12394334.mbox"))

html_header = """<!DOCTYPE html> <html> <head> <title>Email message</title> </head> <body>"""
html_footer = '</body></html>'

for message in mbox:
    # An absent header is returned as None; fall back to '' so the string
    # concatenations below cannot raise TypeError.
    mess_from, subject, time_received = (
        '' if message[name] is None else str(message[name])
        for name in ('from', 'subject', 'date')
    )

    # walk() visits nested parts as well (and yields the message itself when
    # it is not multipart), so one loop serves both cases.
    pieces = []
    for part in message.walk():
        if part.is_multipart() or part.get_content_maintype() != 'text':
            # Containers carry no body of their own; attachments are skipped.
            continue
        raw = part.get_payload(decode=True)
        if raw is None:
            continue
        # decode=True only undoes base64 / quoted-printable and yields bytes;
        # str(bytes) would give the "b'...'" repr, so decode with the charset.
        charset = part.get_content_charset() or 'utf-8'
        try:
            pieces.append(raw.decode(charset, errors='replace'))
        except LookupError:
            # The message names a charset Python does not know.
            pieces.append(raw.decode('utf-8', errors='replace'))
    content = ''.join(pieces)

    # str.replace returns a new string; the result has to be kept.
    fname = (subject + " " + time_received).replace('/', '-') + '.html'

    with open(os.path.join(the_dir, 'html', fname), 'w',
              encoding='utf-8') as the_file:
        the_file.write(html_header)
        # Escape header values: "Name <user@host>" would otherwise be parsed
        # as a tag and vanish from the rendered page.
        the_file.write(' ' + 'From: ' + html.escape(mess_from))
        the_file.write(' ' + 'Subject: ' + html.escape(subject))
        the_file.write(' ' + 'Received: ' + html.escape(time_received) + ' ')
        the_file.write(content)
        the_file.write(html_footer)

1条回答

网友

1楼 · 发布于 2024-05-14 20:03:58

我找到了这个问题的答案

首先，我需要通过子类型（part.get_content_subtype()）来识别html部分，这样我才能确定手里的这一部分是html子类型。

然后我需要使用part.get_charsets()获取字符集。另有一个part.get_charset()，但它总是返回None，所以我使用get_charsets()返回列表的第一个元素。

get_payload的行为看起来有点反直觉：参数decode=True并不会把有效负载解码成字符串，它只是还原传输编码（base64/quoted-printable）并返回字节串。因此我再用前面得到的字符集对这些字节进行解码；如果没有得到字符集，我就改用decode=False直接取得字符串形式的有效负载。

如果是文本，我去掉换行符等，添加一个html标题，然后写入文件

下一份工作

使用BeautifulSoup将发件人信息/主题添加到html中
了解如何处理附件并将html文件链接到附件
有些字符仍然没有显示，如“£”等

正文

import html
import mailbox

the_dir = "/path/to/mbox/"

html_footer = "</body></html>"


def header_text(value):
    """Return a header value as a string, or '' when the header is absent."""
    return '' if value is None else str(value)


def decode_part(part):
    """Return the body of a non-container MIME part as text.

    ``get_payload(decode=True)`` only undoes the Content-Transfer-Encoding
    (base64 / quoted-printable) and returns bytes; those bytes still have to
    be decoded with the part's declared charset to become text.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return ''
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The message names a charset Python does not know.
        return raw.decode('utf-8', errors='replace')


def collect_bodies(message):
    """Return ``(contents_html, contents_text)`` for *message*.

    Both are lists of decoded strings, one entry per ``text/html`` part and
    one per other ``text/*`` part. ``walk()`` also yields the message itself
    when it is not multipart, so single-part mail needs no special case.
    """
    contents_html = []
    contents_text = []
    for part in message.walk():
        maintype = part.get_content_maintype()
        if maintype in ('multipart', 'message'):
            # Reject containers
            continue
        if maintype != 'text':
            # Attachments: to be processed in the future.
            continue
        # For text/plain the subtype is 'plain', never 'text', so the
        # text-vs-html decision is made on the subtype of a text/* part.
        if part.get_content_subtype() == 'html':
            contents_html.append(decode_part(part))
        else:
            contents_text.append(decode_part(part))
    return contents_html, contents_text


def render_message(message):
    """Return ``(fname, content)``: output file stem and HTML page text.

    A message with HTML parts is written as that HTML, unchanged. Otherwise
    the plain-text parts are escaped and wrapped in a minimal page that
    starts with the From / Subject / Received lines.
    """
    mess_from = header_text(message['from'])
    subject = header_text(message['subject'])
    time_received = header_text(message['date'])
    fname = (subject + " " + time_received).replace('/', '-')

    contents_html, contents_text = collect_bodies(message)
    if contents_html:
        if len(contents_html) > 1:
            print('multiple html')
        # NOTE: an HTML part that declares another charset in its own <meta>
        # tag may still display incorrectly once saved as UTF-8.
        return fname, '\n\n'.join(contents_html)

    # Escape first, then turn line breaks into <br/>, so literal "<" or "&"
    # in the mail text cannot be interpreted as markup.
    body = '<br/>'.join(html.escape('\n\n'.join(contents_text)).splitlines())
    html_header = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{html.escape(fname)}</title>
</head>
<body>"""
    content = (html_header + '<br/>'
               + 'From: ' + html.escape(mess_from) + '<br/>'
               + 'Subject: ' + html.escape(subject) + '<br/>'
               + 'Received: ' + html.escape(time_received) + '<br/><br/>'
               + body + html_footer)
    return fname, content


def main():
    """Write one HTML file per message of the mbox into ``the_dir``/html/."""
    mbox = mailbox.mbox(the_dir + "12394334.mbox")
    for message in mbox:
        fname, content = render_message(message)
        # Explicit UTF-8 so characters such as the pound sign survive on any
        # platform; it matches the <meta charset> emitted for text pages.
        with open(the_dir + "html/" + fname + ".html", "w",
                  encoding="utf-8") as the_file:
            the_file.write(content)
    print('Done!')


if __name__ == "__main__":
    main()

相关问题更多 >

编程相关推荐

热门问题

热门文章