Hi, as I often have to export emails from my IMAP account to my hard drive, I have many folders filled with thousands of .eml files. Unfortunately, searching them for information by hand or with the operating system's built-in tools is unsatisfactory. Therefore I wanted a file containing some of the email headers, so that I would only have to search one file to get all subjects, senders, recipients, etc. The plugin I use for exporting has such a feature, but it only takes the newly exported emails into account, not the mails that were already there. So I decided it would be easier to write a small script that creates such an index file for a complete folder. Here it is. Straightforward apart from the encoding, I think.
#!python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import email.utils
import os
import sys
import time
from email.header import decode_header
from email.parser import Parser
from xml.sax.saxutils import escape

try:
    from urllib import pathname2url
except ImportError:  # the function lives in urllib.request on Python 3
    from urllib.request import pathname2url
def headers_to_unicode(email_obj, header_name):
    """Return the decoded text of one header of a parsed email.

    The header is split into its RFC 2047 fragments with ``decode_header``;
    each byte fragment is decoded with its declared charset (ASCII when none
    is declared) and the fragments are joined with single spaces.

    Args:
        email_obj: Message-like object supporting ``email_obj[header_name]``.
        header_name: Name of the header to read, e.g. ``"subject"``.

    Returns:
        The header as a text string; an empty string if the header is absent.
    """
    raw_header = email_obj[header_name]
    if raw_header is None:
        # Without this guard a missing header is rendered as the text "None"
        # (or raises, depending on the Python version).
        return u""

    decoded_parts = []
    for fragment, charset in decode_header(raw_header):
        if isinstance(fragment, bytes):
            # "replace" keeps one malformed header from aborting the whole run.
            try:
                fragment = fragment.decode(charset or "ascii", "replace")
            except LookupError:
                # The mail declared a charset Python does not know.
                fragment = fragment.decode("ascii", "replace")
        decoded_parts.append(fragment)
    return u" ".join(decoded_parts)
def intro(file):
    """Write the opening HTML of the index page to *file*.

    The page title and heading are the directory that contains this script.
    The table header row (subject, from, to, date) and one spacer row are
    written as well; the table is left open for subsequent rows.

    Args:
        file: Writable text stream accepting unicode strings.
    """
    script_path = __file__
    if isinstance(script_path, bytes):
        # Byte paths are in the filesystem encoding, which is not always UTF-8.
        script_path = script_path.decode(
            sys.getfilesystemencoding() or "utf-8", "replace")
    # Escape once and reuse: the path may contain "&", "<" or ">".
    heading = escape(os.path.dirname(os.path.realpath(script_path)))
    file.write(u"""<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>""" + heading + u"""</title>
</head>
<body>
<h2>""" + heading + u"""</h2>
<table width="99%" border="1">
<tr>
<td><b>Betreff</b></td>
<td><b>Von</b></td>
<td><b>An</b></td>
<td><b>Datum</b></td>
</tr>
<tr>
<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
</tr>""")
def outro(file):
    """Write the closing tags of the index page to *file*.

    Args:
        file: Writable text stream accepting unicode strings.
    """
    closing_markup = u"</table></body></html>"
    file.write(closing_markup)
def row(file, fro, to, date, subject, filepath):
    """Write one table row describing a single email to *file*.

    All values are HTML-escaped before being written: header text comes from
    the emails themselves, and an address such as ``Name <user@host>`` would
    otherwise be swallowed by the browser as an unknown tag.

    Args:
        file: Writable text stream accepting unicode strings.
        fro: Sender text.
        to: Recipient text.
        date: Date text.
        subject: Subject text; becomes the link label.
        filepath: Link target for the email file.
    """
    # The target sits inside a double-quoted attribute, so quotes are escaped too.
    href = escape(filepath, {u'"': u"&quot;"})
    file.write(u"".join([
        u'<tr><td><a href="', href, u'">', escape(subject), u"</a></td>",
        u"<td>", escape(fro), u"</td>",
        u"<td>", escape(to), u"</td>",
        u"<td>", escape(date), u"</td></tr>\n",
    ]))
def _header_timestamp(date_header):
    """Return a Date header as seconds since the epoch (UTC), or 0.0 if unusable."""
    parsed = email.utils.parsedate_tz(date_header) if date_header else None
    if parsed is None:
        return 0.0
    try:
        # mktime_tz honours the UTC offset given in the header.
        return float(email.utils.mktime_tz(parsed))
    except (ValueError, OverflowError):
        return 0.0


def _relative_url(file_path):
    """Return *file_path* as a URL path relative to the current directory."""
    rel_path = os.path.relpath(file_path)
    if not isinstance(rel_path, str):
        # Python 2 unicode path: the URL quoter needs UTF-8 bytes.
        rel_path = rel_path.encode("utf8")
    url = pathname2url(rel_path)
    if isinstance(url, bytes):
        # Percent-encoded output is pure ASCII.
        url = url.decode("ascii")
    return url


def build_mail_list_from_dir(dir_name):
    """Parse the headers of every ``.eml`` file found below *dir_name*.

    Each returned message object gets two extra attributes:

    * ``priv_date`` -- the Date header as seconds since the epoch (UTC);
      0.0 when the header is missing or cannot be parsed, so such mails
      sort first instead of aborting the run.
    * ``priv_path`` -- percent-encoded path of the file relative to the
      current working directory, including any subdirectories walked.

    Args:
        dir_name: Directory to walk recursively.

    Returns:
        List of header-only message objects sorted by ``priv_date``, oldest
        first.
    """
    email_list = []
    for path, _dirs, files in os.walk(dir_name):
        for email_filename in files:
            if not email_filename.endswith(u".eml"):
                continue
            full_path = os.path.join(path, email_filename)
            with open(full_path) as email_file:
                email_obj = Parser().parse(email_file, headersonly=True)
            email_obj.priv_date = _header_timestamp(email_obj['date'])
            email_obj.priv_path = _relative_url(full_path)
            email_list.append(email_obj)
    email_list.sort(key=lambda email_obj: email_obj.priv_date)
    return email_list
def write_index_from_mail_list(email_list, index_path=u"index.html"):
    """Write an HTML index table with one row per email.

    The file is written as UTF-8; characters that cannot be encoded are
    emitted as XML character references. Link targets are taken verbatim
    from each email's ``priv_path``, so the index should be written where
    those paths resolve.

    Args:
        email_list: Iterable of message objects carrying ``priv_date``
            (seconds since the epoch) and ``priv_path`` (link target).
        index_path: File to create or overwrite; defaults to ``index.html``
            in the current working directory.
    """
    with codecs.open(index_path, encoding=u'utf-8', mode=u"w",
                     errors=u"xmlcharrefreplace") as index:
        intro(index)
        for email_obj in email_list:
            # Headers may carry RFC 2047 encoded words (international text).
            to = headers_to_unicode(email_obj, "to")
            fro = headers_to_unicode(email_obj, "from")
            subject = headers_to_unicode(email_obj, "subject")
            date = time.asctime(time.gmtime(email_obj.priv_date))
            if isinstance(date, bytes):
                # asctime output is plain ASCII; make it a text string.
                date = date.decode("ascii")
            row(index, fro, to, date, subject, email_obj.priv_path)
        outro(index)
def main():
    """Index every email below ``./Emls`` and write the HTML index file."""
    write_index_from_mail_list(build_mail_list_from_dir(u"./Emls"))


# Run only when executed as a script, not when imported.
if __name__ == u"__main__":
    main()