Index.html for Directory of EML-files

Hi, as I often have to export emails from my IMAP account to my hard drive, I have many folders filled with thousands of .eml files. Unfortunately, searching them for information by hand or with the operating system's built-in tools is unsatisfactory. Therefore I wanted a file containing some of the email headers, so that I would only have to search one file to get all subjects, senders, receivers, etc. The plugin I was using for exporting has such a feature, but it only takes the exported emails into account, not the mails which were already there. So I decided it would be easier to write a small script which creates such an index file for a complete folder. Here it is. Straightforward apart from the encoding, I think.

#!python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import codecs
import email.utils
import os
import time
from email.header import decode_header
from email.parser import Parser
from urllib import pathname2url
from xml.sax.saxutils import escape

def headers_to_unicode(email_obj, header_name):
    """Return the decoded text of one header of a parsed message.

    The raw header may consist of several RFC 2047 encoded words, each with
    its own charset; ``decode_header`` splits it into ``(value, charset)``
    pairs, which are decoded one by one and joined with single spaces.

    Args:
        email_obj: Message-like object supporting ``email_obj[header_name]``.
        header_name: Name of the header to read, e.g. ``"subject"``.

    Returns:
        The header as a text (unicode) string. A header that is absent from
        the message (``email_obj[header_name]`` is ``None``) yields an empty
        string instead of being passed on to ``decode_header``.
    """
    raw_header = email_obj[header_name]
    if raw_header is None:
        return u""

    parts = []
    for value, charset in decode_header(raw_header):
        # decode_header may hand back already-decoded text for plain
        # headers; only byte strings still need decoding.
        if isinstance(value, bytes):
            try:
                value = value.decode(charset or "ascii", "replace")
            except LookupError:
                # Unknown charset label in the mail: keep what is readable
                # rather than aborting the whole index.
                value = value.decode("ascii", "replace")
        parts.append(value)
    return u" ".join(parts)

def intro(file):
    """Write the opening part of the index page to *file*.

    Emits the HTML head, a heading and the start of the table including its
    header row (Betreff/Von/An/Datum) and one spacer row. The directory that
    contains this script is used as both the page title and the heading.

    Args:
        file: Writable text stream; only its ``write`` method is used.
    """
    script_path = __file__
    # __file__ is a byte string on Python 2; decode it before building text.
    if isinstance(script_path, bytes):
        script_path = script_path.decode("utf-8")
    # Computed once and HTML-escaped so that directory names containing
    # '&', '<' or '>' cannot break the markup.
    directory = escape(os.path.dirname(os.path.realpath(script_path)))

    file.write(u"""<html>
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    <title>""" + directory + u"""</title>
    </head>
    <body>
    <h2>""" + directory + u"""</h2>
    <table width="99%" border="1">
        <tr>
            <td><b>Betreff</b></td>
            <td><b>Von</b></td>
            <td><b>An</b></td>
            <td><b>Datum</b></td>
        </tr>
        <tr>
            <td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>
        </tr>""")

def outro(file):
    """Write the closing tags of the table, body and document to *file*."""
    closing_markup = u"</table></body></html>"
    file.write(closing_markup)

def row(file, fro, to, date, subject, filepath):
    """Write one table row describing a single mail to *file*.

    The row has four cells: the subject (as a link to *filepath*), the
    sender, the recipient and the date. All values are HTML-escaped before
    being written, because they come from mail headers: without escaping an
    address such as ``Name <user@example.org>`` would be parsed as a tag,
    and a crafted subject could inject markup into the page.

    Args:
        file: Writable text stream; only its ``write`` method is used.
        fro: Sender text.
        to: Recipient text.
        date: Date text.
        subject: Subject text, used as the link label.
        filepath: Link target placed in the ``href`` attribute.
    """
    # Inside a double-quoted attribute the quote character must be escaped
    # as well; escape() only handles '&', '<' and '>' by default.
    link_target = escape(filepath, {u'"': u"&quot;"})
    cells = (
        u'<a href="' + link_target + u'">' + escape(subject) + u'</a>',
        escape(fro),
        escape(to),
        escape(date),
    )
    file.write(
        u"<tr>"
        + u"".join(u"<td>" + cell + u"</td>" for cell in cells)
        + u"</tr>\n"
    )

def _parse_timestamp(date_header):
    """Convert a Date header into seconds since the epoch (UTC-based)."""
    parsed = email.utils.parsedate_tz(date_header) if date_header else None
    if parsed is None:
        # Missing or unparseable date: fall back to the epoch so the mail
        # still gets indexed and simply sorts first.
        return 0.0
    try:
        # mktime_tz honours the UTC offset given in the header.
        return float(email.utils.mktime_tz(parsed))
    except (OverflowError, ValueError):
        return 0.0


def _relative_url(file_path):
    """Turn a file system path into URL text usable as a link target."""
    url_source = os.path.normpath(file_path)
    # On Python 2 the quoting done by pathname2url needs a byte string.
    if str is bytes and not isinstance(url_source, str):
        url_source = url_source.encode("utf8")
    url = pathname2url(url_source)
    if isinstance(url, bytes):
        url = url.decode("utf8")
    return url


def build_mail_list_from_dir(dir_name):
    """Parse the headers of every ``.eml`` file below *dir_name*.

    The directory is walked recursively. Each parsed message gets two extra
    attributes:

    * ``priv_date`` -- the Date header as seconds since the epoch, taking
      the header's UTC offset into account; ``0.0`` when the header is
      missing or cannot be parsed.
    * ``priv_path`` -- the URL-quoted, normalised path of the file (the path
      as produced by the walk, so it is relative whenever *dir_name* is),
      which also points to the right place for files in subdirectories.

    Args:
        dir_name: Directory to scan.

    Returns:
        List of message objects sorted by ``priv_date``, oldest first.
    """
    email_list = []
    for (path, _dirs, files) in os.walk(dir_name):
        for email_filename in files:
            if not email_filename.endswith(u".eml"):
                continue
            full_path = os.path.join(path, email_filename)
            with open(full_path) as email_file:
                email_obj = Parser().parse(email_file, headersonly=True)
            email_obj.priv_date = _parse_timestamp(email_obj['date'])
            email_obj.priv_path = _relative_url(full_path)
            email_list.append(email_obj)
    # Sort them by date.
    email_list.sort(key=lambda email_obj: email_obj.priv_date)
    return email_list

def write_index_from_mail_list(email_list):
    """Write ``index.html`` with one table row per message in *email_list*.

    Each message must carry the ``priv_date`` and ``priv_path`` attributes;
    the To, From and Subject headers are decoded via ``headers_to_unicode``
    because they may contain internationalised (encoded) text. The date is
    rendered from ``priv_date`` with ``time.gmtime``/``time.asctime``.
    """
    with codecs.open(u"index.html", encoding=u'utf-8', mode=u"w", errors=u"xmlcharrefreplace") as page:
        intro(page)
        for message in email_list:
            recipient = headers_to_unicode(message, "to")
            sender = headers_to_unicode(message, "from")
            title = headers_to_unicode(message, "subject")
            link = message.priv_path
            utc_time = time.gmtime(message.priv_date)
            stamp = unicode(time.asctime(utc_time), u"utf8")
            row(page, fro=sender, to=recipient, date=stamp, subject=title, filepath=link)
        outro(page)

def main():
    """Collect the mails below ``./Emls`` and write the index for them."""
    write_index_from_mail_list(build_mail_list_from_dir(u"./Emls"))


# Run only when executed as a script, not when imported.
if __name__ == u"__main__":
    main()

Index.html for Directory of EML-files

Published

Category

Tags