Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save modond/8802375 to your computer and use it in GitHub Desktop.

Select an option

Save modond/8802375 to your computer and use it in GitHub Desktop.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
# Path of the Unix mbox file that the script reads messages from.
MBOX = 'mtsw-test-messages.mbox'
# Path of the file the script writes; it receives one JSON object per line.
OUT_FILE = 'mtsw-test-messages.mbox.json'
def cleanContent(msg):
    """Return the plain text of a message body.

    The body is decoded from quoted-printable and then passed through
    BeautifulSoup so that only its text nodes are kept.

    msg -- the body text of a single message part.

    Returns the concatenated text nodes, or '' when BeautifulSoup fails to
    parse the body.
    """
    # Decode message from "quoted printable" format
    decoded = quopri.decodestring(msg)

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup. Only
    # ordinary exceptions are swallowed, so Ctrl-C (KeyboardInterrupt) and
    # SystemExit still stop a long-running conversion.
    try:
        soup = BeautifulSoup(decoded)
    except Exception:
        return ''
    return ''.join(soup.findAll(text=True))
# Messages are streamed through a generator instead of being collected in
# memory first; see http://wiki.python.org/moin/Generators.
# json needs a trivial encoder to be passed in so that such objects can be
# serialized.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported objects as lists."""

    def default(self, o):
        """Return ``o`` expanded into a list so json can serialize it."""
        as_list = list(o)
        return as_list
# The generator itself...
def gen_json_msgs(mb):
    """Lazily yield the jsonifyMessage() result for each message in ``mb``.

    mb -- object whose ``next()`` method returns the next message, or None
          once there are no messages left.

    A progress line is written to stderr before each message is converted.
    """
    position = 0
    current = mb.next()
    while current is not None:
        sys.stderr.write("Processing message number %d\n" % position)
        sys.stderr.flush()
        yield jsonifyMessage(current)
        position += 1
        current = mb.next()
def jsonifyMessage(msg):
    """Convert one email message into a JSON-serializable dict.

    msg -- a message object providing items(), walk() and, on each part,
           get_content_maintype(), get_content_type(), is_multipart() and
           get_payload().

    Returns a dict holding every header as a key, with these adjustments:
      * 'To', 'Cc' and 'Bcc' (when present and non-empty) become lists of
        addresses with all whitespace removed;
      * 'parts' is a list of {'contentType': ..., 'content': ...} dicts, one
        per non-container part;
      * 'Date' becomes {'$date': <milliseconds since the epoch>}.

    Raises KeyError when the message has no 'Date' header, and re-raises any
    exception from cleanContent() after dumping the offending content to
    stderr.
    """
    # Imported here so this function does not rely on the module's import list.
    import calendar

    json_msg = {'parts': []}

    # Copy every header across, decoding the raw values to unicode.
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # The value is already unicode at this point. Decoding it a second
        # time would make Python 2 ASCII-encode it first, which raises
        # UnicodeEncodeError for any non-ASCII address.
        addresses = json_msg[k]
        for whitespace in ('\n', '\t', '\r', ' '):
            addresses = addresses.replace(whitespace, '')
        json_msg[k] = addresses.split(',')

    for part in msg.walk():
        # Containers have no content of their own. is_multipart() also covers
        # wrappers such as message/rfc822, whose payload is a list of
        # sub-messages rather than a string; walk() visits their children.
        if part.get_content_maintype() == 'multipart' or part.is_multipart():
            continue

        json_part = {}
        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        try:
            json_part['content'] = cleanContent(content)
        except Exception:
            for line in ("Caught Exception during cleanContent(). Displaying content...",
                         "-" * 30,
                         content,
                         "-" * 30):
                sys.stderr.write(line)
                sys.stderr.write("\n")
            # XXX: To ignore this error instead of aborting, replace the
            # re-raise below with:
            #     json_part['content'] = u"ERROR PROCESSING THIS CONTENT"
            raise
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    if then.utcoffset() is not None:
        # The header carried a UTC offset: honor it. time.mktime() would
        # discard the offset and treat the wall-clock time as machine-local.
        seconds = calendar.timegm(then.utctimetuple())
    else:
        # No offset available; fall back to interpreting it as local time.
        seconds = time.mktime(then.timetuple())
    millis = int(seconds * 1000 + then.microsecond // 1000)
    json_msg['Date'] = {'$date': millis}
    return json_msg
# Convert every message in MBOX and write the results to OUT_FILE.
# Both files are managed by ``with`` so they are closed even when converting
# a message raises part-way through the run.
with open(MBOX, 'rb') as mbox_file:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    # Write each message out as a JSON object on a separate line
    # for easy import into MongoDB via mongoimport
    with open(OUT_FILE, 'w') as f:
        for msg in gen_json_msgs(mbox):
            if msg is not None:
                f.write(json.dumps(msg, cls=Encoder) + '\n')
print("All done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment