Skip to content

Instantly share code, notes, and snippets.

@LikeCarter
Forked from TimRepke/README.md
Last active April 18, 2020 20:21
Show Gist options
  • Select an option

  • Save LikeCarter/e4d43cb47b2d33e743d3117da5c8c465 to your computer and use it in GitHub Desktop.

Select an option

Save LikeCarter/e4d43cb47b2d33e743d3117da5c8c465 to your computer and use it in GitHub Desktop.
PST Archive to RFC822 (*.eml) script

PST Archive to RFC822

This script extracts all emails from an Outlook PST archive and saves them into some output folder as individual RFC822 compliant *.eml files.

Installing the external dependency pypff may not be straightforward (it wasn't for me). I forked the original repository to make it work in Python 3. If you get errors, check their wiki pages for help or try my fork. Below are the steps that worked for me:

Tested on MacOS with Python 3.6.

Install prereqs:

brew install autoconf automake libtool gettext
export PATH=$PATH:/usr/local/Cellar/gettext/0.20.1/bin

Then clone https://github.com/TimRepke/libpff

cd libpff/
./synclibs.sh
./autogen.sh
./configure --enable-python
python setup.py build
sudo python setup.py install

After the binary has been built, create a new file in your build directory. Mine is libpff/build/lib.macosx-10.15-x86_64-3.6/pypff.py. Add the following boilerplate:

# Loader stub: importing this module loads the compiled pypff extension
# under this module's own name.
def __bootstrap__():
   global __bootstrap__, __loader__, __file__
   import sys, pkg_resources, imp
   # Resolve the shared object relative to this module. The file name is
   # build-specific (here: CPython 3.6 on macOS) -- adjust it to match the
   # file your build actually produced.
   __file__ = pkg_resources.resource_filename(__name__,'pypff.cpython-36m-darwin.so')
   # Drop the stub's own global names before the extension is loaded.
   __loader__ = None; del __bootstrap__, __loader__
   # Load the extension using this module's name and the resolved path.
   imp.load_dynamic(__name__,__file__)
__bootstrap__()

You can now import it as

>>> import pypff

Now that everything is installed, you can execute the following script within the same folder as pypff.py

python3 pst2eml.py /path/to/archive.pst /path/to/output/dir

Optionally, you can write the log to a file by adding --logfile=/path/to/log_dir to the command; the log is then written to pst_indexer.log inside that directory.

Full disclaimer: I was inspired by this script, but as you may see, I pretty much threw everything overboard and made my own thing. Only kept the logging and argparse really.

import os
import argparse
import logging
import re
import pypff
def process_folder(folder, path):
    """Recursively export every message below ``folder`` as an ``*.eml`` file.

    Each message is rendered with ``process_message`` and written into the
    module-level ``output_directory`` as ``<sanitised folder path>_<index>.eml``.
    Sub-folders are processed afterwards, depth first.

    Args:
        folder: Folder object exposing ``name``, ``number_of_sub_folders``,
            ``number_of_sub_messages``, ``sub_messages`` and ``sub_folders``.
        path: Slash-separated path of the parent folder; it is extended with
            this folder's name (``'root'`` when the folder has no name) and
            used for logging and for building file names.
    """
    folder_path = path + '/' + (folder.name or 'root')
    n_msg = folder.number_of_sub_messages
    logging.debug('Processing folder "{}" with {} sub-folders and {} messages; full path: "{}"'.format(
        folder.name, folder.number_of_sub_folders, n_msg, folder_path))

    # Lower-case the path, drop everything except [a-z0-9 /], then turn the
    # remaining spaces and slashes into underscores to get a flat file prefix.
    safe_path = re.sub(r'[ /]', '_', re.sub(r'[^a-z0-9 /]', '', folder_path.lower()))

    for mi, message in enumerate(folder.sub_messages):
        logging.debug('{}/{} > Processing message by {} with subject: {}'.format(
            mi, n_msg, message.sender_name, message.subject))
        msg = process_message(message)
        fname = os.path.join(output_directory, safe_path + '_' + str(mi) + '.eml')
        logging.debug(' -- saving as {}'.format(fname))
        # encoding: do not depend on the locale's default codec, which may be
        #   unable to represent non-ASCII message text.
        # newline='': the message already carries explicit CRLF line endings;
        #   without this, platforms that translate '\n' would emit '\r\r\n'.
        with open(fname, 'w', encoding='utf-8', newline='') as f:
            f.write(msg)

    for sub_folder in folder.sub_folders:
        process_folder(sub_folder, folder_path)
def get_body(msg):
    """Pick the first usable body representation of ``msg``.

    The plain-text body is tried first, then the HTML body, then the RTF
    body. A representation is usable when it is non-empty after decoding
    (bytes are decoded as UTF-8) and stripping surrounding whitespace.

    Returns:
        A ``(body_type, text)`` tuple where ``body_type`` is ``'plain-text'``,
        ``'html'`` or ``'rtf'``. When no representation is usable the result
        is ``('plain-text', '')``.
    """
    def _as_text(raw):
        # Bytes that are not valid UTF-8 are treated like a missing body.
        if type(raw) is bytes:
            try:
                raw = raw.decode("utf-8")
            except UnicodeDecodeError:
                return None
        if not raw:
            return None
        return raw.strip()

    candidates = (
        ('plain-text', 'plain_text_body'),
        ('html', 'html_body'),
        ('rtf', 'rtf_body'),
    )
    # Attributes are read lazily, in priority order, stopping at the first hit.
    for label, attribute in candidates:
        text = _as_text(getattr(msg, attribute))
        if text:
            return label, text
    return 'plain-text', ''
def process_message(message):
    """Render ``message`` as text: header lines, a blank line, then the body.

    Header lines are taken from ``message.transport_headers`` (when present):
      * any line starting with ``Content-Type`` is replaced by a fixed
        ``Content-type: text/html`` header;
      * ``Key: value`` lines are kept with the key passed through
        ``str.capitalize``; a key that was already emitted is prefixed
        with ``X-``;
      * lines that do not look like ``Key: value`` are dropped.
    A set of ``X-*`` headers built from the message's own attributes is
    appended afterwards. All lines are terminated with CRLF.

    Args:
        message: Message object exposing ``transport_headers``,
            ``sender_name``, ``subject``, ``delivery_time``,
            ``creation_time``, ``client_submit_time``,
            ``number_of_attachments`` and the body attributes read by
            ``get_body``.

    Returns:
        The rendered message as a single ``str``.
    """
    btype, body = get_body(message)
    headers = []
    seen = set()

    if message.transport_headers:
        for hp in message.transport_headers.split('\n'):
            if hp.startswith('Content-Type'):
                # Hardcode the text/html MIME type
                headers.append('Content-type: text/html')
                continue
            # Strip the CR left over from CRLF line endings instead of
            # requiring it, so LF-only header blocks are parsed as well.
            pts = re.findall(r'^([^:]+): (.+)$', hp.rstrip('\r'))
            if not pts:
                continue
            key = pts[0][0].capitalize()
            if key in seen:
                key = 'X-' + key
            seen.add(key)
            val = pts[0][1]
            if key == 'Date':
                # Keep at most the first two comma-separated parts; strip
                # them so re-joining does not double the space after the comma.
                val = ', '.join(part.strip() for part in val.split(',')[:2])
            headers.append(key + ': ' + val)

    # sender_name / subject may be missing (None); emit an empty value rather
    # than failing on str + None.
    headers.append('X-Sender-Name: ' + (message.sender_name or ''))
    headers.append('X-Delivery-Time: ' + str(message.delivery_time))
    headers.append('X-Creation-Time: ' + str(message.creation_time))
    headers.append('X-Client-Submit-Time: ' + str(message.client_submit_time))
    headers.append('X-Subject: ' + (message.subject or ''))
    headers.append('X-Attachments: ' + str(message.number_of_attachments))
    headers.append('X-Body-Type: ' + btype)

    # Every header line ends in CRLF; one empty line separates headers and body.
    return ''.join(line + '\r\n' for line in headers) + '\r\n' + body
if __name__ == "__main__":
    # Command-line entry point: parse arguments, set up logging, then export
    # every message of the given PST archive into OUTPUT_DIR.
    parser = argparse.ArgumentParser()
    parser.add_argument('PST_FILE', help="PST File Format from Microsoft Outlook")
    parser.add_argument('OUTPUT_DIR', help="Directory of output for temporary and report files.")
    # The value is used as a directory (the log file name is fixed below), so
    # the help text says so.
    parser.add_argument('--logfile', default=None,
                        help='Directory in which the log file pst_indexer.log is written.')
    args = parser.parse_args()

    # Kept at module level on purpose: process_folder reads output_directory
    # as a global.
    output_directory = os.path.abspath(args.OUTPUT_DIR)
    os.makedirs(output_directory, exist_ok=True)

    if args.logfile:
        os.makedirs(args.logfile, exist_ok=True)
        log_path = os.path.join(args.logfile, 'pst_indexer.log')
    else:
        # filename=None makes logging.basicConfig log to the console stream.
        log_path = None
    logging.basicConfig(level=logging.DEBUG, filename=log_path,
                        format='%(asctime)s | %(levelname)s | %(message)s', filemode='w')

    logging.info('Starting Script...')
    pst_file = args.PST_FILE

    # Create and open the archive *before* entering the try block, so that
    # close() is only attempted on a file that was actually opened and a
    # failure to open is reported as-is instead of being masked by an error
    # raised from the cleanup code.
    pff_file = pypff.file()
    pff_file.open(pst_file)
    try:
        process_folder(pff_file.root_folder, os.path.basename(pst_file))
    finally:
        pff_file.close()
    logging.info('Script Complete')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment