#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Extract the text found under ``<body>`` of every file in `./smi`.

The `.smi` files should be in the `./smi` folder. Extracted text will be
contained in `.txt` files and located in a `./txt` folder, which is created
when it does not exist yet.
"""

import io
import os
from glob import glob

# Folder that is scanned for input files.
directory = '''./smi/'''
# Folder that receives one `.txt` file per input file.
output_directory = 'txt'
# XPath query selecting every text node below <body>.
xpaths = "//body//text()"


def get_filenames(directory):
    """Return the paths of all entries directly inside *directory*."""
    return glob(os.path.join(directory, '*'))


def get_output_filename(oldfile, out_dir=None):
    """Return the `.txt` path that corresponds to the input path *oldfile*.

    The result is ``<out_dir>/<base name of oldfile without extension>.txt``.
    *out_dir* defaults to the module-level ``output_directory``.

    The name is derived with ``os.path`` helpers instead of fixed slice
    offsets so it stays correct for any input folder and any extension
    length (including files without an extension).
    """
    if out_dir is None:
        out_dir = output_directory
    stem = os.path.splitext(os.path.basename(oldfile))[0]
    return os.path.join(out_dir, stem + '.txt')


def get_xpaths(filename):
    """Parse *filename* as HTML and return the results of the ``xpaths`` query.

    The file is parsed with html5lib using the lxml tree builder, and the
    module-level ``xpaths`` expression is evaluated on the resulting tree.
    """
    # Imported here rather than at module level so the path and file helpers
    # remain importable on machines without html5lib/lxml installed.
    import html5lib

    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    # Binary mode: hand html5lib the raw bytes so that it performs the
    # character-encoding detection itself.
    with open(filename, 'rb') as f:
        page = parser.parse(f)
    return page.xpath(xpaths)


def print_lines(filename, lines):
    """Write every item of *lines* to *filename*, encoded as UTF-8.

    Items may be text or UTF-8 encoded byte strings; nothing is inserted
    between items. An existing file is overwritten.
    """
    with io.open(filename, 'w', encoding='utf-8') as f:
        for line in lines:
            # Callers that still pass pre-encoded UTF-8 bytes keep working.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            f.write(line)


def main():
    """Convert every regular file in ``directory`` to a `.txt` file."""
    # Writing would fail if the output folder were missing.
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    for oldfile in get_filenames(directory):
        # The glob also matches sub-directories, which cannot be parsed.
        if not os.path.isfile(oldfile):
            continue
        newfile = get_output_filename(oldfile)
        print('processing ' + oldfile)
        print_lines(newfile, get_xpaths(oldfile))
    print('done')


if __name__ == '__main__':
    main()