import sys
import os
import re
import codecs

try:
    from lxml import etree
except ImportError:
    # Fall back to the API-compatible standard-library parser so the module
    # stays usable where lxml is not installed.
    from xml.etree import ElementTree as etree


def parse_big_xml(filename, item_tag=u'item'):
    """Lazily yield each ``<item_tag>`` element found in a large XML file.

    The file is read line by line as UTF-8, so only one item is held in
    memory at a time.  An item starts at a line that, once surrounding
    whitespace is stripped, is exactly ``<item_tag>`` and ends at the next
    line that is exactly ``</item_tag>``.  Opening tags that carry
    attributes, tags sharing a line with other content, and nested items
    with the same tag name are not recognised.

    Args:
        filename: Path of the UTF-8 encoded XML file to scan.
        item_tag: Name of the element that delimits one item.

    Yields:
        One parsed element (``etree.fromstring`` result) per complete item.

    Raises:
        IOError/OSError: If the file cannot be opened.
        A parser error from ``etree.fromstring`` if the text of an item is
        not well-formed XML.
    """
    # Build the delimiters once instead of on every line.
    open_tag = u'<%s>' % (item_tag,)
    close_tag = u'</%s>' % (item_tag,)

    with codecs.open(filename, 'r', 'utf-8') as in_file:
        chunks = []
        inside_item = False
        for line in in_file:
            # Lines keep their newline (and may be indented), so compare
            # the stripped text against the bare tags.
            stripped = line.strip()
            if stripped == open_tag:
                chunks = [line]
                inside_item = True
            elif inside_item:
                chunks.append(line)
                if stripped == close_tag:
                    inside_item = False
                    yield etree.fromstring(u''.join(chunks))
                    chunks = []
            # Text outside an item is irrelevant and is not buffered.


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILENAME [ITEM_TAG]' % (sys.argv[0],))
    # The tag argument is optional, mirroring the function's default.
    tag = sys.argv[2] if len(sys.argv) > 2 else u'item'
    for item in parse_big_xml(sys.argv[1], tag):
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(item)