Skip to content

Instantly share code, notes, and snippets.

@cornchz
Forked from e9t/README.md
Created August 31, 2012 13:20
Show Gist options
  • Select an option

  • Save cornchz/3552576 to your computer and use it in GitHub Desktop.

Select an option

Save cornchz/3552576 to your computer and use it in GitHub Desktop.

Revisions

  1. cornchz revised this gist Aug 31, 2012. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -28,8 +28,7 @@ def get_xpaths(filename):
    def print_lines(filename, lines):
    with open(filename, 'w') as f:
    for line in lines:
    line = line.encode('utf-8')
    f.write(d)
    f.write(line)

    filenames = get_filenames(directory)

  2. cornchz revised this gist Aug 31, 2012. 1 changed file with 37 additions and 37 deletions.
    74 changes: 37 additions & 37 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -1,42 +1,42 @@
    #! /usr/bin/python2.7
    # -*- coding: utf-8 -*-
    #! /usr/bin/python2.7
    # -*- coding: utf-8 -*-

    """
    The `.smi` files should be in the `./smi` folder.
    Extracted text will be written to `.txt` files located in the `./txt` folder.
    """
    import html5lib
    import os
    from glob import glob
    # Folder whose entries are globbed as input; the module docstring expects
    # the `.smi` files to live here.
    directory = './smi/'
    # XPath expression evaluated against each parsed document.
    xpaths = '//body//text()'
    def get_filenames(directory):
        """Return the paths of every entry matched by `*` inside `directory`.

        The result is exactly what `glob` yields for that pattern, so it is an
        empty list when nothing matches.
        """
        pattern = os.path.join(directory, '*')
        matches = glob(pattern)
        return matches
    def get_xpaths(filename):
        """Parse `filename` with html5lib and evaluate `xpaths` on the result.

        The document is built with html5lib's "lxml" tree builder with element
        namespacing turned off; the return value is whatever the tree's
        `xpath()` call produces for the module-level `xpaths` expression.
        """
        with open(filename, 'r') as source:
            tree_builder = html5lib.treebuilders.getTreeBuilder("lxml")
            parser = html5lib.HTMLParser(tree=tree_builder,
                                         namespaceHTMLElements=False)
            document = parser.parse(source)
        return document.xpath(xpaths)
    def print_txt(filename, data):
        """Write every item of `data` to `filename` as UTF-8.

        Args:
            filename: Path of the file to create or overwrite.
            data: Iterable of text items; each one is encoded with UTF-8 and
                written in order, with nothing inserted between items.
        """
        # The items are encoded to bytes before being written, so the file is
        # opened in binary mode: text mode rejects bytes on Python 3 and
        # rewrites newline bytes on Windows.
        with open(filename, 'wb') as f:
            f.writelines(d.encode('utf-8') for d in data)
    filenames = get_filenames(directory)

    # The output files are written into ./txt; create it up front so the first
    # write does not fail when the folder is missing.
    if not os.path.isdir('txt'):
        os.makedirs('txt')

    for smi_path in filenames:
        print('processing ' + smi_path)
        xp = get_xpaths(smi_path)
        # Derive the output name from the input's base name instead of slicing
        # fixed character counts, so inputs whose folder prefix or extension
        # length differs from "./smi/" + ".smi" still map to a sensible name.
        stem = os.path.splitext(os.path.basename(smi_path))[0]
        print_txt(os.path.join('txt', stem + '.txt'), xp)
    """

    import html5lib
    import os
    from glob import glob

    # Input folder: every entry inside it is handed to the converter.
    directory = './smi/'
    # XPath query run on each parsed file to pick out what gets written.
    xpaths = '//body//text()'

    def get_filenames(directory):
        """List the paths matched by the `*` wildcard inside `directory`."""
        wildcard = os.path.join(directory, '*')
        return glob(wildcard)

    def get_xpaths(filename):
        """Return the `xpaths` matches found in the file at `filename`.

        The file is parsed by an html5lib `HTMLParser` configured with the
        "lxml" tree builder and `namespaceHTMLElements=False`; the module-level
        `xpaths` expression is then evaluated on the parsed tree.
        """
        with open(filename, 'r') as handle:
            parser_options = {
                'tree': html5lib.treebuilders.getTreeBuilder("lxml"),
                'namespaceHTMLElements': False,
            }
            parsed = html5lib.HTMLParser(**parser_options).parse(handle)
        matches = parsed.xpath(xpaths)
        return matches

    def print_lines(filename, lines):
        """Write `lines` to `filename` as UTF-8 bytes.

        Args:
            filename: Path of the file to create or overwrite.
            lines: Iterable of lines. Items that are already byte strings are
                written unchanged; any other item is encoded with UTF-8 first.
                Nothing is inserted between items.
        """
        # Binary mode keeps the encoded bytes exactly as given.
        with open(filename, 'wb') as f:
            for line in lines:
                # Callers may pass lines they have already encoded; encoding a
                # byte string a second time would fail on non-ASCII content.
                if not isinstance(line, bytes):
                    line = line.encode('utf-8')
                f.write(line)

    filenames = get_filenames(directory)

    # Converted files go into ./txt; make sure it exists before writing.
    if not os.path.isdir('txt'):
        os.makedirs('txt')

    for oldfile in filenames:
        # Name the output after the input's base name rather than slicing a
        # fixed number of characters, which only works for "./smi/" inputs
        # with a three-letter extension.
        basename = os.path.splitext(os.path.basename(oldfile))[0]
        newfile = os.path.join('txt', basename + '.txt')
        print('processing ' + oldfile)
        lines = get_xpaths(oldfile)
        # Lazily encode each extracted line; print_lines receives byte strings.
        encoded = (line.encode('utf-8') for line in lines)
        print_lines(newfile, encoded)

    print('done')
  3. @e9t e9t revised this gist Aug 31, 2012. 1 changed file with 5 additions and 6 deletions.
    11 changes: 5 additions & 6 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -13,17 +13,16 @@
    directory = '''./smi/'''
    xpaths = "//body//text()"

    def getfilenames(directory):
    def get_filenames(directory):
    return glob(os.path.join(directory, '*'))

    def getxpaths(filename):
    def get_xpaths(filename):
    with open(filename, 'r') as f:
    p = html5lib.HTMLParser(\
    tree=html5lib.treebuilders.getTreeBuilder("lxml"),\
    namespaceHTMLElements=False)
    page = p.parse(f)
    xp = page.xpath(xpaths)

    xp = page.xpath(xpaths)
    return xp

    def print_txt(filename, data):
    @@ -33,11 +32,11 @@ def print_txt(filename, data):
    f.write(d)


    filenames = getfilenames(directory)
    filenames = get_filenames(directory)

    for f in filenames:
    print 'processing ' + f
    xp = getxpaths(f)
    xp = get_xpaths(f)
    f = 'txt' + f[5:-4] + '.txt'
    print_txt(f, xp)
    print 'done'
  4. @e9t e9t created this gist Aug 31, 2012.
    43 changes: 43 additions & 0 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,43 @@
    #! /usr/bin/python2.7
    # -*- coding: utf-8 -*-

    """
    The `.smi` files should be in the `./smi` folder.
    Extracted text will be written to `.txt` files located in the `./txt` folder.
    """

    import html5lib
    import os
    from glob import glob

    # Where the input files are looked up (everything inside is processed).
    directory = './smi/'
    # XPath expression used to select content from each parsed input file.
    xpaths = '//body//text()'

    def getfilenames(directory):
        """Return the glob matches for `<directory>/*` as a list of paths."""
        everything = os.path.join(directory, '*')
        found = glob(everything)
        return found

    def getxpaths(filename):
        """Open `filename`, parse it with html5lib, and apply `xpaths` to it.

        Parsing uses the "lxml" tree builder with `namespaceHTMLElements`
        disabled. The value returned is the result of calling `xpath()` on the
        parsed document with the module-level `xpaths` expression.
        """
        with open(filename, 'r') as f:
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("lxml"),
                namespaceHTMLElements=False,
            )
            page = parser.parse(f)
            return page.xpath(xpaths)

    def print_txt(filename, data):
        """Encode each item of `data` as UTF-8 and write it to `filename`.

        Args:
            filename: Path of the output file; it is created or truncated.
            data: Iterable of text items, written back to back in order.
        """
        # Opened in binary mode because the loop writes encoded bytes: text
        # mode refuses bytes on Python 3 and would translate newline bytes on
        # Windows.
        with open(filename, 'wb') as f:
            for d in data:
                f.write(d.encode('utf-8'))


    filenames = getfilenames(directory)

    # Output is written under ./txt; create the folder if it is not there yet
    # so opening the first output file does not fail.
    if not os.path.isdir('txt'):
        os.makedirs('txt')

    for source_path in filenames:
        print('processing ' + source_path)
        xp = getxpaths(source_path)
        # Build "txt/<name>.txt" from the input's base name; slicing fixed
        # offsets only works for "./smi/" paths with a 4-character extension.
        name = os.path.splitext(os.path.basename(source_path))[0]
        target_path = os.path.join('txt', name + '.txt')
        print_txt(target_path, xp)

    print('done')