-
-
Save cornchz/3552576 to your computer and use it in GitHub Desktop.
Revisions
-
cornchz revised this gist
Aug 31, 2012. 1 changed file with 1 addition and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,8 +28,7 @@ def get_xpaths(filename): def print_lines(filename, lines): with open(filename, 'w') as f: for line in lines: f.write(line) filenames = get_filenames(directory) -
cornchz revised this gist
Aug 31, 2012. 1 changed file with 37 additions and 37 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,42 +1,42 @@ #! /usr/bin/python2.7 # -*- coding: utf-8 -*- """ The `.smi` files should be in the `./smi` folder. Extracted text will be contained in `.txt` files and located in a `./txt' folder. """ import html5lib import os from glob import glob directory = '''./smi/''' xpaths = "//body//text()" def get_filenames(directory): return glob(os.path.join(directory, '*')) def get_xpaths(filename): with open(filename, 'r') as f: p = html5lib.HTMLParser(\ tree=html5lib.treebuilders.getTreeBuilder("lxml"),\ namespaceHTMLElements=False) page = p.parse(f) xp = page.xpath(xpaths) return xp def print_lines(filename, lines): with open(filename, 'w') as f: for line in lines: line = line.encode('utf-8') f.write(d) filenames = get_filenames(directory) for oldfile in filenames: newfile = 'txt' + oldfile[5:-4] + '.txt' print 'processing ' + oldfile lines = get_xpaths(oldfile) encoded = (line.encode('utf-8') for line in lines) print_lines(newfile, encoded) print 'done' -
e9t revised this gist
Aug 31, 2012. 1 changed file with 5 additions and 6 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,17 +13,16 @@ directory = '''./smi/''' xpaths = "//body//text()" def get_filenames(directory): return glob(os.path.join(directory, '*')) def get_xpaths(filename): with open(filename, 'r') as f: p = html5lib.HTMLParser(\ tree=html5lib.treebuilders.getTreeBuilder("lxml"),\ namespaceHTMLElements=False) page = p.parse(f) xp = page.xpath(xpaths) return xp def print_txt(filename, data): @@ -33,11 +32,11 @@ def print_txt(filename, data): f.write(d) filenames = get_filenames(directory) for f in filenames: print 'processing ' + f xp = get_xpaths(f) f = 'txt' + f[5:-4] + '.txt' print_txt(f, xp) print 'done' -
e9t created this gist
Aug 31, 2012. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,43 @@ #! /usr/bin/python2.7 # -*- coding: utf-8 -*- """ The `.smi` files should be in the `./smi` folder. Extracted text will be contained in `.txt` files and located in a `./txt' folder. """ import html5lib import os from glob import glob directory = '''./smi/''' xpaths = "//body//text()" def getfilenames(directory): return glob(os.path.join(directory, '*')) def getxpaths(filename): with open(filename, 'r') as f: p = html5lib.HTMLParser(\ tree=html5lib.treebuilders.getTreeBuilder("lxml"),\ namespaceHTMLElements=False) page = p.parse(f) xp = page.xpath(xpaths) return xp def print_txt(filename, data): with open(filename, 'w') as f: for d in data: d = d.encode('utf-8') f.write(d) filenames = getfilenames(directory) for f in filenames: print 'processing ' + f xp = getxpaths(f) f = 'txt' + f[5:-4] + '.txt' print_txt(f, xp) print 'done'