cornchz · August 31, 2012 13:20 · Aug 31, 2012 · Aug 31, 2012 · Aug 31, 2012 · Aug 31, 2012
diff --git a/convert.py b/convert.py
@@ -28,8 +28,7 @@ def get_xpaths(filename):
 def print_lines(filename, lines):
     with open(filename, 'w') as f:
         for line in lines:
-            line = line.encode('utf-8')
-            f.write(d)
+            f.write(line)
 
 filenames = get_filenames(directory)
 

diff --git a/convert.py b/convert.py
@@ -1,42 +1,42 @@
-#! /usr/bin/python2.7                                                                                                                                         
-# -*- coding: utf-8 -*- 
+#! /usr/bin/python2.7
+# -*- coding: utf-8 -*-
 
 """
 The `.smi` files should be in the `./smi` folder.
 Extracted text will be contained in `.txt` files and located in a `./txt' folder.
-"""                                                                                                                                    
-                                                                                                                                                              
-import html5lib                                                                                                                                               
-import os                                                                                                                                                     
-from glob import glob                                                                                                                                         
-                                                                                                                                                              
-directory = '''./smi/'''                                                                                                                                      
-xpaths = "//body//text()"                                                                                                                                     
-                                                                                                                                                              
-def get_filenames(directory):                                                                                                                                  
-    return glob(os.path.join(directory, '*'))                                                                                                                 
-                                                                                                                                                              
-def get_xpaths(filename):                                                                                                                                      
-    with open(filename, 'r') as f:                                                                                                                            
-        p = html5lib.HTMLParser(\                                                                                                                             
-                tree=html5lib.treebuilders.getTreeBuilder("lxml"),\                                                                                           
-                namespaceHTMLElements=False)                                                                                                                  
-        page = p.parse(f)                                                                                                                                     
-        xp = page.xpath(xpaths)                                                                                                                                                                                                                                                                                             
-        return xp                                                                                                                                             
-                                                                                                                                                              
-def print_txt(filename, data):                                                                                                                                
-    with open(filename, 'w') as f:                                                                                                                            
-        for d in data:                                                                                                                                        
-            d = d.encode('utf-8')                                                                                                                             
-            f.write(d)                                                                                                                                        
-                                                                                                                                                              
-                                                                                                                                                              
-filenames = get_filenames(directory)                                                                                                                           
-                                                                                                                                                              
-for f in filenames:                                                                                                                                           
-    print 'processing ' + f                                                                                                                                   
-    xp = get_xpaths(f)                                                                                                                                         
-    f = 'txt' + f[5:-4] + '.txt'                                                                                                                              
-    print_txt(f, xp)                                                                                                                                          
+"""
+
+import html5lib
+import os
+from glob import glob
+
+directory = '''./smi/'''
+xpaths = "//body//text()"
+
+def get_filenames(directory):
+    return glob(os.path.join(directory, '*'))
+
+def get_xpaths(filename):
+    with open(filename, 'r') as f:
+        p = html5lib.HTMLParser(\
+                tree=html5lib.treebuilders.getTreeBuilder("lxml"),\
+                namespaceHTMLElements=False)
+        page = p.parse(f)
+        xp = page.xpath(xpaths)
+        return xp
+
+def print_lines(filename, lines):
+    with open(filename, 'w') as f:
+        for line in lines:
+            line = line.encode('utf-8')
+            f.write(d)
+
+filenames = get_filenames(directory)
+
+for oldfile in filenames:
+    newfile = 'txt' + oldfile[5:-4] + '.txt'
+    print 'processing ' + oldfile
+    lines = get_xpaths(oldfile)    
+    encoded = (line.encode('utf-8') for line in lines)
+    print_lines(newfile, encoded)
     print 'done'
diff --git a/convert.py b/convert.py
@@ -13,17 +13,16 @@
 directory = '''./smi/'''                                                                                                                                      
 xpaths = "//body//text()"                                                                                                                                     
 
-def getfilenames(directory):                                                                                                                                  
+def get_filenames(directory):                                                                                                                                  
     return glob(os.path.join(directory, '*'))                                                                                                                 
 
-def getxpaths(filename):                                                                                                                                      
+def get_xpaths(filename):                                                                                                                                      
     with open(filename, 'r') as f:                                                                                                                            
         p = html5lib.HTMLParser(\                                                                                                                             
                 tree=html5lib.treebuilders.getTreeBuilder("lxml"),\                                                                                           
                 namespaceHTMLElements=False)                                                                                                                  
         page = p.parse(f)                                                                                                                                     
-        xp = page.xpath(xpaths)                                                                                                                               
-
+        xp = page.xpath(xpaths)                                                                                                                                                                                                                                                                                             
         return xp                                                                                                                                             
 
 def print_txt(filename, data):                                                                                                                                
@@ -33,11 +32,11 @@ def print_txt(filename, data):
             f.write(d)                                                                                                                                        
 
 
-filenames = getfilenames(directory)                                                                                                                           
+filenames = get_filenames(directory)                                                                                                                           
 
 for f in filenames:                                                                                                                                           
     print 'processing ' + f                                                                                                                                   
-    xp = getxpaths(f)                                                                                                                                         
+    xp = get_xpaths(f)                                                                                                                                         
     f = 'txt' + f[5:-4] + '.txt'                                                                                                                              
     print_txt(f, xp)                                                                                                                                          
     print 'done'
diff --git a/convert.py b/convert.py
@@ -0,0 +1,43 @@
+#! /usr/bin/python2.7                                                                                                                                         
+# -*- coding: utf-8 -*- 
+
+"""
+The `.smi` files should be in the `./smi` folder.
+Extracted text will be contained in `.txt` files and located in a `./txt' folder.
+"""                                                                                                                                    
+
+import html5lib                                                                                                                                               
+import os                                                                                                                                                     
+from glob import glob                                                                                                                                         
+
+directory = '''./smi/'''                                                                                                                                      
+xpaths = "//body//text()"                                                                                                                                     
+
+def getfilenames(directory):                                                                                                                                  
+    return glob(os.path.join(directory, '*'))                                                                                                                 
+
+def getxpaths(filename):                                                                                                                                      
+    with open(filename, 'r') as f:                                                                                                                            
+        p = html5lib.HTMLParser(\                                                                                                                             
+                tree=html5lib.treebuilders.getTreeBuilder("lxml"),\                                                                                           
+                namespaceHTMLElements=False)                                                                                                                  
+        page = p.parse(f)                                                                                                                                     
+        xp = page.xpath(xpaths)                                                                                                                               
+
+        return xp                                                                                                                                             
+
+def print_txt(filename, data):                                                                                                                                
+    with open(filename, 'w') as f:                                                                                                                            
+        for d in data:                                                                                                                                        
+            d = d.encode('utf-8')                                                                                                                             
+            f.write(d)                                                                                                                                        
+
+
+filenames = getfilenames(directory)                                                                                                                           
+
+for f in filenames:                                                                                                                                           
+    print 'processing ' + f                                                                                                                                   
+    xp = getxpaths(f)                                                                                                                                         
+    f = 'txt' + f[5:-4] + '.txt'                                                                                                                              
+    print_txt(f, xp)                                                                                                                                          
+    print 'done'
No results found