Last active
December 21, 2015 17:49
-
-
Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.
Revisions
-
mikecarroll revised this gist
Jun 5, 2014. 1 changed file with 17 additions and 4 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,27 +1,39 @@ desc "Normalize Uzbek file." task :uzbek => :environment do require 'csv' counter = 0 entry_counter = 0 previous = nil desc_array = [] new_file = File.open("/Users/woodchip/fixed_UE.txt", 'w') CSV.foreach("/Users/woodchip/UE.txt", { :col_sep => "|" }) do |row| if row.count > 2 row[1] = row[1] + row[2] row.pop end unless row.empty? entry = row[0].strip # p "ENTRY: #{entry}" pos = entry.match(/\w*\.$/).to_s.strip # p "PoS: #{pos}" entry = entry.gsub(/\w*\.$/, '').strip cyrillic = entry.gsub(/\w/u,'').strip # p "CYRILLIC: #{cyrillic}" latin = entry.gsub(/[Ѐ-ӿ\d]+\b/i,'').strip # p "LATIN: #{latin}" desc_array << ["<i>#{pos}</i>", row[1].strip] # p "DESC ARRAY: #{desc_array}" formatted_entry = "#{latin} / #{cyrillic}".gsub(/(^\!|\s\-|\-\s|\s\'|^\/|\/$|\'\s|\s\’|\’\s|\s\,|\,\s|^\?|\'$|\s\!|\s\?)|\([\ -\/]*\)/, '').strip # puts formatted_entry if previous.blank? || previous == entry previous = entry # p "#{previous} || #{entry}" else # p "#{previous} || #{entry}" new_entry = "<b>#{formatted_entry}</b>" hash = {} desc_array.each do |y| if hash[y[0]] @@ -37,6 +49,7 @@ new_entry << "; #{k} #{v}" end end # p new_entry new_file << new_entry + "\n" # p new_entry.encoding.name desc_array = [] -
mikecarroll created this gist
Aug 26, 2013. There are no files selected for viewing.
# Rake task: collapse the pipe-delimited Uzbek dictionary dump into one line
# per headword.
#
# Each input row is `headword pos.|definition`. The trailing `\w*\.` token of
# the first field is taken as the part of speech and stripped from the
# headword. Consecutive rows sharing a headword form one group; the group is
# written as
#
#   headword pos1 def, def; pos2 def
#
# with definitions of the same part of speech joined by ", " and different
# parts of speech joined by "; ".
#
# Paths default to ~/UE.txt (input) and ~/fixed_UE.txt (output) and may be
# overridden with the UZBEK_SOURCE / UZBEK_TARGET environment variables.
#
# Prints a progress count every 1000 rows and the totals when finished.
desc "Normalize Uzbek file."
task :uzbek => :environment do
  # Required here so the task does not depend on the app having loaded CSV.
  require 'csv'

  # File.open / CSV.foreach do not expand "~" themselves.
  source_path = File.expand_path(ENV.fetch('UZBEK_SOURCE', '~/UE.txt'))
  target_path = File.expand_path(ENV.fetch('UZBEK_TARGET', '~/fixed_UE.txt'))

  counter = 0        # non-empty rows read
  entry_counter = 0  # merged entries written
  previous = nil     # headword of the group currently being collected
  desc_array = []    # [pos, definition] pairs of the current group

  # Block form guarantees the output file is closed even if parsing raises.
  File.open(target_path, 'w') do |new_file|
    # Writes the group collected so far under ITS OWN headword, then resets it.
    write_group = lambda do
      unless previous.nil?
        # group_by keeps parts of speech in order of first appearance.
        senses = desc_array.group_by(&:first).map do |pos, pairs|
          "#{pos} #{pairs.map(&:last).join(', ')}"
        end
        new_file << "#{previous} #{senses.join('; ')}\n"
        entry_counter += 1
      end
      desc_array = []
    end

    CSV.foreach(source_path, col_sep: '|') do |row|
      next if row.empty?

      entry = row[0].to_s.strip
      pos = entry.match(/\w*\.$/).to_s.strip
      entry = entry.gsub(/\w*\.$/, '').strip

      # A "|" inside the definition splits it into extra fields; glue them
      # back together. Array#join also turns a nil (empty) field into "".
      description = row.drop(1).join.strip

      # Headword changed: flush the finished group BEFORE recording this row,
      # so the row's definition is not attributed to the previous headword.
      write_group.call unless previous.nil? || previous == entry
      previous = entry
      desc_array << [pos, description]

      counter += 1
      p counter if (counter % 1000).zero?
    end

    # The final group has no following row to trigger its flush.
    write_group.call
  end

  p "Done! ENTRIES COUNT: #{entry_counter}"
  p "Done! TOTAL COUNT: #{counter}"
end