Skip to content

Instantly share code, notes, and snippets.

@mikecarroll
Last active December 21, 2015 17:49
Show Gist options
  • Select an option

  • Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.

Select an option

Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.

Revisions

  1. mikecarroll revised this gist Jun 5, 2014. 1 changed file with 17 additions and 4 deletions.
    21 changes: 17 additions & 4 deletions uzbek.rb
    Original file line number Diff line number Diff line change
    @@ -1,27 +1,39 @@
    desc "Normalize Uzbek file."
    task :uzbek => :environment do
    require 'csv'

    counter = 0
    entry_counter = 0
    previous = nil
    desc_array = []
    new_file = File.open('~/fixed_UE.txt', 'w')
    new_file = File.open("/Users/woodchip/fixed_UE.txt", 'w')

    CSV.foreach("~/UE.txt", { :col_sep => "|" }) do |row|
    CSV.foreach("/Users/woodchip/UE.txt", { :col_sep => "|" }) do |row|
    if row.count > 2
    row[1] = row[1] + row[2]
    row.pop
    end
    unless row.empty?
    entry = row[0].strip
    # p "ENTRY: #{entry}"
    pos = entry.match(/\w*\.$/).to_s.strip
    # p "PoS: #{pos}"
    entry = entry.gsub(/\w*\.$/, '').strip
    desc_array << [pos, row[1].strip]
    cyrillic = entry.gsub(/\w/u,'').strip
    # p "CYRILLIC: #{cyrillic}"
    latin = entry.gsub(/[Ѐ-ӿ\d]+\b/i,'').strip
    # p "LATIN: #{latin}"
    desc_array << ["<i>#{pos}</i>", row[1].strip]
    # p "DESC ARRAY: #{desc_array}"

    formatted_entry = "#{latin} / #{cyrillic}".gsub(/(^\!|\s\-|\-\s|\s\'|^\/|\/$|\'\s|\s\’|\’\s|\s\,|\,\s|^\?|\'$|\s\!|\s\?)|\([\ -\/]*\)/, '').strip
    # puts formatted_entry
    if previous.blank? || previous == entry
    previous = entry
    # p "#{previous} || #{entry}"
    else
    # p "#{previous} || #{entry}"
    new_entry = entry.dup
    new_entry = "<b>#{formatted_entry}</b>"
    hash = {}
    desc_array.each do |y|
    if hash[y[0]]
    @@ -37,6 +49,7 @@
    new_entry << "; #{k} #{v}"
    end
    end
    # p new_entry
    new_file << new_entry + "\n"
    # p new_entry.encoding.name
    desc_array = []
  2. mikecarroll created this gist Aug 26, 2013.
    56 changes: 56 additions & 0 deletions uzbek.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,56 @@
    # Rake task: collapse a pipe-separated Uzbek dictionary dump into one line
    # per headword.
    #
    # Input  (~/UE.txt):       one sense per line, "<headword> <pos>.|<description>",
    #                          where the part-of-speech tag is the trailing
    #                          "word." of the first column.
    # Output (~/fixed_UE.txt): one line per run of consecutive rows that share a
    #                          headword, formatted as
    #                          "<headword> <pos> <desc>, <desc>; <pos> <desc>".
    #
    # Prints a progress counter every 1000 rows and the totals when finished.
    desc "Normalize Uzbek file."
    task :uzbek => :environment do
      # Loaded here so the Rakefile does not depend on csv being required elsewhere.
      require 'csv'

      pos_pattern = /\w*\.$/

      # File.open / CSV.foreach do not expand "~" themselves; without this they
      # look for a literal "~" directory relative to the working directory.
      source_path = File.expand_path('~/UE.txt')
      target_path = File.expand_path('~/fixed_UE.txt')

      counter = 0
      entry_counter = 0
      previous = nil
      desc_array = []

      # Block form guarantees the output file is closed even if parsing raises.
      File.open(target_path, 'w') do |new_file|
        # Writes the senses collected so far under the headword they were read
        # with (`previous`), then starts a fresh group. No-op when nothing is
        # pending.
        flush_group = lambda do
          unless desc_array.empty?
            senses = Hash.new { |h, k| h[k] = [] }
            desc_array.each { |part, text| senses[part] << text }
            merged = senses.map { |part, texts| "#{part} #{texts.join(', ')}" }.join('; ')
            new_file << "#{previous} #{merged}\n"
            entry_counter += 1
            desc_array = []
          end
        end

        # Options are passed as keywords (no braces) so they are not mistaken
        # for the positional `mode` argument on Ruby 3+.
        CSV.foreach(source_path, :col_sep => "|") do |row|
          next if row.empty? # blank input line

          # A "|" inside the description splits it into extra columns; glue
          # every column after the first back together. nil fields become "".
          description = row[1..-1].join.strip

          entry = row[0].to_s.strip
          pos = entry.match(pos_pattern).to_s.strip
          entry = entry.gsub(pos_pattern, '').strip

          # The headword changed: emit the finished group *before* recording
          # the current row, so its sense is not attached to the wrong headword.
          flush_group.call unless previous == entry
          previous = entry
          desc_array << [pos, description]

          counter += 1
          p counter if counter % 1000 == 0
        end

        # The last group has no following row to trigger a flush.
        flush_group.call
      end

      p "Done! ENTRIES COUNT: #{entry_counter}"
      p "Done! TOTAL COUNT: #{counter}"
    end