Skip to content

Instantly share code, notes, and snippets.

@mikecarroll
Last active December 21, 2015 17:49
Show Gist options
  • Select an option

  • Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.

Select an option

Save mikecarroll/6342821 to your computer and use it in GitHub Desktop.
require 'csv'

# Pure helpers for the +uzbek+ rake task. They hold the text handling so the
# task body only deals with file I/O, grouping and counters.
module UzbekNormalizer
  # Trailing token ending in a dot at the end of the headword cell
  # (presumably a part-of-speech abbreviation; the original code called it `pos`).
  POS_SUFFIX = /\w*\.$/

  module_function

  # Splits the first cell of a row into the headword and its trailing
  # dotted token.
  #
  # @param cell [String, nil] raw first column of a row
  # @return [Array(String, String)] +[headword, pos]+; +pos+ is "" when the
  #   cell has no trailing dotted token
  def split_headword(cell)
    text = cell.to_s.strip
    [text.gsub(POS_SUFFIX, '').strip, text[POS_SUFFIX].to_s.strip]
  end

  # Builds the description from every cell after the first one.
  # Extra cells are concatenated without a separator (as the original did for
  # a third column); nil cells are ignored instead of raising.
  #
  # @param row [Array<String, nil>] parsed row
  # @return [String] stripped description, "" when the row has none
  def description(row)
    row.drop(1).compact.join.strip
  end

  # Renders one output line (without the trailing newline).
  # Descriptions sharing the same +pos+ are joined with ", "; distinct +pos+
  # groups are joined with "; " in order of first appearance.
  #
  # @param headword [String]
  # @param senses [Array<Array(String, String)>] +[pos, description]+ pairs
  # @return [String] e.g. "kitob n. book, volume; v. to read"
  def format_entry(headword, senses)
    merged = senses.group_by(&:first).map do |pos, pairs|
      "#{pos} #{pairs.map(&:last).join(', ')}"
    end
    "#{headword} #{merged.join('; ')}"
  end
end

# Reads the pipe-separated file ~/UE.txt and writes ~/fixed_UE.txt with one
# line per run of consecutive rows that share a headword.
#
# NOTE(review): cells are parsed with CSV's default quote character, so a stray
# double quote inside a field raises CSV::MalformedCSVError -- confirm the
# input contains none.
desc "Normalize Uzbek file."
task :uzbek => :environment do
  # File.open / CSV.foreach do not expand "~" themselves.
  source_path = File.expand_path('~/UE.txt')
  target_path = File.expand_path('~/fixed_UE.txt')

  counter = 0        # non-empty rows read
  entry_counter = 0  # lines written
  headword = nil     # headword of the group currently being collected
  senses = []        # [pos, description] pairs collected for +headword+

  # Block form closes the output file even if parsing raises.
  File.open(target_path, 'w') do |new_file|
    # Writes the collected group (if any) and starts a fresh one.
    flush = lambda do
      next if senses.empty?

      new_file << UzbekNormalizer.format_entry(headword, senses) << "\n"
      entry_counter += 1
      senses = []
    end

    CSV.foreach(source_path, col_sep: '|') do |row|
      next if row.empty?

      entry, pos = UzbekNormalizer.split_headword(row[0])
      # A different headword ends the previous group; write it under ITS OWN
      # headword before collecting the current row.
      flush.call unless entry == headword
      headword = entry
      senses << [pos, UzbekNormalizer.description(row)]

      counter += 1
      p counter if (counter % 1000).zero?
    end

    # The last group has no following row to trigger its flush.
    flush.call
  end

  p "Done! ENTRIES COUNT: #{entry_counter}"
  p "Done! TOTAL COUNT: #{counter}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment