Skip to content

Instantly share code, notes, and snippets.

@twslankard
Created April 2, 2013 20:53
Show Gist options
  • Select an option

  • Save twslankard/5296081 to your computer and use it in GitHub Desktop.

Select an option

Save twslankard/5296081 to your computer and use it in GitHub Desktop.

Revisions

  1. Tom Slankard created this gist Apr 2, 2013.
    95 changes: 95 additions & 0 deletions mpu.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,95 @@
    #!/usr/bin/env ruby

    require 'rubygems'
    require 'aws-sdk'
    require 'digest'

    # Reads one part of a file for a multipart upload.
    #
    # file_name - path of the file being uploaded
    # file_size - total size of the file in bytes (as measured by the caller)
    # part_size - maximum size of each part in bytes
    # part      - 1-based part number to read
    #
    # Returns [part_contents, part_md5] where part_md5 is the hex MD5 digest
    # of the bytes read. Raises a RuntimeError when the requested part lies
    # outside the file.
    def readPart(file_name, file_size, part_size, part)
      file_offset = part_size * ( part - 1 )
      # the final part may be shorter than part_size
      bytes_to_read = [ part_size, file_size - file_offset ].min
      # previously a negative length here raised an opaque ArgumentError
      raise "part #{part} is out of range for #{file_name}" if bytes_to_read <= 0
      # IO.read with a length argument reads without encoding conversion;
      # it returns nil if file_size was stale and the file has shrunk
      part_contents = File.read( file_name, bytes_to_read, file_offset )
      raise "could not read part #{part} of #{file_name}" if part_contents.nil?
      part_md5 = Digest::MD5.hexdigest( part_contents )
      return part_contents, part_md5
    end

    # Uploads a single part and verifies S3 received it intact by comparing
    # the returned ETag against the locally computed MD5.
    # S3 wraps the ETag value in double quotes, so strip the first and last
    # characters before comparing.
    def uploadPart(part_contents, part_md5, part, upload)
      response = upload.add_part( part_contents, :part_number => part )
      remote_md5 = response.etag[ 1..-2 ]
      raise "part_md5 mismatch!" unless remote_md5 == part_md5
    end

    # Decides whether a part still needs uploading. Returns false only when
    # S3 already holds a part whose ETag (quotes stripped) equals part_md5;
    # any error while inspecting the remote part (e.g. the part does not
    # exist yet) means we should upload it.
    def shouldUploadPart(part, part_md5, upload)
      remote_md5 = begin
        upload.parts[part].etag[1..-2]
      rescue
        nil
      end
      remote_md5 != part_md5
    end

    # Attempts uploadPart up to `tries` times, returning as soon as one
    # attempt succeeds. If every attempt fails, raises with the message of
    # the last error seen.
    # NOTE(review): with tries < 1 no attempt is made and last_error is nil,
    # so the final raise fails with NoMethodError — same as the original.
    def uploadPartWithRetry(part_contents, part_md5, part, upload, tries)
      last_error = nil
      tries.times do
        begin
          uploadPart(part_contents, part_md5, part, upload)
          return
        rescue => e
          last_error = e
        end
      end
      raise "too many retries, error: " + last_error.message
    end

    # Uploads every part of the file, skipping parts whose MD5 already
    # matches what S3 holds (resume support), then completes the multipart
    # upload using the part list S3 has on its side.
    def uploadParts(file_name, file_size, part_size, number_of_parts, upload)
      skipped_parts = 0
      start_time = Time.now
      (1..number_of_parts).each do |part|
        part_contents, part_md5 = readPart( file_name, file_size, part_size, part )
        unless shouldUploadPart( part, part_md5, upload )
          skipped_parts += 1
          puts "Skipping already uploaded part #{part}/#{number_of_parts}."
          next
        end
        uploadPartWithRetry(part_contents, part_md5, part, upload, 3)
        # estimate time remaining from the average time per uploaded part
        # (skipped parts are excluded from the average)
        elapsed_time = Time.now - start_time
        average_part_time = elapsed_time / (part - skipped_parts)
        estimated_time_remaining = (average_part_time * (number_of_parts - part) / 60.0).ceil
        puts "Uploaded #{part}/#{number_of_parts} parts. Estimated time remaining: #{estimated_time_remaining} minutes"
      end
      upload.complete(:remote_parts)
    end


    # --- script entry point --------------------------------------------------
    # Usage: mpu.rb <file> <bucket> <key>
    file_name = ARGV[0]
    bucket_name = ARGV[1]
    key_name = ARGV[2]

    s3 = AWS::S3.new
    object = s3.buckets[bucket_name].objects[key_name]

    # collect any multipart uploads already in progress for this key so we
    # can resume one instead of starting over
    existing_uploads = []
    object.multipart_uploads.each { |u| existing_uploads << u }

    upload =
      case existing_uploads.size
      when 0 then object.multipart_upload
      when 1 then existing_uploads.first
      else raise "multiple uploads in progress" # haven't decided what to do here yet
      end

    file_size = File.size(file_name)

    # S3 allows at most 10,000 parts and requires a 5 MB minimum part size
    # (except for the last part), so take the larger of the two lower bounds
    part_size = [ (file_size / 10000.0).ceil, 5*1024*1024 ].max
    number_of_parts = (file_size / part_size.to_f).ceil
    puts "Uploading #{file_size} byte file as #{number_of_parts} chunks each up to #{part_size} bytes."
    uploadParts(file_name, file_size, part_size, number_of_parts, upload)
    puts "Done!"