Skip to content

Instantly share code, notes, and snippets.

@Thomascountz
Last active July 25, 2025 12:36
Show Gist options
  • Select an option

  • Save Thomascountz/b84b68f0a7c6f2f851ebc5db152b676a to your computer and use it in GitHub Desktop.

Select an option

Save Thomascountz/b84b68f0a7c6f2f851ebc5db152b676a to your computer and use it in GitHub Desktop.
Extract transcript text from Apple Memos M4A file
require "json"
# Extracts transcript text from the 'tsrp' atom of an M4A file.
#
# The file is walked as a tree of QuickTime-style atoms. Only the container
# types listed in PATH_TO_TSRP are descended into; every other atom is
# skipped. The first 'tsrp' atom found supplies a JSON payload, from which
# the text segments under "attributedString" -> "runs" are concatenated.
class AppleMemosTranscriptionExtractor
  # Raised when no 'tsrp' atom is found in the searched containers.
  class TsrpAtomNotFoundError < StandardError; end

  # Raised when the 'tsrp' payload is not JSON of the expected shape.
  class TranscriptDataInvalidError < StandardError; end

  # Raised when an atom header is truncated or inconsistent with its container.
  class MalformedAtomError < StandardError; end

  RECORDINGS_PATH = "#{Dir.home}/Library/Group Containers/group.com.apple.VoiceMemos.shared/Recordings".freeze
  COMPACT_HEADER_SIZE = 8   # 32-bit size + 4-byte type
  EXTENDED_HEADER_SIZE = 16 # compact header + 64-bit extended size
  PATH_TO_TSRP = %w[moov trak mdia udta].freeze
  TSRP_ATOM_TYPE = "tsrp".freeze

  # Returns the transcript text stored in the file at +file_path+.
  #
  # @param file_path [String] path to the M4A file
  # @return [String] the concatenated transcript text
  # @raise [TsrpAtomNotFoundError] if the file has no reachable 'tsrp' atom
  # @raise [TranscriptDataInvalidError] if the payload is not the expected JSON
  # @raise [MalformedAtomError] if an atom header is truncated or inconsistent
  def extract_transcript(file_path)
    tsrp_payload = find_tsrp_atom_payload_in_file(file_path)
    raise TsrpAtomNotFoundError, "Could not find 'tsrp' atom in #{file_path}" if tsrp_payload.nil?

    parse_payload_and_extract_text(tsrp_payload, file_path)
  end

  private

  # Opens the file in binary mode and searches it from start to end.
  # Returns the raw 'tsrp' payload bytes, or nil when none is found.
  def find_tsrp_atom_payload_in_file(file_path)
    File.open(file_path, "rb") do |io|
      search_for_tsrp_recursively(io, io.size)
    end
  end

  # Scans sibling atoms from the current position of +io+ up to
  # +search_end_offset+, descending into containers named in PATH_TO_TSRP.
  # Returns the payload of the first 'tsrp' atom found, or nil.
  def search_for_tsrp_recursively(io, search_end_offset)
    while io.pos < search_end_offset
      atom_start = io.pos
      atom_header = read_atom_header(io, search_end_offset)
      return nil unless atom_header

      atom_size, atom_type, header_size = atom_header
      io.seek(atom_start + header_size)
      # Never trust an atom to extend past the container that holds it.
      atom_end = [atom_start + atom_size, search_end_offset].min
      payload_size = atom_end - io.pos
      if payload_size.negative?
        raise MalformedAtomError, "Negative payload size for atom '#{atom_type}' at offset #{atom_start}"
      end

      return io.read(payload_size) if atom_type == TSRP_ATOM_TYPE

      if PATH_TO_TSRP.include?(atom_type)
        found_payload = search_for_tsrp_recursively(io, atom_end)
        return found_payload if found_payload
      end
      # Skip whatever remains of this atom and continue with its next sibling.
      io.seek(atom_end)
    end
    nil
  end

  # Reads one atom header at the current position of +io+.
  #
  # Returns [atom_size, atom_type, header_size], or nil when fewer than
  # COMPACT_HEADER_SIZE bytes remain in the search area or in the file.
  # Raises MalformedAtomError for a truncated extended size field or a
  # declared size smaller than the header itself.
  #
  # See: https://developer.apple.com/documentation/quicktime-file-format/atoms#Atom-structure
  def read_atom_header(io, search_end_offset)
    atom_start = io.pos
    # Leftover bytes too short to be a header (for example the optional 32-bit
    # zero terminator QuickTime permits at the end of a user data list) end
    # this container. Reading them as a header would consume bytes that belong
    # to the next sibling atom.
    return nil if search_end_offset - atom_start < COMPACT_HEADER_SIZE

    initial_header = io.read(COMPACT_HEADER_SIZE)
    return nil if initial_header.nil? || initial_header.length < COMPACT_HEADER_SIZE

    size_field = initial_header[0, 4].unpack1("N") # 32-bit unsigned integer, network (big-endian)
    atom_type = initial_header[4, 4]

    case size_field
    when 1 # Extended size: 64-bit unsigned integer, big-endian
      extended_size_bytes = io.read(8)
      if extended_size_bytes.nil? || extended_size_bytes.length < 8
        raise MalformedAtomError, "Truncated extended size field for atom '#{atom_type}' at offset #{atom_start}"
      end
      atom_size = extended_size_bytes.unpack1("Q>")
      header_size = EXTENDED_HEADER_SIZE
    when 0 # Atom extends to end of search area
      atom_size = search_end_offset - atom_start
      header_size = COMPACT_HEADER_SIZE
    else # Compact atom header (standard)
      atom_size = size_field
      header_size = COMPACT_HEADER_SIZE
    end

    if atom_size < header_size && atom_size != 0
      raise MalformedAtomError, "Malformed atom: type '#{atom_type}', size #{atom_size} < header_size #{header_size} at offset #{atom_start}"
    end

    [atom_size, atom_type, header_size]
  end

  # Parses the 'tsrp' payload as UTF-8 JSON (invalid bytes are scrubbed) and
  # joins every other element of "attributedString" -> "runs", starting with
  # the first. The remaining elements are ignored.
  #
  # +file_path+ is accepted but not used here.
  def parse_payload_and_extract_text(payload_string, file_path)
    begin
      parsed_json = JSON.parse(payload_string.force_encoding("UTF-8").scrub)
    rescue JSON::ParserError => e
      raise TranscriptDataInvalidError, "Invalid JSON in transcript data: #{e.message}"
    end

    unless parsed_json.is_a?(Hash)
      raise TranscriptDataInvalidError, "Transcript data root is not a JSON object"
    end

    runs_data = parsed_json.dig("attributedString", "runs")
    unless runs_data.is_a?(Array)
      raise TranscriptDataInvalidError, "Expected 'attributedString.runs' to be an array"
    end

    runs_data.each_slice(2).map(&:first).join
  end
end
# Command-line entry point, active only when this file is run directly.
# Prints the transcript of the M4A file named by the first argument.
#
# Exit codes: 1 usage or unexpected error, 2 no 'tsrp' atom, 3 invalid
# transcript data or malformed atom, 4 file not found, 5 permission denied.
if $PROGRAM_NAME == __FILE__
  input_path = ARGV.first
  if input_path.nil?
    warn "Usage: ruby #{$PROGRAM_NAME} <path_to_m4a_file>"
    warn "Example: ruby #{$PROGRAM_NAME} \"#{AppleMemosTranscriptionExtractor::RECORDINGS_PATH}/My Recording.m4a\""
    exit 1
  end

  transcriber = AppleMemosTranscriptionExtractor.new
  begin
    puts transcriber.extract_transcript(input_path)
  rescue StandardError => error
    # Map each known failure to its own message and exit status.
    case error
    when AppleMemosTranscriptionExtractor::TsrpAtomNotFoundError
      warn "Error: #{error.message}"
      exit 2
    when AppleMemosTranscriptionExtractor::TranscriptDataInvalidError,
         AppleMemosTranscriptionExtractor::MalformedAtomError
      warn "Error: #{error.message}"
      exit 3
    when Errno::ENOENT
      warn "Error: File not found at '#{input_path}'"
      exit 4
    when Errno::EACCES
      warn "Error: Permission denied for file '#{input_path}'"
      exit 5
    else
      trace = error.backtrace
      warn "An unexpected error occurred: #{error.class} - #{error.message}"
      warn "Backtrace (run with DEBUG=1 for more details):"
      warn trace.first(5).join("\n")
      warn "\nFull Backtrace:\n#{trace.join("\n")}" if ENV["DEBUG"]
      exit 1
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment