Last active
January 2, 2025 18:50
-
-
Save platypii/83b832c963b7459af28adc7463ac89f2 to your computer and use it in GitHub Desktop.
Hyparquet Lambda function to generate parquet file metadata when a file is uploaded to S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { S3Client, GetObjectCommand } from '@aws-sdk/client-s3' | |
| const s3 = new S3Client() | |
| import { parquetMetadata, toJson } from 'hyparquet' | |
/**
 * Lambda entry point: for every S3 record in the event, download the object
 * if its key ends in `.parquet`, parse its parquet metadata with hyparquet,
 * and log that metadata as JSON.
 *
 * Failures are isolated per record: any error raised while validating,
 * decoding, downloading or parsing one record is logged and the loop moves on
 * to the next record.
 *
 * @param {object} event - Invocation event; S3 records are read from `event.Records`.
 * @returns {Promise<{statusCode: number, body: string}>} Always resolves with a 200 response.
 */
export async function handler(event) {
  console.log('Event received:', JSON.stringify(event, null, 2))

  // Tolerate events that carry no record list (e.g. an empty test event)
  // instead of failing with "event.Records is not iterable".
  const records = Array.isArray(event?.Records) ? event.Records : []

  // The event structure can contain multiple records if multiple files are uploaded at once.
  for (const record of records) {
    const bucket = record?.s3?.bucket?.name
    const rawKey = record?.s3?.object?.key
    // Until decoding succeeds, error logs fall back to the raw key.
    let key = rawKey

    try {
      if (typeof bucket !== 'string' || typeof rawKey !== 'string') {
        throw new TypeError('Record does not describe an S3 object')
      }

      // Keys arrive URL-encoded with '+' standing in for spaces.
      // decodeURIComponent throws URIError on malformed escapes, so it must
      // run inside the try to keep one bad record from aborting the batch.
      key = decodeURIComponent(rawKey.replace(/\+/g, ' '))

      // We only want .parquet files
      if (!key.endsWith('.parquet')) {
        console.log(`Skipping non-parquet file: ${key}`)
        continue
      }

      // Get the parquet file from S3
      const data = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: key }))
      const arrayBuffer = await streamToArrayBuffer(data.Body)

      // Extract metadata using hyparquet
      const metadata = parquetMetadata(arrayBuffer)

      // Log the metadata
      console.log(`Metadata for ${key}:`, JSON.stringify(toJson(metadata), null, 2))

      // Here you could store the metadata to DynamoDB, S3, etc.
    } catch (err) {
      console.error(`Error processing file ${key} from bucket ${bucket}:`, err)
    }
  }

  return { statusCode: 200, body: 'Metadata extraction complete' }
}
/**
 * Drain an async-iterable byte stream into a single ArrayBuffer.
 *
 * Chunks are collected while the total size is tracked, then copied exactly
 * once into a freshly allocated, exact-length buffer. The returned
 * ArrayBuffer is therefore never a view into a larger shared allocation.
 *
 * @param {AsyncIterable<Uint8Array>} stream - Source yielding Buffer/Uint8Array chunks.
 * @returns {Promise<ArrayBuffer>} The concatenated bytes (zero-length for an empty stream).
 * @throws {TypeError} If a chunk is not a Buffer/Uint8Array (e.g. a string chunk).
 */
async function streamToArrayBuffer(stream) {
  const chunks = []
  let totalLength = 0

  for await (const chunk of stream) {
    // Reject non-byte chunks explicitly: TypedArray#set would otherwise
    // coerce e.g. string characters to zeros and silently corrupt the data.
    if (!(chunk instanceof Uint8Array)) {
      throw new TypeError('Stream chunks must be Buffer or Uint8Array instances')
    }
    chunks.push(chunk)
    totalLength += chunk.byteLength
  }

  // Single allocation + single copy per chunk.
  const bytes = new Uint8Array(totalLength)
  let offset = 0
  for (const chunk of chunks) {
    bytes.set(chunk, offset)
    offset += chunk.byteLength
  }

  return bytes.buffer
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment