|
#!/usr/bin/env node |
|
/** |
|
* Smallest AI Pulse STT - Real-time Streaming Example |
|
* |
|
* This example demonstrates proper WebSocket streaming with: |
|
* - Controlled chunk rate (prevents latency issues) |
|
* - Full transcript support |
|
* - Multi-language support (Hindi, English, auto-detect) |
|
* - Proper finalize signal |
|
* |
|
* Usage: |
|
* node pulse_stt_streaming.js <audio_file> [language] |
|
* |
|
* Examples: |
|
* node pulse_stt_streaming.js recording.wav en |
|
* node pulse_stt_streaming.js hindi_audio.wav hi |
|
* node pulse_stt_streaming.js mixed_audio.wav multi |
|
* |
|
* Environment: |
|
* SMALLEST_API_KEY - Your Smallest AI API key |
|
*/ |
|
|
|
const fs = require("fs");
const os = require("os");
const path = require("path");

const WebSocket = require("ws");
|
|
|
// ============================================================================ |
|
// CONFIGURATION |
|
// ============================================================================ |
|
|
|
/**
 * Global configuration for the streaming session.
 *
 * CHUNK_SIZE / CHUNK_INTERVAL_MS control pacing: 4096 bytes of 16-bit,
 * 16 kHz mono PCM is ~128 ms of audio, sent every 100 ms — slightly faster
 * than real time without bursting the server.
 */
const CONFIG = {
  WS_URL: "wss://waves-api.smallest.ai/api/v1/pulse/get_text",
  ENCODING: "linear16",
  SAMPLE_RATE: 16000,
  BYTES_PER_SAMPLE: 2, // 16-bit = 2 bytes

  // Chunking settings (critical for latency) - can override via env vars.
  // Always pass a radix; an unset env var parses to NaN and falls through
  // to the default via ||.
  CHUNK_SIZE: Number.parseInt(process.env.CHUNK_SIZE, 10) || 4096,
  CHUNK_INTERVAL_MS: Number.parseInt(process.env.CHUNK_INTERVAL, 10) || 100,

  // Feature flags
  FULL_TRANSCRIPT: true, // Enable cumulative transcript
  WORD_TIMESTAMPS: true,
  NUMERALS: "auto",
};
|
|
|
// ============================================================================ |
|
// AUDIO UTILITIES |
|
// ============================================================================ |
|
|
|
/**
 * Read a WAV file and extract the raw PCM payload.
 *
 * Walks the RIFF chunk list, logs the "fmt " details when present, and
 * returns the contents of the "data" chunk. Handles basic WAV parsing only —
 * for production use a proper library like 'wav'.
 *
 * @param {string} filePath - Path to a RIFF/WAVE file.
 * @returns {Buffer} Raw PCM bytes from the "data" chunk.
 * @throws {Error} If the file is not RIFF/WAVE or contains no "data" chunk.
 */
function readWavFile(filePath) {
  const buffer = fs.readFileSync(filePath);

  // Parse WAV header (12 bytes: "RIFF" + size + "WAVE")
  if (buffer.length < 12 || buffer.toString("ascii", 0, 4) !== "RIFF") {
    throw new Error("Not a valid WAV file");
  }
  if (buffer.toString("ascii", 8, 12) !== "WAVE") {
    throw new Error("Not a valid WAV file");
  }

  // Walk the chunk list; each chunk is an 8-byte header followed by its data
  let offset = 12;
  while (offset + 8 <= buffer.length) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);

    if (chunkId === "fmt ") {
      const audioFormat = buffer.readUInt16LE(offset + 8);
      const numChannels = buffer.readUInt16LE(offset + 10);
      const sampleRate = buffer.readUInt32LE(offset + 12);
      const bitsPerSample = buffer.readUInt16LE(offset + 22);

      console.log(`[AUDIO] Format: ${audioFormat === 1 ? "PCM" : "Other"}`);
      console.log(`[AUDIO] Channels: ${numChannels}`);
      console.log(`[AUDIO] Sample Rate: ${sampleRate} Hz`);
      console.log(`[AUDIO] Bits per Sample: ${bitsPerSample}`);

      if (sampleRate !== CONFIG.SAMPLE_RATE) {
        console.warn(`[WARN] Sample rate mismatch! File: ${sampleRate}, Expected: ${CONFIG.SAMPLE_RATE}`);
        console.warn(`[WARN] Consider resampling with: ffmpeg -i input.wav -ar 16000 -ac 1 output.wav`);
      }
    }

    if (chunkId === "data") {
      // Clamp to the buffer in case the header overstates the chunk size
      return buffer.subarray(offset + 8, Math.min(offset + 8 + chunkSize, buffer.length));
    }

    // RIFF chunks are word-aligned: an odd-sized chunk is followed by one
    // pad byte, which must be skipped or the walker lands mid-chunk.
    offset += 8 + chunkSize + (chunkSize % 2);
  }

  throw new Error("No data chunk found in WAV file");
}
|
|
|
/**
 * Convert an audio file to 16 kHz mono raw PCM (s16le) using ffmpeg.
 *
 * Uses execFileSync with an argument array so that file paths containing
 * shell metacharacters (spaces, quotes, `$`, backticks) cannot be
 * interpreted by a shell — the previous string-built execSync command was
 * injectable via the CLI-supplied path.
 *
 * @param {string} inputPath - Source audio file (any format ffmpeg reads).
 * @param {string} outputPath - Destination for raw s16le PCM bytes.
 * @returns {boolean} true on success, false if ffmpeg is missing or fails.
 */
function convertWithFfmpeg(inputPath, outputPath) {
  try {
    const { execFileSync } = require("child_process");
    execFileSync(
      "ffmpeg",
      ["-y", "-i", inputPath, "-ar", "16000", "-ac", "1", "-f", "s16le", outputPath],
      { stdio: "pipe" }
    );
    return true;
  } catch (e) {
    // ffmpeg not installed or conversion failed — caller falls back
    return false;
  }
}
|
|
|
// ============================================================================ |
|
// PULSE STT CLIENT |
|
// ============================================================================ |
|
|
|
/**
 * WebSocket client for the Smallest AI Pulse streaming STT API.
 *
 * Typical flow: connect() -> queueAudio() -> startSending() -> finalize()
 * -> close(). Audio is buffered and drained at a fixed chunk rate so the
 * server sees a near-real-time stream instead of one large burst.
 */
class PulseSTTClient {
  /**
   * @param {string} apiKey - Smallest AI API key, sent as a Bearer token.
   * @param {object} [options] - Overrides: language, fullTranscript,
   *   wordTimestamps, numerals, plus optional diarize / redactPii flags.
   */
  constructor(apiKey, options = {}) {
    this.apiKey = apiKey;
    this.options = {
      language: options.language || "en",
      fullTranscript: options.fullTranscript ?? CONFIG.FULL_TRANSCRIPT,
      wordTimestamps: options.wordTimestamps ?? CONFIG.WORD_TIMESTAMPS,
      numerals: options.numerals || CONFIG.NUMERALS,
      ...options
    };

    this.ws = null;
    this.audioBuffer = Buffer.alloc(0); // queued audio awaiting pacing
    this.sendInterval = null;           // pacing timer handle
    this.isConnected = false;
    this.isComplete = false;            // true once the server sends is_last
    this.sessionId = null;

    // Session statistics, reported by printStats()
    this.stats = {
      chunksSent: 0,
      bytesSent: 0,
      messagesReceived: 0,
      partialCount: 0,
      finalCount: 0,
      startTime: null,
    };
  }

  /** Build the WebSocket URL including all query parameters. */
  buildUrl() {
    const url = new URL(CONFIG.WS_URL);
    url.searchParams.set("language", this.options.language);
    url.searchParams.set("encoding", CONFIG.ENCODING);
    url.searchParams.set("sample_rate", CONFIG.SAMPLE_RATE.toString());
    url.searchParams.set("full_transcript", this.options.fullTranscript.toString());
    url.searchParams.set("word_timestamps", this.options.wordTimestamps.toString());
    url.searchParams.set("numerals", this.options.numerals);

    // Optional features
    if (this.options.diarize) {
      url.searchParams.set("diarize", "true");
    }
    if (this.options.redactPii) {
      url.searchParams.set("redact_pii", "true");
    }

    return url.toString();
  }

  /**
   * Open the WebSocket connection.
   * @returns {Promise<void>} Resolves on open; rejects on the first error.
   */
  connect() {
    return new Promise((resolve, reject) => {
      const url = this.buildUrl();
      console.log(`[CONNECT] URL: ${url.replace(this.apiKey, "***")}`);
      console.log(`[CONNECT] Language: ${this.options.language}`);
      console.log(`[CONNECT] Full Transcript: ${this.options.fullTranscript}`);

      // Guard so a late error event cannot settle an already-settled promise
      let settled = false;

      this.ws = new WebSocket(url, {
        headers: {
          "Authorization": `Bearer ${this.apiKey}`
        }
      });

      this.ws.on("open", () => {
        console.log("[CONNECT] ✓ WebSocket connected");
        this.isConnected = true;
        this.stats.startTime = Date.now();
        if (!settled) {
          settled = true;
          resolve();
        }
      });

      this.ws.on("error", (error) => {
        console.error("[ERROR] WebSocket error:", error.message);
        if (!settled) {
          settled = true;
          reject(error);
        }
      });

      this.ws.on("close", (code, reason) => {
        console.log(`[CLOSE] WebSocket closed: ${code} - ${reason || "No reason"}`);
        this.isConnected = false;
        this.stopSending();
      });

      this.ws.on("message", (data) => {
        this.handleMessage(data);
      });
    });
  }

  /**
   * Handle one server frame: session id, partial/final transcripts, and the
   * end-of-session marker (is_last).
   */
  handleMessage(data) {
    let message;
    try {
      message = JSON.parse(data.toString());
    } catch (err) {
      // A malformed frame must not crash the whole session
      console.warn("[WARN] Ignoring non-JSON message from server");
      return;
    }
    this.stats.messagesReceived++;

    if (message.session_id && !this.sessionId) {
      this.sessionId = message.session_id;
      console.log(`[SESSION] ID: ${this.sessionId}`);
    }

    if (message.transcript) {
      const transcript = message.transcript.trim();

      if (message.is_final) {
        this.stats.finalCount++;
        console.log(`\n[FINAL] ${transcript}`);

        if (message.full_transcript) {
          console.log(`[FULL] ${message.full_transcript}`);
        }

        if (message.language) {
          console.log(`[LANG] Detected: ${message.language}`);
        }
      } else {
        this.stats.partialCount++;
        // Overwrite the current line for live partial updates
        process.stdout.write(`\r[PARTIAL] ${transcript} `);
      }
    }

    if (message.is_last) {
      this.isComplete = true; // lets finalize() resolve promptly
      console.log("\n[DONE] ✓ Transcription complete");
      this.printStats();
    }
  }

  /**
   * Queue audio data for sending.
   * Audio is buffered and sent at controlled intervals by startSending().
   * @param {Buffer} audioData - Raw PCM bytes to append to the queue.
   */
  queueAudio(audioData) {
    this.audioBuffer = Buffer.concat([this.audioBuffer, audioData]);
  }

  /**
   * Start draining the audio buffer at a real-time pace.
   * No-op if the pacing timer is already running.
   */
  startSending() {
    if (this.sendInterval) return;

    console.log(`[STREAM] Starting audio stream (${CONFIG.CHUNK_SIZE} bytes every ${CONFIG.CHUNK_INTERVAL_MS}ms)`);

    this.sendInterval = setInterval(() => {
      if (this.audioBuffer.length >= CONFIG.CHUNK_SIZE && this.ws?.readyState === WebSocket.OPEN) {
        const chunk = this.audioBuffer.slice(0, CONFIG.CHUNK_SIZE);
        this.audioBuffer = this.audioBuffer.slice(CONFIG.CHUNK_SIZE);

        this.ws.send(chunk);
        this.stats.chunksSent++;
        this.stats.bytesSent += chunk.length;
      }
    }, CONFIG.CHUNK_INTERVAL_MS);
  }

  /** Stop the pacing timer (idempotent). */
  stopSending() {
    if (this.sendInterval) {
      clearInterval(this.sendInterval);
      this.sendInterval = null;
    }
  }

  /**
   * Flush any remaining audio, send the finalize signal, and wait until the
   * server reports is_last (or the socket closes, or 30s elapse).
   * @returns {Promise<void>}
   */
  finalize() {
    return new Promise((resolve) => {
      this.stopSending();

      // Send any remaining audio
      if (this.audioBuffer.length > 0 && this.ws?.readyState === WebSocket.OPEN) {
        console.log(`[STREAM] Sending remaining ${this.audioBuffer.length} bytes`);
        this.ws.send(this.audioBuffer);
        this.stats.bytesSent += this.audioBuffer.length;
        this.audioBuffer = Buffer.alloc(0);
      }

      // Send finalize signal
      if (this.ws?.readyState === WebSocket.OPEN) {
        console.log("[STREAM] Sending finalize signal");
        this.ws.send(JSON.stringify({ type: "finalize" }));
      }

      // Resolve on is_last or disconnect. Both timers are cleared on every
      // exit path — the previous implementation leaked the 30s timeout,
      // which kept the event loop alive after completion.
      const finish = () => {
        clearInterval(poll);
        clearTimeout(deadline);
        resolve();
      };
      const poll = setInterval(() => {
        if (this.isComplete || !this.isConnected) {
          finish();
        }
      }, 100);
      // Safety net: give up after 30 seconds
      const deadline = setTimeout(finish, 30000);
    });
  }

  /** Print a summary of the session (duration, chunk and message counts). */
  printStats() {
    const duration = (Date.now() - this.stats.startTime) / 1000;
    console.log("\n" + "=".repeat(60));
    console.log("SESSION STATS");
    console.log("=".repeat(60));
    console.log(`Duration: ${duration.toFixed(2)}s`);
    console.log(`Chunks sent: ${this.stats.chunksSent}`);
    console.log(`Bytes sent: ${this.stats.bytesSent}`);
    console.log(`Messages received: ${this.stats.messagesReceived}`);
    console.log(`Partial transcripts: ${this.stats.partialCount}`);
    console.log(`Final transcripts: ${this.stats.finalCount}`);
    console.log("=".repeat(60));
  }

  /** Stop sending and close the WebSocket with a normal-closure code. */
  close() {
    this.stopSending();
    if (this.ws) {
      this.ws.close(1000, "Done");
    }
  }
}
|
|
|
// ============================================================================ |
|
// MAIN |
|
// ============================================================================ |
|
|
|
/**
 * CLI entry point: parse arguments, load (and if necessary convert) the
 * audio file, stream it to Pulse STT at a controlled rate, and print
 * transcripts until the server signals completion.
 */
async function main() {
  // Parse arguments
  const args = process.argv.slice(2);

  if (args.length < 1) {
    console.log("Usage: node pulse_stt_streaming.js <audio_file> [language]");
    console.log("");
    console.log("Languages:");
    console.log(" en - English (default)");
    console.log(" hi - Hindi");
    console.log(" multi - Auto-detect language");
    console.log(" (see docs for full list)");
    console.log("");
    console.log("Examples:");
    console.log(" node pulse_stt_streaming.js recording.wav");
    console.log(" node pulse_stt_streaming.js hindi_audio.wav hi");
    console.log(" node pulse_stt_streaming.js mixed.wav multi");
    process.exit(1);
  }

  const audioFile = args[0];
  const language = args[1] || "en";

  // Check API key
  const apiKey = process.env.SMALLEST_API_KEY;
  if (!apiKey) {
    console.error("Error: SMALLEST_API_KEY environment variable not set");
    console.error("Get your key at: https://smallest.ai/console");
    process.exit(1);
  }

  // Check file exists
  if (!fs.existsSync(audioFile)) {
    console.error(`Error: File not found: ${audioFile}`);
    process.exit(1);
  }

  console.log("=".repeat(60));
  console.log("SMALLEST AI PULSE STT - STREAMING EXAMPLE");
  console.log("=".repeat(60));
  console.log(`File: ${audioFile}`);
  console.log(`Language: ${language}`);
  console.log("");

  // Load audio: WAV and raw PCM are read directly; anything else goes
  // through ffmpeg (when available).
  let audioData;
  const ext = path.extname(audioFile).toLowerCase();

  if (ext === ".wav") {
    console.log("[LOAD] Reading WAV file...");
    audioData = readWavFile(audioFile);
  } else if (ext === ".pcm" || ext === ".raw") {
    console.log("[LOAD] Reading raw PCM file...");
    audioData = fs.readFileSync(audioFile);
  } else {
    // Try ffmpeg conversion via a temp file in the OS temp directory
    // (portable — a hard-coded /tmp breaks on Windows).
    console.log(`[LOAD] Converting ${ext} to PCM with ffmpeg...`);
    const tempFile = path.join(os.tmpdir(), `pulse_stt_temp_${Date.now()}.pcm`);
    if (convertWithFfmpeg(audioFile, tempFile)) {
      try {
        audioData = fs.readFileSync(tempFile);
      } finally {
        fs.unlinkSync(tempFile); // always clean up the temp file
      }
    } else {
      console.error(`Error: Unsupported format ${ext}. Convert to WAV first:`);
      console.error(` ffmpeg -i ${audioFile} -ar 16000 -ac 1 output.wav`);
      process.exit(1);
    }
  }

  console.log(`[LOAD] Audio size: ${audioData.length} bytes`);
  console.log(`[LOAD] Duration: ~${(audioData.length / CONFIG.SAMPLE_RATE / CONFIG.BYTES_PER_SAMPLE).toFixed(2)}s`);
  console.log("");

  // Create client and stream
  const client = new PulseSTTClient(apiKey, {
    language: language,
    fullTranscript: true,
    wordTimestamps: true,
  });

  try {
    await client.connect();

    // Queue all audio, then drain it at the controlled rate
    client.queueAudio(audioData);
    client.startSending();

    // Estimate how long the paced sender needs to drain the buffer
    const expectedDuration = (audioData.length / CONFIG.CHUNK_SIZE) * CONFIG.CHUNK_INTERVAL_MS;
    console.log(`[STREAM] Expected stream duration: ${(expectedDuration / 1000).toFixed(2)}s`);

    // Wait for the stream to drain, then flush the tail and finalize
    await new Promise(resolve => setTimeout(resolve, expectedDuration + 1000));
    await client.finalize();

  } catch (error) {
    console.error("Error:", error.message);
    process.exitCode = 1; // report failure to the shell
  } finally {
    client.close();
  }
}
|
|
|
// Run the CLI; surface any unhandled failure and exit non-zero so callers
// and CI can detect the error (previously the process still exited 0).
main().catch((err) => {
  console.error(err);
  process.exit(1);
});