Last active
May 19, 2025 13:10
-
-
Save elston/b54899209f4a74fedfb550ebdfb39205 to your computer and use it in GitHub Desktop.
HuBERT-ECG demo with ptb-xl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "610604c2-1874-42ba-85f5-5246087dc36b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "from utils import dataset_processing" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "cf2458f9-b3eb-4b83-a4a4-96f9bb47fd57", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "path = '/data/ptb-xl'\n", | |
| "path_norm = '/shared/ptb-xl'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "9d58acd2-2aa4-4812-9f2d-51f847e0dea2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "Y = pd.read_csv(f'{path}/ptbxl_database.csv', index_col='ecg_id')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "2e386166-bd8d-48cb-8eb6-98e4d961a062", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "filenames = Y.filename_hr.to_numpy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "dbd8d470-4b6b-4d09-9b99-0b1acfe8f0e8", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(21799,)" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "filenames.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "d9f419cd-e4cb-4df2-83fd-31388816a943", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['records500/00000/00001_hr', 'records500/00000/00002_hr',\n", | |
| " 'records500/00000/00003_hr', ..., 'records500/21000/21835_hr',\n", | |
| " 'records500/21000/21836_hr', 'records500/21000/21837_hr'],\n", | |
| " dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "filenames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "70bb277d-414c-4fb8-b63d-488f8fdb729d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "Processing ECG files: 100%|██████████| 21799/21799 [01:34<00:00, 229.62it/s]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "dataset_processing(filenames, path, path_norm)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.8" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import numpy as np | |
| # from sklearn.utils import resample | |
| from scipy.signal import resample | |
| from biosppy.signals.tools import filter_signal | |
| from tqdm import tqdm | |
| import wfdb | |
def apply_filter(
    signal,
    filter_bandwidth,
    fs=500
):
    """Band-pass filter ``signal`` with biosppy's FIR ``filter_signal``.

    Args:
        signal: Samples to filter; forwarded unchanged to ``filter_signal``.
        filter_bandwidth: Cutoff frequencies, forwarded as the ``frequency``
            argument of ``filter_signal``.
        fs: Sampling rate of ``signal``; it also determines the filter order.

    Returns:
        The first of the three values returned by ``filter_signal``
        (the filtered signal).
    """
    # The filter order is 0.3 times the sampling rate, truncated to an int.
    fir_order = int(fs * 0.3)
    fir_settings = {
        'signal': signal,
        'ftype': 'FIR',
        'band': 'bandpass',
        'order': fir_order,
        'frequency': filter_bandwidth,
        'sampling_rate': fs,
    }
    # Only the filtered samples are needed; the other two results are dropped.
    filtered, _, _ = filter_signal(**fir_settings)
    return filtered
def scaling(
    ecg_signal,
    smooth=1e-8
):
    """Min-max scale every row of ``ecg_signal`` into roughly [-1, 1].

    The minimum and maximum are taken along the last axis, so each row
    (each lead of a ``(leads, samples)`` array) is scaled independently.
    A 1-D signal or a batch with extra leading axes is handled the same way.

    Args:
        ecg_signal: Array-like whose last axis is the one to scale along.
        smooth: Small constant added to the range so that a constant row
            does not divide by zero; such a row maps to -1 everywhere.

    Returns:
        Array with the same shape as ``ecg_signal``; each row's minimum maps
        to -1 and its maximum to just under +1 (because of ``smooth``).
    """
    ecg_signal = np.asarray(ecg_signal)
    # keepdims keeps the reduced axis so the statistics broadcast against the
    # input for any rank, and each extreme is computed only once.
    lowest = np.min(ecg_signal, axis=-1, keepdims=True)
    highest = np.max(ecg_signal, axis=-1, keepdims=True)
    return 2 * (ecg_signal - lowest) / (highest - lowest + smooth) - 1
def ecg_preprocessing(
    ecg_signal,
    original_frequency,
    band_pass=(0.05, 47),
    target_frequency=500
)-> np.ndarray:
    """Resample, band-pass filter and scale a 12-lead ECG.

    Args:
        ecg_signal: Array whose first axis has length 12; resampling is done
            along axis 1.
        original_frequency: Sampling rate of ``ecg_signal``.
        band_pass: Pair of cutoff frequencies handed to ``apply_filter``.
        target_frequency: Sampling rate to resample to; it is also passed to
            ``apply_filter`` so the filter matches the resampled signal.
            NOTE(review): the upper cutoff presumably has to stay below
            ``target_frequency / 2`` -- confirm before lowering this value.

    Returns:
        The result of ``scaling`` applied to the resampled, filtered signal.

    Raises:
        AssertionError: If the first axis of ``ecg_signal`` is not 12.
    """
    # Raised explicitly (same exception type and message as the former
    # ``assert``) so the check is not stripped when Python runs with -O.
    if ecg_signal.shape[0] != 12:
        raise AssertionError(
            "ecg_signal should have (12, signal_length) shape for pre-processing"
        )

    # Multiply before dividing: when the true result is a whole number this
    # is exact, whereas scaling by a pre-rounded ratio can land just below it
    # and lose a sample to int() truncation.
    num_samples = int(
        ecg_signal.shape[-1] * target_frequency / original_frequency
    )
    ecg_signal = resample(ecg_signal, num_samples, axis=1)
    # The default is an immutable tuple; convert to a list so the filter
    # receives the same kind of object as before.
    ecg_signal = apply_filter(ecg_signal, list(band_pass), fs=target_frequency)
    return scaling(ecg_signal)
def dataset_processing(
    filenames: list[str],
    path_wfdb: str,
    path_norm: str,
    skip_existing=True
):
    """Pre-process WFDB records and store each one as a ``.npy`` file.

    For every entry in ``filenames`` the record is read with ``wfdb.rdsamp``,
    transposed, cleaned with ``np.nan_to_num`` when it contains NaNs, run
    through ``ecg_preprocessing`` and saved into ``path_norm``. A record that
    fails is reported with ``print`` and the loop moves on to the next one.

    Args:
        filenames: Record paths relative to ``path_wfdb``. Only iteration is
            required, so any iterable of strings works.
        path_wfdb: Directory the record paths are resolved against.
        path_norm: Output directory; created if it does not exist.
        skip_existing: When true, records whose output file already exists
            are not processed again.
    """
    os.makedirs(path_norm, exist_ok=True)
    for filename in tqdm(filenames, desc='Processing ECG files'):
        output_filename = f"HR{os.path.basename(filename).replace('_hr', '.hea.npy')}"
        # np.save appends ".npy" to path names that lack it. Add it here so
        # the existence check below looks at the file that really gets
        # written, even for names that do not contain "_hr".
        if not output_filename.endswith('.npy'):
            output_filename += '.npy'
        output_path = os.path.join(path_norm, output_filename)
        if skip_existing and os.path.exists(output_path):
            continue

        tmp_path = f"{output_path}.tmp"
        try:
            signal, meta = wfdb.rdsamp(os.path.join(path_wfdb, filename))
            # Transposed so that the first axis is the one ecg_preprocessing
            # checks for length 12.
            signal = signal.T
            if np.isnan(signal).any():
                signal = np.nan_to_num(signal, nan=0.0)
            signal_norm = ecg_preprocessing(signal, meta['fs'])
            # Write to a temporary file and rename it into place. An
            # interrupted run then never leaves a truncated output that
            # skip_existing would later mistake for a finished record.
            with open(tmp_path, 'wb') as tmp_file:
                np.save(tmp_file, signal_norm)
            os.replace(tmp_path, output_path)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            # Best-effort removal of a partially written temporary file.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment