#!/bin/bash
#
# Downloads the Japanese portion of the multilingual C4 (mC4) dataset from the
# Hugging Face Hub, using a URL pinned to a fixed revision of allenai/c4.
#
# Files are written below the current working directory, one sub-directory per
# dataset name. Requires wget.

# Abort on the first failed command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

readonly url_prefix=https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual

#######################################
# Fetches every shard of one dataset into a directory named after it.
# Shard files are named
#   <dataset_name>.tfrecord-<index, 5 digits>-of-<splits, 5 digits>.json.gz
# with indices running from 0 to splits - 1.
# Globals:
#   url_prefix (read)
# Arguments:
#   $1 - dataset name; also used as the output directory name
#   $2 - total number of shards (non-negative decimal integer)
# Outputs:
#   wget progress output; diagnostics on stderr
# Returns:
#   0 if all shards were fetched, 1 if a fetch failed, 2 on a bad argument
#######################################
download() {
  local dataset_name=${1:?download: dataset name is required}
  local splits=${2:?download: number of splits is required}
  local i filename target

  if [[ ! "${splits}" =~ ^[0-9]+$ ]]; then
    printf 'download: number of splits must be a non-negative integer, got "%s"\n' \
      "${splits}" >&2
    return 2
  fi
  # Force base 10 so a zero-padded count such as "08" is not parsed as octal.
  splits=$((10#${splits}))

  mkdir -p -- "${dataset_name}"

  for ((i = 0; i < splits; i++)); do
    printf -v filename '%s.tfrecord-%05d-of-%05d.json.gz' \
      "${dataset_name}" "${i}" "${splits}"
    target="${dataset_name}/${filename}"

    if ! wget -O "${target}" "${url_prefix}/${filename}"; then
      # wget -O creates the output file before the transfer starts, so a
      # failed fetch leaves an empty or truncated file behind; remove it so
      # it cannot be mistaken for a valid shard.
      rm -f -- "${target}"
      printf 'download: failed to fetch %s\n' "${url_prefix}/${filename}" >&2
      return 1
    fi
  done
}

# Japanese portion of mC4
download c4-ja-Latn-validation 1
download c4-ja-Latn 8
download c4-ja-validation 8
download c4-ja 1024