Skip to content

Instantly share code, notes, and snippets.

@karkraeg
Created June 20, 2024 09:23
Show Gist options
  • Select an option

  • Save karkraeg/5501b6828e32fbd13891a50fb259383b to your computer and use it in GitHub Desktop.

Select an option

Save karkraeg/5501b6828e32fbd13891a50fb259383b to your computer and use it in GitHub Desktop.
{
"aar": "aa",
"abk": "ab",
"afr": "af",
"aka": "ak",
"alb": "sq",
"amh": "am",
"ara": "ar",
"arg": "an",
"arm": "hy",
"asm": "as",
"ava": "av",
"ave": "ae",
"aym": "ay",
"aze": "az",
"bak": "ba",
"bam": "bm",
"baq": "eu",
"bel": "be",
"ben": "bn",
"bih": "bh",
"bis": "bi",
"bos": "bs",
"bre": "br",
"bul": "bg",
"bur": "my",
"cat": "ca",
"cha": "ch",
"che": "ce",
"chi": "zh",
"chu": "cu",
"chv": "cv",
"cor": "kw",
"cos": "co",
"cre": "cr",
"cze": "cs",
"dan": "da",
"div": "dv",
"dut": "nl",
"dzo": "dz",
"eng": "en",
"epo": "eo",
"est": "et",
"ewe": "ee",
"fao": "fo",
"fij": "fj",
"fin": "fi",
"fre": "fr",
"fry": "fy",
"ful": "ff",
"geo": "ka",
"ger": "de",
"gla": "gd",
"gle": "ga",
"glg": "gl",
"glv": "gv",
"gre": "el",
"grn": "gn",
"guj": "gu",
"hat": "ht",
"hau": "ha",
"heb": "he",
"her": "hz",
"hin": "hi",
"hmo": "ho",
"hrv": "hr",
"hun": "hu",
"ibo": "ig",
"ice": "is",
"ido": "io",
"iii": "ii",
"iku": "iu",
"ile": "ie",
"ina": "ia",
"ind": "id",
"ipk": "ik",
"ita": "it",
"jav": "jv",
"jpn": "ja",
"kal": "kl",
"kan": "kn",
"kas": "ks",
"kau": "kr",
"kaz": "kk",
"khm": "km",
"kik": "ki",
"kin": "rw",
"kir": "ky",
"kom": "kv",
"kon": "kg",
"kor": "ko",
"kua": "kj",
"kur": "ku",
"lao": "lo",
"lat": "la",
"lav": "lv",
"lim": "li",
"lin": "ln",
"lit": "lt",
"ltz": "lb",
"lub": "lu",
"lug": "lg",
"mac": "mk",
"mah": "mh",
"mal": "ml",
"mao": "mi",
"mar": "mr",
"may": "ms",
"mlg": "mg",
"mlt": "mt",
"mon": "mn",
"nau": "na",
"nav": "nv",
"nbl": "nr",
"nde": "nd",
"ndo": "ng",
"nep": "ne",
"nno": "nn",
"nob": "nb",
"nor": "no",
"nya": "ny",
"oci": "oc",
"oji": "oj",
"ori": "or",
"orm": "om",
"oss": "os",
"pan": "pa",
"per": "fa",
"pli": "pi",
"pol": "pl",
"por": "pt",
"pus": "ps",
"que": "qu",
"roh": "rm",
"rum": "ro",
"run": "rn",
"rus": "ru",
"sag": "sg",
"san": "sa",
"sin": "si",
"slo": "sk",
"slv": "sl",
"sme": "se",
"smo": "sm",
"sna": "sn",
"snd": "sd",
"som": "so",
"sot": "st",
"spa": "es",
"srd": "sc",
"srp": "sr",
"ssw": "ss",
"sun": "su",
"swa": "sw",
"swe": "sv",
"tah": "ty",
"tam": "ta",
"tat": "tt",
"tel": "te",
"tgk": "tg",
"tgl": "tl",
"tha": "th",
"tib": "bo",
"tir": "ti",
"ton": "to",
"tsn": "tn",
"tso": "ts",
"tuk": "tk",
"tur": "tr",
"twi": "tw",
"uig": "ug",
"ukr": "uk",
"urd": "ur",
"uzb": "uz",
"ven": "ve",
"vie": "vi",
"vol": "vo",
"wel": "cy",
"wln": "wa",
"wol": "wo",
"xho": "xh",
"yid": "yi",
"yor": "yo",
"zha": "za",
"zul": "zu",
}
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
# Scrape the Library of Congress ISO 639-2 code list page and build a mapping
# from the first-column code (ISO 639-2) to the second-column code
# (ISO 639-1), then pretty-print the resulting dict.
url = "https://www.loc.gov/standards/iso639-2/php/code_list.php"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would just yield an empty mapping).
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
iso_codes = {}
for table in soup.find_all("table"):
    tableheader = table.find_all("th")
    # Only the table whose first header cell reads "ISO 639-2 Code" matters.
    if not tableheader or tableheader[0].text.strip() != "ISO 639-2 Code":
        continue
    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        # Rows with fewer than two cells cannot carry both codes.
        if len(cols) < 2:
            continue
        iso_639_1_code = cols[1].text
        # A non-breaking space in the second column is treated as "no
        # ISO 639-1 code for this language" and the row is skipped.
        if "\xa0" in iso_639_1_code:
            continue
        iso_639_2_code = cols[0].text
        if "(B)" in iso_639_2_code:
            # multiple codes found, use bibliographic code
            iso_639_2_code = re.sub(
                r"(.+?)\s\(B\).+", r"\1", iso_639_2_code
            )
        iso_codes[iso_639_2_code] = iso_639_1_code
    # The matching table has been processed; ignore any remaining tables.
    break
pprint(iso_codes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment