Created
March 17, 2023 16:00
-
-
Save 2minchul/6ebd790f20cca7b48d1c3d3b71ac051d to your computer and use it in GitHub Desktop.
Revisions
-
2minchul created this gist
Mar 17, 2023 .There are no files selected for viewing
# Notice carried over from the gist viewer this file was extracted from:
#   "This file contains hidden or bidirectional Unicode text that may be
#   interpreted or compiled differently than what appears below. To review,
#   open the file in an editor that reveals hidden Unicode characters."
# Diff header of the extracted revision: @@ -0,0 +1,210 @@
"""Query the postcode.map.daum.net search page and parse its HTML results."""
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Iterator, List, Optional, Tuple
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the search page before giving up. Without a timeout a
# single stalled connection would block its worker thread (and therefore the
# whole batch in `search_many_postcode`) forever.
DEFAULT_TIMEOUT = 10.0

# Prefix of the HTML attributes that carry the address fields.
_DATA_PREFIX = 'data-'

# Browser-like request headers sent with every search request.
_SEARCH_HEADERS = {
    'authority': 'postcode.map.daum.net',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/avif,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    'referer': 'https://postcode.map.daum.net',
    'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'iframe',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
}


def search_many_postcode(queries: List[Tuple[str, str]]) -> Dict[tuple, Optional[dict]]:
    """
    Look up several (keyword, zipcode) pairs concurrently.

    Each pair is resolved with `search_postcode` on a pool of up to 10 worker
    threads. An exception raised by any single lookup propagates to the caller
    when its result is collected.

    :param queries: list of tuple(keyword, zipcode)
    :return: {tuple(keyword, zipcode): dict of address}, with ``None`` as the
        value for pairs that had no matching search result
    """
    result: Dict[tuple, Optional[dict]] = {}
    # NOTE(review): one Session is shared by every worker thread; confirm this
    # is acceptable for the installed `requests` version.
    with ThreadPoolExecutor(max_workers=10) as pool, requests.Session() as session:
        # Map each future back to the query pair that produced it.
        futures = {
            pool.submit(search_postcode, keyword, zipcode, session): (keyword, zipcode)
            for keyword, zipcode in queries
        }
        for future in as_completed(futures):
            result[futures[future]] = future.result()
    return result


def search_postcode(keyword: str, zipcode: str,
                    session: Optional[requests.Session] = None) -> Optional[dict]:
    """
    Search for `keyword` and return the first result whose zonecode is `zipcode`.

    :param keyword: address text to search for
    :param zipcode: zonecode the result must match exactly
    :param session: optional `requests.Session` to send the request with
    :return: the matching address dict, or ``None`` when no result matches
    """
    html = request_search_postcode(keyword, session)
    for data in parse_iter_postcode(html, keyword):
        if data.get('zonecode') == zipcode:
            return data
    return None


def request_search_postcode(keyword: str, session: Optional[requests.Session] = None,
                            timeout: Optional[float] = DEFAULT_TIMEOUT) -> str:
    """
    Fetch the search result page for `keyword` and return its HTML text.

    :param keyword: address text to search for
    :param session: optional `requests.Session`; when omitted a one-off
        `requests.get` call is made
    :param timeout: seconds to wait for the server; ``None`` waits indefinitely
    :return: body of the response as text
    :raises requests.HTTPError: when the server answers with an error status
    """
    # Percent-encode the keyword so characters such as '&', '#' or '=' cannot
    # break out of their query parameter.
    encoded = quote(keyword, safe='')
    url = (
        'https://postcode.map.daum.net/search?'
        f'region_name={encoded}&cq={encoded}&cpage=1&origin=https://postcode.map.daum.net&'
        'isp=N&isgr=N&isgj=N&ongr=&ongj=&regionid=&regionname=&roadcode=&roadname=&banner=on&'
        'ubl=on&indaum=off&vt=popup&amr=on&amj=on&ani=off&mode=transmit&sd=on&fi=on&fc=on&hmb=off&'
        'heb=off&asea=off&smh=off&zo=on&theme=&bit=&sit=&sgit=&sbit=&pit=&mit=&lcit=&plrg=&plrgt=1.5&'
        'us=on&msi=10&ahs=off&whas=500&zn=Y&sm=on&CWinWidth=500&sptype=&sporgq=&a51=off'
    )

    response: requests.Response
    if session is not None:
        response = session.get(url, headers=_SEARCH_HEADERS, timeout=timeout)
    else:
        response = requests.get(url, headers=_SEARCH_HEADERS, timeout=timeout)

    # The context manager releases the connection once the text has been read.
    with response:
        response.raise_for_status()
        return response.text


def _get_data_attrs(tag) -> dict:
    """Return the ``data-*`` attributes of `tag` with the prefix stripped.

    A missing tag (``None``) yields an empty dict.
    """
    if tag is None:
        return {}
    return {
        name[len(_DATA_PREFIX):]: value
        for name, value in tag.attrs.items()
        if name.startswith(_DATA_PREFIX)
    }


def _address_span_attrs(address_dl, main_class: str, rel_class: str) -> dict:
    """Return the data attributes of the address span under the first matching <dd>.

    The ``<dd>`` with class `main_class` is preferred; `rel_class` is the fallback.
    """
    dd_tag = (address_dl.find('dd', attrs={'class': main_class})
              or address_dl.find('dd', attrs={'class': rel_class}))
    if dd_tag is None:
        return {}
    return _get_data_attrs(dd_tag.find('span', attrs={'class': 'txt_address'}))


def parse_iter_postcode(html: str, keyword: str = '') -> Iterator[dict]:
    """
    Yield one address dict per result item found in the search page `html`.

    Result items are the ``<li>`` elements of ``<ul class="list_post">``; items
    without a ``<dl class="list_address">`` are skipped. Nothing is yielded
    when the result list is absent.

    :param html: HTML text of the search result page
    :param keyword: search text, copied into each dict under ``'query'``
    """
    soup = BeautifulSoup(html, 'html.parser')
    ul = soup.find('ul', attrs={'class': 'list_post'})
    if ul is None:
        return

    for li_tag in ul.find_all('li'):
        searched = _get_data_attrs(li_tag)
        address_dl = li_tag.find('dl', attrs={'class': 'list_address'})
        if address_dl is None:
            continue

        road = _address_span_attrs(address_dl, 'main_road', 'rel_road')
        jibun = _address_span_attrs(address_dl, 'main_jibun', 'rel_jibun')

        # Fields missing on the <li> fall back to the road span, then the jibun span.
        yield {
            'query': keyword,
            '_from': 'html',
            'addressType': searched.get('addr_type', ''),
            'userSelectedType': searched.get('addr_type', ''),
            'address': searched.get('addr', ''),
            'addressEnglish': searched.get('addr_eng', ''),
            'bcode': searched.get('bcode', ''),
            'bname': searched.get('bname', ''),
            'bnameEnglish': searched.get('bname_eng', ''),
            'bname1': searched.get('bname1', ''),
            'bname1English': searched.get('bname1_eng', ''),
            'bname2': searched.get('bname2', ''),
            'bname2English': searched.get('bname2_eng', ''),
            'buildingCode': (searched.get('building_code', '')
                             or road.get('building_code', '')
                             or jibun.get('building_code', '')),
            'buildingName': (searched.get('building_name', '')
                             or road.get('building_name', '')
                             or jibun.get('building_name', '')),
            'hname': searched.get('hname', ''),
            'apartment': searched.get('is_multi_building', '') == 'true',
            'roadname': searched.get('roadname', '') or road.get('roadname', ''),
            'roadnameCode': searched.get('roadname_code', '') or road.get('roadname_code', ''),
            'roadnameEnglish': searched.get('roadname_eng', '') or road.get('roadname_eng', ''),
            'sido': searched.get('sido', ''),
            'sidoEnglish': searched.get('sido_eng', ''),
            'sigungu': searched.get('sigungu', ''),
            'sigunguCode': searched.get('sigungu_code', ''),
            'sigunguEnglish': searched.get('sigungu_eng', ''),
            'zonecode': searched.get('zonecode', ''),
            'jibunAddress': jibun.get('addr', ''),
            'jibunAddressEnglish': jibun.get('addr_eng', ''),
            'roadAddress': road.get('addr', ''),
            'roadAddressEnglish': road.get('addr_eng', ''),
        }


if __name__ == '__main__':
    from pprint import pprint

    result = search_many_postcode([
        ('서울 강남구 가로수길 5', '06035'),
        ('서울 동작구 동작동 316', '06905'),
        ('없는주소', '12345'),
    ])
    pprint(result)

"""output
{('서울 강남구 가로수길 5', '06035'): {'_from': 'html',
                              'address': '서울 강남구 가로수길 5',
                              'addressEnglish': '5, Garosu-gil, Gangnam-gu, '
                                                'Seoul, Korea',
                              'addressType': 'R',
                              'apartment': False,
                              'bcode': '1168010700',
                              'bname': '신사동',
                              'bname1': '',
                              'bname1English': '',
                              'bname2': '신사동',
                              'bname2English': 'Sinsa-dong',
                              'bnameEnglish': 'Sinsa-dong',
                              'buildingCode': '1168010700105370005011918',
                              'buildingName': '',
                              'hname': '',
                              'jibunAddress': '서울 강남구 신사동 537-5',
                              'jibunAddressEnglish': '537-5, Sinsa-dong, '
                                                     'Gangnam-gu, Seoul, Korea',
                              'query': '서울 강남구 가로수길 5',
                              'roadAddress': '서울 강남구 가로수길 5',
                              'roadAddressEnglish': '5, Garosu-gil, '
                                                    'Gangnam-gu, Seoul, Korea',
                              'roadname': '가로수길',
                              'roadnameCode': '4858362',
                              'roadnameEnglish': 'Garosu-gil',
                              'sido': '서울',
                              'sidoEnglish': 'Seoul',
                              'sigungu': '강남구',
                              'sigunguCode': '11680',
                              'sigunguEnglish': 'Gangnam-gu',
                              'userSelectedType': 'R',
                              'zonecode': '06035'},
 ('서울 동작구 동작동 316', '06905'): {'_from': 'html',
                               'address': '서울 동작구 동작동 316',
                               'addressEnglish': '316, Dongjak-dong, '
                                                 'Dongjak-gu, Seoul, Korea',
                               'addressType': 'J',
                               'apartment': False,
                               'bcode': '1159010600',
                               'bname': '동작동',
                               'bname1': '',
                               'bname1English': '',
                               'bname2': '동작동',
                               'bname2English': 'Dongjak-dong',
                               'bnameEnglish': 'Dongjak-dong',
                               'buildingCode': '1159010600103160000000001',
                               'buildingName': '반포수난구조대',
                               'hname': '사당2동',
                               'jibunAddress': '서울 동작구 동작동 316',
                               'jibunAddressEnglish': '316, Dongjak-dong, '
                                                      'Dongjak-gu, Seoul, '
                                                      'Korea',
                               'query': '서울 동작구 동작동 316',
                               'roadAddress': '서울 동작구 동작대로 335-1',
                               'roadAddressEnglish': '335-1, Dongjak-daero, '
                                                     'Dongjak-gu, Seoul, Korea',
                               'roadname': '동작대로',
                               'roadnameCode': '2005009',
                               'roadnameEnglish': 'Dongjak-daero',
                               'sido': '서울',
                               'sidoEnglish': 'Seoul',
                               'sigungu': '동작구',
                               'sigunguCode': '11590',
                               'sigunguEnglish': 'Dongjak-gu',
                               'userSelectedType': 'J',
                               'zonecode': '06905'},
 ('없는주소', '12345'): None}
"""