Skip to content

Instantly share code, notes, and snippets.

@chrisluedtke
Last active February 1, 2019 17:58
Show Gist options
  • Select an option

  • Save chrisluedtke/c6cec00a3a985bc6af9aef490dab54f5 to your computer and use it in GitHub Desktop.

Select an option

Save chrisluedtke/c6cec00a3a985bc6af9aef490dab54f5 to your computer and use it in GitHub Desktop.
Get UCI data urls from base url
import re
import requests
from typing import Tuple
from bs4 import BeautifulSoup
def get_uci_data_urls(url: str, timeout: float = 30.0) -> Tuple[str, str]:
    """Locate the ``.data`` and ``.names`` file URLs for a UCI dataset page.

    The dataset page at *url* is fetched and searched for an anchor whose
    text is exactly ``Data Folder``. That folder listing is then fetched and
    searched for the first link whose text contains ``.data`` and the first
    whose text contains ``.names``.

    Args:
        url: URL of the dataset's landing page.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error.

    Returns:
        A ``(data_url, names_url)`` pair. Either element is ``''`` when the
        corresponding link is not found; both are ``''`` when the page has
        no ``Data Folder`` link.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from urllib.parse import urljoin

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    folder_link = soup.find('a', string='Data Folder', href=True)
    if not folder_link:
        return '', ''

    # Resolve the href against the page URL instead of slicing strings, so
    # relative ('../x/'), root-relative ('/x/') and absolute hrefs all work.
    db_url = urljoin(url, folder_link['href'])
    if not db_url.endswith('/'):
        # Treat the folder as a directory so file names append to it rather
        # than replacing its last path segment.
        db_url += '/'

    listing = requests.get(db_url, timeout=timeout)
    soup = BeautifulSoup(listing.text, 'html.parser')

    # The dot is escaped so that only a literal '.data' / '.names' matches.
    data_link = soup.find(string=re.compile(r"\.data"), href=True)
    names_link = soup.find(string=re.compile(r"\.names"), href=True)

    data_url = urljoin(db_url, data_link['href']) if data_link else ''
    names_url = urljoin(db_url, names_link['href']) if names_link else ''
    return data_url, names_url
def _parse_attribute_section(text: str) -> dict:
    """Extract ``name -> description`` pairs from the body of a names file."""
    attrs = {}
    in_section = False
    blank_count = 0
    # splitlines() handles '\n' and '\r\n' alike; a plain split('\n') leaves
    # a trailing '\r' that defeats the blank-line test below.
    for raw_line in text.splitlines():
        if 'Attribute Information' in raw_line:
            in_section = True
            continue
        if not in_section:
            continue

        line = raw_line.strip()
        if not line:
            # The section is considered finished at the second blank line
            # seen after the header (blank lines need not be consecutive).
            blank_count += 1
            if blank_count == 2:
                break
            continue

        fields = (line.replace(' ', '')
                      .replace('\t', '')
                      .replace('.', '')
                      .split(':'))
        if len(fields) < 2:
            # No 'name: description' separator on this line; skip it
            # instead of raising IndexError.
            continue
        attrs[fields[0]] = fields[1]
    return attrs


def get_attributes(names_url: str, timeout: float = 30.0) -> dict:
    """Download a ``.names`` file and parse its attribute section.

    Parsing starts after the first line containing ``Attribute Information``
    and stops at the second blank line that follows it. Each remaining line
    has spaces, tabs and periods removed and is split on ``':'``; the first
    field becomes the key and the second field the value. Lines without a
    ``':'`` are ignored.

    Args:
        names_url: URL of the ``.names`` file to download.
        timeout: Seconds to wait on the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        A dict mapping each attribute name to its description, both with
        spaces, tabs and periods stripped out. Empty when no attribute
        section is found.
    """
    response = requests.get(names_url, timeout=timeout)
    return _parse_attribute_section(response.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment