chrisluedtke · February 1, 2019 17:58
diff --git a/get_uci_data_urls.py b/get_uci_data_urls.py
 import re
 import requests
 from typing import Tuple

 from bs4 import BeautifulSoup

 def get_uci_data_urls(url: str) -> Tuple[str, str]:
     """Find the ``.data`` and ``.names`` file URLs linked from a dataset page.

     The page at ``url`` is fetched and searched for an ``<a>`` element whose
     text is exactly ``Data Folder``. That link is followed, and the folder
     page is searched for the first href-bearing element whose text contains
     ``.data`` and the first whose text contains ``.names``.

     Args:
         url: Address of the dataset page to start from.

     Returns:
         A ``(data_url, names_url)`` pair. Either entry is ``''`` when the
         corresponding link is missing; both are ``''`` when the page has no
         ``Data Folder`` link.

     Anything raised by ``requests.get`` is not caught here and propagates to
     the caller.
     """
     # Function-scope import so the block does not depend on edits to the
     # file's import section.
     from urllib.parse import urljoin

     r = requests.get(url)
     soup = BeautifulSoup(r.text, 'html.parser')

     match = soup.find('a', string='Data Folder', href=True)
     if not match:
         return '', ''

     # urljoin resolves '../folder/' style hrefs exactly as the previous
     # manual split/slice did, and also copes with absolute or root-relative
     # hrefs that the manual version mangled.
     db_url = urljoin(url, match['href'])

     r = requests.get(db_url)
     soup = BeautifulSoup(r.text, 'html.parser')

     # The dot is escaped so only a literal ".data" / ".names" matches; an
     # unescaped "." would also accept text such as "metadata" or "filenames".
     data_link = soup.find(string=re.compile(r"\.data"), href=True)
     names_link = soup.find(string=re.compile(r"\.names"), href=True)

     data_url = urljoin(db_url, data_link['href']) if data_link else ''
     names_url = urljoin(db_url, names_link['href']) if names_link else ''

     return data_url, names_url

 def get_attributes(names_url: str) -> dict:
     """Extract ``name: description`` pairs from a ``.names`` file.

     The text at ``names_url`` is downloaded and scanned line by line.
     Collection starts after the first line containing
     ``Attribute Information`` and stops at the second blank line seen after
     that header. Each collected line has double spaces, tabs and periods
     removed and is then split on ``:``; the first piece becomes the key and
     the second the value.

     Args:
         names_url: Address of the ``.names`` text file.

     Returns:
         A dict mapping the text before the first colon to the text between
         the first and second colon, for every collected line that contains a
         colon. Empty when no header line is found.

     Anything raised by ``requests.get`` is not caught here and propagates to
     the caller.
     """
     r = requests.get(names_url)

     attr_found = False
     blank_count = 0
     attrs = {}
     # splitlines() copes with '\r\n' endings, which split('\n') left as a
     # trailing '\r' that defeated the blank-line check below.
     for line in r.text.splitlines():
         if 'Attribute Information' in line:
             attr_found = True
             continue
         if not attr_found:
             continue
         # Whitespace-only lines count as blank so the stop condition fires.
         if not line.strip():
             blank_count += 1
             if blank_count == 2:
                 break
             continue
         parts = (line.replace('  ', '')
                      .replace('\t', '')
                      .replace('.', '')
                      .split(':'))
         # Lines without a colon (e.g. continuation text) have no value part;
         # skip them instead of raising IndexError.
         if len(parts) < 2:
             continue
         attrs[parts[0]] = parts[1]

     return attrs
	import re
	import requests
	from typing import Tuple

	from bs4 import BeautifulSoup

	def get_uci_data_urls(url: str) -> Tuple[str, str]:
	    """Find the ``.data`` and ``.names`` file URLs linked from a dataset page.

	    The page at ``url`` is fetched and searched for an ``<a>`` element whose
	    text is exactly ``Data Folder``. That link is followed, and the folder
	    page is searched for the first href-bearing element whose text contains
	    ``.data`` and the first whose text contains ``.names``.

	    Args:
	        url: Address of the dataset page to start from.

	    Returns:
	        A ``(data_url, names_url)`` pair. Either entry is ``''`` when the
	        corresponding link is missing; both are ``''`` when the page has no
	        ``Data Folder`` link.

	    Anything raised by ``requests.get`` is not caught here and propagates to
	    the caller.
	    """
	    # Function-scope import so the block does not depend on edits to the
	    # file's import section.
	    from urllib.parse import urljoin

	    r = requests.get(url)
	    soup = BeautifulSoup(r.text, 'html.parser')

	    match = soup.find('a', string='Data Folder', href=True)
	    if not match:
	        return '', ''

	    # urljoin resolves '../folder/' style hrefs the same way a manual
	    # split/slice would, and also copes with absolute or root-relative
	    # hrefs.
	    db_url = urljoin(url, match['href'])

	    r = requests.get(db_url)
	    soup = BeautifulSoup(r.text, 'html.parser')

	    # The dot is escaped so only a literal ".data" / ".names" matches; an
	    # unescaped "." would also accept text such as "metadata" or "filenames".
	    data_link = soup.find(string=re.compile(r"\.data"), href=True)
	    names_link = soup.find(string=re.compile(r"\.names"), href=True)

	    data_url = urljoin(db_url, data_link['href']) if data_link else ''
	    names_url = urljoin(db_url, names_link['href']) if names_link else ''

	    return data_url, names_url

	def get_attributes(names_url: str) -> dict:
	    """Extract ``name: description`` pairs from a ``.names`` file.

	    The text at ``names_url`` is downloaded and scanned line by line.
	    Collection starts after the first line containing
	    ``Attribute Information`` and stops at the second blank line seen after
	    that header. Each collected line has double spaces, tabs and periods
	    removed and is then split on ``:``; the first piece becomes the key and
	    the second the value.

	    Args:
	        names_url: Address of the ``.names`` text file.

	    Returns:
	        A dict mapping the text before the first colon to the text between
	        the first and second colon, for every collected line that contains a
	        colon. Empty when no header line is found.

	    Anything raised by ``requests.get`` is not caught here and propagates to
	    the caller.
	    """
	    r = requests.get(names_url)

	    attr_found = False
	    blank_count = 0
	    attrs = {}
	    # splitlines() copes with '\r\n' endings, which split('\n') would leave
	    # as a trailing '\r' that defeats the blank-line check below.
	    for line in r.text.splitlines():
	        if 'Attribute Information' in line:
	            attr_found = True
	            continue
	        if not attr_found:
	            continue
	        # Whitespace-only lines count as blank so the stop condition fires.
	        if not line.strip():
	            blank_count += 1
	            if blank_count == 2:
	                break
	            continue
	        # NOTE(review): only runs of two spaces are removed, so single
	        # spaces between words survive; stripping every single space would
	        # fuse multi-word names together. Confirm against upstream.
	        parts = (line.replace('  ', '')
	                     .replace('\t', '')
	                     .replace('.', '')
	                     .split(':'))
	        # Lines without a colon (e.g. continuation text) have no value part;
	        # skip them instead of raising IndexError.
	        if len(parts) < 2:
	            continue
	        attrs[parts[0]] = parts[1]

	    return attrs
No results found