Skip to content

Instantly share code, notes, and snippets.

@Sofrosyn
Created January 26, 2024 10:55
Show Gist options
  • Select an option

  • Save Sofrosyn/a6d55b5500a59c70b76dab8784672e80 to your computer and use it in GitHub Desktop.

Select an option

Save Sofrosyn/a6d55b5500a59c70b76dab8784672e80 to your computer and use it in GitHub Desktop.
Script to get courses
import os
import time

import pandas as pd
from selenium.webdriver.common.by import By

import config
from src.room.room import session, University
all_university = []
"""
A function to scrape the list of all universities.
Saves into an sqlite database
"""
def scrape_universities():
for page in range(1, 27):
url = f"https://www.thecompleteuniversityguide.co.uk/universities?pg={page}"
driver = config.configure_web_driver(url)
# Scrape the university names
university_elements = driver.find_elements(By.TAG_NAME, "h3")
for university_name in university_elements:
if len(university_name.text.strip()) > 1:
print(university_name.text, end="\n")
session.add(University(university_name=university_name.text))
session.commit()
driver.close()
time.sleep(5.0)
"""
A function that counts the number of sheets in a spreadsheet file
"""
def count_sheets():
xl = pd.ExcelFile('courses.xlsx')
sheet_names = xl.sheet_names
for names in sheet_names:
print(f'{names}\n')
print(f'total : {len(sheet_names)}')
def scrape_courses(university_name: str, max_pages: int):
    """Scrape undergraduate course names for one university and save them
    to ``courses.xlsx`` as a sheet named after the university.

    Args:
        university_name: Display name of the university; slugified
            (lowercased, spaces -> hyphens) to build the listing URL.
        max_pages: Number of listing pages to fetch (1..max_pages inclusive).
    """
    course_list = []
    # Hoisted out of the loop: the slug does not change between pages.
    slug = university_name.lower().replace(' ', '-')
    for page in range(1, max_pages + 1):
        print(f'------ fetching page {page} ---------------')
        url = f"https://www.thecompleteuniversityguide.co.uk/courses/university-search/undergraduate/all/{slug}?pg={page}"
        driver = config.configure_web_driver(url)
        try:
            # Course names carry the 'pr_shrt' CSS class on the results page.
            for course in driver.find_elements(By.CLASS_NAME, 'pr_shrt'):
                if len(course.text.strip()) > 1:
                    course_list.append(course.text)
                    print(course.text)
        finally:
            # quit() releases the browser even on error; close() only closes
            # the window and can leak the driver process.
            driver.quit()
        time.sleep(5)  # be polite to the server between page fetches
    df = pd.DataFrame(course_list, columns=['Courses'])
    # mode='a' raises FileNotFoundError when the workbook does not exist yet,
    # so fall back to mode='w' on the first run ('if_sheet_exists' is only
    # valid in append mode).
    if os.path.exists('courses.xlsx'):
        writer_args = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_args = {'mode': 'w'}
    with pd.ExcelWriter('courses.xlsx', **writer_args) as writer:
        df.to_excel(writer, sheet_name=university_name)
    print(f'{university_name} sheet saved')
def export_university():
    """Dump every stored university name from the database into
    ``university.xlsx`` (sheet 'Uk University'), echoing each name."""
    names = []
    for record in session.query(University).all():
        names.append(record.university_name)
        print(record.university_name)
    frame = pd.DataFrame(names, columns=["University"])
    frame.to_excel('university.xlsx', sheet_name='Uk University')
    print("Export complete")
if __name__ == "__main__":
# for university, page in mainList.items():
# print(f'university -> {university}, page -> {page}')
# scrape_courses(university_name=university, max_pages=page)
count_sheets()
# scrape_universities()
# export_university()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment