Skip to content

Instantly share code, notes, and snippets.

@PostPCEra
Forked from gitdagray/kanview_complete.py
Last active September 25, 2020 16:21
Show Gist options
  • Select an option

  • Save PostPCEra/7a07a58a801e8a26461783ed7bc108a4 to your computer and use it in GitHub Desktop.

Select an option

Save PostPCEra/7a07a58a801e8a26461783ed7bc108a4 to your computer and use it in GitHub Desktop.
Full Python code for web scraping Kanview utilizing Selenium, Beautiful Soup, and pandas
// source : https://date-fns.org/
import { addYears, formatWithOptions, toUpper } from 'date-fns/fp'
import { eo } from 'date-fns/locale'
const addFiveYears = addYears(5)
const dateToString = formatWithOptions({ locale: eo }, 'D MMMM YYYY')
const dates = [
new Date(2017, 0, 1),
new Date(2017, 1, 11),
new Date(2017, 6, 2)
]
const toUpper = arg => String(arg).toUpperCase()
const formattedDates = dates.map(addFiveYears).map(dateToString).map(toUpper)
//=> ['1 JANUARO 2022', '11 FEBRUARO 2022', '2 JULIO 2022']
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
from tabulate import tabulate
import os
#launch url
url = "http://kanview.ks.gov/PayRates/PayRates_Agency.aspx"
# create a new Firefox session
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
#After opening the url above, Selenium clicks the specific agency link
python_button = driver.find_element_by_id('MainContent_uxLevel1_Agencies_uxAgencyBtn_33') #FHSU
python_button.click() #click fhsu link
#Selenium hands the page source to Beautiful Soup
soup_level1=BeautifulSoup(driver.page_source, 'lxml')
datalist = [] #empty list
x = 0 #counter
#Beautiful Soup finds all Job Title links on the agency page and the loop begins
for link in soup_level1.find_all('a', id=re.compile("^MainContent_uxLevel2_JobTitles_uxJobTitleBtn_")):
#Selenium visits each Job Title page
python_button = driver.find_element_by_id('MainContent_uxLevel2_JobTitles_uxJobTitleBtn_' + str(x))
python_button.click() #click link
#Selenium hands of the source of the specific job page to Beautiful Soup
soup_level2=BeautifulSoup(driver.page_source, 'lxml')
#Beautiful Soup grabs the HTML table on the page
table = soup_level2.find_all('table')[0]
#Giving the HTML table to pandas to put in a dataframe object
df = pd.read_html(str(table),header=0)
#Store the dataframe in a list
datalist.append(df[0])
#Ask Selenium to click the back button
driver.execute_script("window.history.go(-1)")
#increment the counter variable before starting the loop over
x += 1
#end loop block
#loop has completed
#end the Selenium browser session
driver.quit()
#combine all pandas dataframes in the list into one big dataframe
result = pd.concat([pd.DataFrame(datalist[i]) for i in range(len(datalist))],ignore_index=True)
#convert the pandas dataframe to JSON
json_records = result.to_json(orient='records')
#pretty print to CLI with tabulate
#converts to an ascii table
print(tabulate(result, headers=["Employee Name","Job Title","Overtime Pay","Total Gross Pay"],tablefmt='psql'))
#get current working directory
path = os.getcwd()
#open, write, and close the file
f = open(path + "\\fhsu_payroll_data.json","w") #FHSU
f.write(json_records)
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment