"""Scrape GE14 (PRU14) candidate lists from calon.spr.gov.my.

Fetches every Parlimen and DUN (state) seat's candidates via the site's
ajax endpoint and writes them to election_results/parlimen.csv and
election_results/state.csv, sorted by seat id.
"""
import csv
import json
import operator
import re
import time

import requests
from bs4 import BeautifulSoup

import pru_14_json  # https://calon.spr.gov.my/pru14_json.js

# NOTE(review): the Cookie header pins a specific PHPSESSID — it will go
# stale; consider a requests.Session() that picks cookies up automatically.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.5',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '_ga=GA1.3.1001500445.1524900877; _gid=GA1.3.593399681.1524900877; PHPSESSID=56433b80694baae3aad91e66563c3484',
    'DNT': '1',
    'Host': 'calon.spr.gov.my',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0',
}


def _collect_seats(kind):
    """Flatten the per-state seat lists from pru_14_json.

    kind is 'parlimen' or 'dun' (the two seat lists each state carries).
    """
    return [
        seat
        for state in pru_14_json.json_data
        for seat in pru_14_json.json_data[state][kind]
    ]


def _fetch_initial_token():
    """Fetch the landing page and pull the first CSRF-style token from it.

    The site hides the token in an <input id="to_spr_ken"> element; every
    subsequent ajax response supplies the next token to use.
    """
    # NOTE(review): original code POSTs here despite the "Initial get"
    # comment — kept as POST to preserve the live-site behavior.
    r = requests.post('https://calon.spr.gov.my/', headers=HEADERS)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup.find('input', {'id': 'to_spr_ken'}).get('value')


def _scrape_seats(seats, token, id_key):
    """POST each seat code to the ajax endpoint and collect its candidates.

    seats:  list of seat dicts (must carry 'id' and id_key keys).
    token:  the current request token; each response returns the next one.
    id_key: seat key used as the sort id ('kerusi_id' or 'state_id').

    Returns (records, token): records sorted by id, plus the freshest
    token so the next batch of requests can continue the chain.
    """
    records = []
    for seat in seats:
        payload = {'kod': str(seat['id']), 'token': token}
        r = requests.post(
            url='https://calon.spr.gov.my/ajax.php',
            data=payload,
            headers=HEADERS,
            allow_redirects=True,
        )
        response_json = r.json()
        for calon in response_json['calon']:
            record = {'seat': seat, 'calon': calon, 'id': seat[id_key]}
            records.append(record)
            print(record)
        token = response_json['token']  # refresh token for the next request
        time.sleep(1)  # don't overload the site hashtag:#responsibleScraper
    records.sort(key=operator.itemgetter('id'))  # sort by seat id, e.g P.001
    return records, token


def _write_csv(path, header_row, rows):
    """Write header_row + rows to path as a standard comma-separated CSV.

    newline="" is required by the csv module — without it Windows writes
    an extra blank line between every record.
    """
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(
            csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        csv_writer.writerow(header_row)
        csv_writer.writerows(rows)


def main():
    """Run the full scrape: parlimen seats first, then state (DUN) seats."""
    parlimen_seats = _collect_seats('parlimen')
    state_seats = _collect_seats('dun')

    token = _fetch_initial_token()

    # Parlimen candidates
    records, token = _scrape_seats(parlimen_seats, token, 'kerusi_id')
    _write_csv(
        "election_results/parlimen.csv",
        ["Seat ID", "Seat Name", "Candidate Name",
         "Candidate Ballot Name", "Candidate Party"],
        [[r['seat']['kerusi_id'],
          r['seat']['name'],
          r['calon']['nama'],
          r['calon']['nama_undi'],
          r['calon']['parti']] for r in records],
    )

    # State (DUN) candidates
    records, token = _scrape_seats(state_seats, token, 'state_id')
    _write_csv(
        "election_results/state.csv",
        ["State", "Seat ID", "Seat Name", "Candidate Name",
         "Candidate Ballot Name", "Candidate Party"],
        [[pru_14_json.state_mapping[r['seat']['state_id']],
          r['seat']['kerusi_id'],
          r['seat']['name'],
          r['calon']['nama'],
          r['calon']['nama_undi'],
          r['calon']['parti']] for r in records],
    )

    # end like a boss
    print("Keith is awesome!")


if __name__ == "__main__":
    main()