a-lakhanpal · April 16, 2021 08:08
diff --git a/main.py b/main.py
 from selenium.common.exceptions import NoSuchElementException    
 from selenium.webdriver.chrome.options import Options
 from selenium import webdriver
 from bs4 import BeautifulSoup
 import xlsxwriter
 import datetime
 import time
 import os

 def returnChromeDriver(pathToChromeDriver):
 	"""Start a Chrome WebDriver session using the given chromedriver binary.

 	Chrome is launched with a "prefs" experimental option that sets the
 	managed default content settings for JavaScript and images to 1 and
 	for stylesheets to 2 (presumably 1 = allow and 2 = block -- confirm
 	against Chrome's content-settings documentation).

 	pathToChromeDriver -- filesystem path to the chromedriver executable.
 	Returns the webdriver.Chrome instance; the caller is responsible for
 	calling quit() on it.
 	"""
 	chrome_options = Options()
 	chrome_options.add_experimental_option("prefs", {
 		'profile.managed_default_content_settings.javascript': 1,
 		'profile.managed_default_content_settings.images': 1,
 		'profile.managed_default_content_settings.stylesheet': 2,
 	})
 	# No custom user agent is configured; Chrome's default one is used.
 	# The driver path is exported to the environment as well as being
 	# passed explicitly to the constructor.
 	os.environ["webdriver.chrome.driver"] = pathToChromeDriver
 	driver = webdriver.Chrome(pathToChromeDriver, chrome_options = chrome_options)
 	return driver

 def returnSearchUrl(question):
 	"""Build the Google search URL for a question.

 	The question is lower-cased and then percent-encoded with quote_plus,
 	so spaces become "+" and characters such as "?", "'", "&" and "#"
 	are escaped instead of being interpreted as part of the URL.

 	question -- the search phrase as text.
 	Returns the complete search URL as a string.
 	"""
 	# Imported locally so the function works on Python 3 and Python 2.
 	try:
 		from urllib.parse import quote_plus
 	except ImportError:
 		from urllib import quote_plus

 	baseGoogleQuery = "https://www.google.com/search?q="
 	return baseGoogleQuery + quote_plus(question.lower())

 def clickQuestions(driver, question, totalClicks):
 	"""Load the search results for a question and expand related questions.

 	Opens the search URL for the question, then clicks up to totalClicks
 	"div.related-question-pair" elements in page order, re-querying the
 	page after every click. Clicking stops early when the page has no
 	further related question to click, instead of raising IndexError.

 	driver -- the WebDriver session to drive.
 	question -- the search phrase passed to returnSearchUrl.
 	totalClicks -- maximum number of related questions to click.
 	Returns the same driver, left on the results page.
 	"""
 	selector = 'div.related-question-pair'
 	driver.get(returnSearchUrl(question))
 	time.sleep(1)  # fixed pause, presumably to let the page render

 	questions = driver.find_elements_by_css_selector(selector)
 	if not questions:
 		return driver

 	# Reading this property is done for its side effect: Selenium scrolls
 	# the element into view when it is accessed.
 	questions[-1].location_once_scrolled_into_view

 	for questionIndex in range(totalClicks):
 		if questionIndex >= len(questions):
 			# Fewer related questions than requested clicks.
 			break

 		questions[questionIndex].click()
 		time.sleep(1)  # fixed pause before re-reading the question list

 		# Re-query after each click so newly added questions are included.
 		questions = driver.find_elements_by_css_selector(selector)
 		if questionIndex + 1 < len(questions):
 			questions[questionIndex + 1].location_once_scrolled_into_view

 	return driver

 def extractQuestionData(soup):
 	"""Collect the data of every related question in a parsed results page.

 	soup -- a BeautifulSoup document of the results page.
 	Returns a list with one dict per "div.related-question-pair" element.
 	Each dict has the keys 'relatedQuestion', 'titleTag',
 	'titleTagLength', 'answer', 'answerLength' and 'questionUrl'. A piece
 	that cannot be found in the markup is replaced by an "N/A" placeholder
 	string rather than raising.
 	"""
 	questionList = []
 	for question in soup.findAll("div", class_="related-question-pair"):
 		questionDict = {}

 		# Look the expander up once; its first div holds the question text
 		# and its third div is the fallback source for the answer.
 		expander = question.find("g-accordion-expander")
 		expanderDivs = expander.findAll("div") if expander is not None else []

 		if expanderDivs:
 			questionDict['relatedQuestion'] = expanderDivs[0].text
 		else:
 			questionDict['relatedQuestion'] = "N/A - ERROR?"

 		titleTag = question.find("h3")
 		if titleTag is not None:
 			questionDict['titleTag'] = titleTag.text
 			questionDict['titleTagLength'] = len(questionDict['titleTag'])
 		else:
 			questionDict['titleTag'] = "N/A - ERROR?"
 			questionDict['titleTagLength'] = "N/A - ERROR?"

 		# Prefer the div marked role="heading"; otherwise fall back to the
 		# third div inside the expander when there is one.
 		heading = question.find("div", {"role":"heading"})
 		if heading is not None:
 			answer = heading.text
 		elif len(expanderDivs) > 2:
 			answer = expanderDivs[2].text
 		else:
 			answer = None

 		if answer is not None:
 			questionDict['answer'] = answer
 			questionDict['answerLength'] = len(answer)
 		else:
 			questionDict['answer'] = "N/A"
 			questionDict['answerLength'] = "N/A"

 		linkContainer = question.find("div", class_="r")
 		link = linkContainer.find("a") if linkContainer is not None else None
 		href = link.get("href") if link is not None else None
 		questionDict['questionUrl'] = href if href is not None else "N/A - ERROR"

 		questionList.append(questionDict)

 	return questionList

 def writeExcelFile(allExtractedDataList):
 	"""Write all extracted question data to a timestamped .xlsx workbook.

 	The file is named "data-<timestamp>.xlsx" and is created relative to
 	the current working directory. It has one worksheet called "Data"
 	with a header row followed by one row per related question.

 	allExtractedDataList -- list of dicts holding 'initialQuestion' and
 	'relatedQuestionData' (a list of per-question dicts).
 	"""
 	timestamp = datetime.datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
 	workbook = xlsxwriter.Workbook("data-" + timestamp + ".xlsx", {'strings_to_urls': True})
 	sheet = workbook.add_worksheet("Data")

 	headers = (
 		"Initial Question",
 		"Related Question",
 		"Title Tag",
 		"Title Tag Length",
 		"Answer",
 		"Answer Length",
 		"Question URL",
 	)
 	for column, header in enumerate(headers):
 		sheet.write(0, column, header)

 	# Keys of each related-question dict, in the column order 1..6.
 	relatedKeys = (
 		'relatedQuestion',
 		'titleTag',
 		'titleTagLength',
 		'answer',
 		'answerLength',
 		'questionUrl',
 	)

 	row = 1
 	for questionData in allExtractedDataList:
 		for relatedQuestion in questionData['relatedQuestionData']:
 			sheet.write(row, 0, questionData['initialQuestion'])
 			for column, key in enumerate(relatedKeys, 1):
 				sheet.write(row, column, relatedQuestion[key])
 			row += 1

 	workbook.close()

 # Script entry: scrape the related questions for each seed question and
 # write everything that was collected to an Excel workbook.
 questions = ["Who do you think you are?", "Where do you get the nerve?", "What's your sob story?"]
 pathToChromeDriver = "/path/to/chromedriver"
 totalClicks = 10
 driver = returnChromeDriver(pathToChromeDriver)

 allExtractedDataList = []

 # try/finally makes sure the browser is shut down even if scraping fails.
 try:
 	for question in questions:
 		driver = clickQuestions(driver, question, totalClicks)
 		if driver.find_elements_by_css_selector('div.related-question-pair'):
 			soup = BeautifulSoup(driver.page_source, "lxml")
 			extractedData = extractQuestionData(soup)
 			currentQuestionDict = {'initialQuestion':question, 'relatedQuestionData':extractedData}
 			allExtractedDataList.append(currentQuestionDict)
 		else:
 			# Single-argument call form works as a statement on Python 2
 			# and as a function call on Python 3.
 			print("No Questions Found For: " + question)
 finally:
 	driver.quit()

 writeExcelFile(allExtractedDataList)
	from selenium.common.exceptions import NoSuchElementException
	from selenium.webdriver.chrome.options import Options
	from selenium import webdriver
	from bs4 import BeautifulSoup
	import xlsxwriter
	import datetime
	import time
	import os

	def returnChromeDriver(pathToChromeDriver):
		"""Start a Chrome WebDriver session using the given chromedriver binary.

		Chrome is launched with a "prefs" experimental option that sets the
		managed default content settings for JavaScript and images to 1 and
		for stylesheets to 2 (presumably 1 = allow and 2 = block -- confirm
		against Chrome's content-settings documentation).

		pathToChromeDriver -- filesystem path to the chromedriver executable.
		Returns the webdriver.Chrome instance; the caller is responsible for
		calling quit() on it.
		"""
		chrome_options = Options()
		chrome_options.add_experimental_option("prefs", {
			'profile.managed_default_content_settings.javascript': 1,
			'profile.managed_default_content_settings.images': 1,
			'profile.managed_default_content_settings.stylesheet': 2,
		})
		# No custom user agent is configured; Chrome's default one is used.
		# The driver path is exported to the environment as well as being
		# passed explicitly to the constructor.
		os.environ["webdriver.chrome.driver"] = pathToChromeDriver
		driver = webdriver.Chrome(pathToChromeDriver, chrome_options = chrome_options)
		return driver

	def returnSearchUrl(question):
		"""Build the Google search URL for a question.

		The question is lower-cased and then percent-encoded with quote_plus,
		so spaces become "+" and characters such as "?", "'", "&" and "#"
		are escaped instead of being interpreted as part of the URL.

		question -- the search phrase as text.
		Returns the complete search URL as a string.
		"""
		# Imported locally so the function works on Python 3 and Python 2.
		try:
			from urllib.parse import quote_plus
		except ImportError:
			from urllib import quote_plus

		baseGoogleQuery = "https://www.google.com/search?q="
		return baseGoogleQuery + quote_plus(question.lower())

	def clickQuestions(driver, question, totalClicks):
		"""Load the search results for a question and expand related questions.

		Opens the search URL for the question, then clicks up to totalClicks
		"div.related-question-pair" elements in page order, re-querying the
		page after every click. Clicking stops early when the page has no
		further related question to click, instead of raising IndexError.

		driver -- the WebDriver session to drive.
		question -- the search phrase passed to returnSearchUrl.
		totalClicks -- maximum number of related questions to click.
		Returns the same driver, left on the results page.
		"""
		selector = 'div.related-question-pair'
		driver.get(returnSearchUrl(question))
		time.sleep(1)  # fixed pause, presumably to let the page render

		questions = driver.find_elements_by_css_selector(selector)
		if not questions:
			return driver

		# Reading this property is done for its side effect: Selenium scrolls
		# the element into view when it is accessed.
		questions[-1].location_once_scrolled_into_view

		for questionIndex in range(totalClicks):
			if questionIndex >= len(questions):
				# Fewer related questions than requested clicks.
				break

			questions[questionIndex].click()
			time.sleep(1)  # fixed pause before re-reading the question list

			# Re-query after each click so newly added questions are included.
			questions = driver.find_elements_by_css_selector(selector)
			if questionIndex + 1 < len(questions):
				questions[questionIndex + 1].location_once_scrolled_into_view

		return driver

	def extractQuestionData(soup):
		"""Collect the data of every related question in a parsed results page.

		soup -- a BeautifulSoup document of the results page.
		Returns a list with one dict per "div.related-question-pair" element.
		Each dict has the keys 'relatedQuestion', 'titleTag',
		'titleTagLength', 'answer', 'answerLength' and 'questionUrl'. A piece
		that cannot be found in the markup is replaced by an "N/A" placeholder
		string rather than raising.
		"""
		questionList = []
		for question in soup.findAll("div", class_="related-question-pair"):
			questionDict = {}

			# Look the expander up once; its first div holds the question text
			# and its third div is the fallback source for the answer.
			expander = question.find("g-accordion-expander")
			expanderDivs = expander.findAll("div") if expander is not None else []

			if expanderDivs:
				questionDict['relatedQuestion'] = expanderDivs[0].text
			else:
				questionDict['relatedQuestion'] = "N/A - ERROR?"

			titleTag = question.find("h3")
			if titleTag is not None:
				questionDict['titleTag'] = titleTag.text
				questionDict['titleTagLength'] = len(questionDict['titleTag'])
			else:
				questionDict['titleTag'] = "N/A - ERROR?"
				questionDict['titleTagLength'] = "N/A - ERROR?"

			# Prefer the div marked role="heading"; otherwise fall back to the
			# third div inside the expander when there is one.
			heading = question.find("div", {"role":"heading"})
			if heading is not None:
				answer = heading.text
			elif len(expanderDivs) > 2:
				answer = expanderDivs[2].text
			else:
				answer = None

			if answer is not None:
				questionDict['answer'] = answer
				questionDict['answerLength'] = len(answer)
			else:
				questionDict['answer'] = "N/A"
				questionDict['answerLength'] = "N/A"

			linkContainer = question.find("div", class_="r")
			link = linkContainer.find("a") if linkContainer is not None else None
			href = link.get("href") if link is not None else None
			questionDict['questionUrl'] = href if href is not None else "N/A - ERROR"

			questionList.append(questionDict)

		return questionList

	def writeExcelFile(allExtractedDataList):
		"""Write all extracted question data to a timestamped .xlsx workbook.

		The file is named "data-<timestamp>.xlsx" and is created relative to
		the current working directory. It has one worksheet called "Data"
		with a header row followed by one row per related question.

		allExtractedDataList -- list of dicts holding 'initialQuestion' and
		'relatedQuestionData' (a list of per-question dicts).
		"""
		timestamp = datetime.datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
		workbook = xlsxwriter.Workbook("data-" + timestamp + ".xlsx", {'strings_to_urls': True})
		sheet = workbook.add_worksheet("Data")

		headers = (
			"Initial Question",
			"Related Question",
			"Title Tag",
			"Title Tag Length",
			"Answer",
			"Answer Length",
			"Question URL",
		)
		for column, header in enumerate(headers):
			sheet.write(0, column, header)

		# Keys of each related-question dict, in the column order 1..6.
		relatedKeys = (
			'relatedQuestion',
			'titleTag',
			'titleTagLength',
			'answer',
			'answerLength',
			'questionUrl',
		)

		row = 1
		for questionData in allExtractedDataList:
			for relatedQuestion in questionData['relatedQuestionData']:
				sheet.write(row, 0, questionData['initialQuestion'])
				for column, key in enumerate(relatedKeys, 1):
					sheet.write(row, column, relatedQuestion[key])
				row += 1

		workbook.close()

	# Script entry: scrape the related questions for each seed question and
	# write everything that was collected to an Excel workbook.
	questions = ["Who do you think you are?", "Where do you get the nerve?", "What's your sob story?"]
	pathToChromeDriver = "/path/to/chromedriver"
	totalClicks = 10
	driver = returnChromeDriver(pathToChromeDriver)

	allExtractedDataList = []

	# try/finally makes sure the browser is shut down even if scraping fails.
	try:
		for question in questions:
			driver = clickQuestions(driver, question, totalClicks)
			if driver.find_elements_by_css_selector('div.related-question-pair'):
				soup = BeautifulSoup(driver.page_source, "lxml")
				extractedData = extractQuestionData(soup)
				currentQuestionDict = {'initialQuestion':question, 'relatedQuestionData':extractedData}
				allExtractedDataList.append(currentQuestionDict)
			else:
				# Single-argument call form works as a statement on Python 2
				# and as a function call on Python 3.
				print("No Questions Found For: " + question)
	finally:
		driver.quit()

	writeExcelFile(allExtractedDataList)
No results found