I would like to extract all the questions and answers from this questionnaire but I can't click on the checkboxes:
<div class="freebirdFormviewerViewItemsCheckboxChoice"><label class="docssharedWizToggleLabeledContainer freebirdFormviewerViewItemsCheckboxContainer"><div class="docssharedWizToggleLabeledLabelWrapper exportLabelWrapper"><div class="quantumWizTogglePapercheckboxEl appsMaterialWizTogglePapercheckboxCheckbox docssharedWizToggleLabeledControl freebirdThemedCheckbox freebirdThemedCheckboxDarkerDisabled freebirdFormviewerViewItemsCheckboxControl isCheckedNext" jscontroller="EcW08c" jsaction="keydown:I481le;dyRcpb:dyRcpb;click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc(preventMouseEvents=true|preventDefault=true); touchcancel:JMtRjd;" jsshadow="" jsname="FkQz1b" aria-label="Conditions about promotions clearly shown" tabindex="0" aria-describedby=" i198" role="checkbox" aria-checked="false"><div class="quantumWizTogglePapercheckboxInk exportInk"></div><div class="quantumWizTogglePapercheckboxInnerBox exportInnerBox"></div><div class="quantumWizTogglePapercheckboxCheckMarkContainer"><div class="quantumWizTogglePapercheckboxCheckMark"><div class="quantumWizTogglePapercheckboxShort exportCheck"></div><div class="quantumWizTogglePapercheckboxLong exportCheck"></div></div></div></div><div class="docssharedWizToggleLabeledContent"><div class="docssharedWizToggleLabeledPrimaryText"><span dir="auto" class="docssharedWizToggleLabeledLabelText exportLabel freebirdFormviewerViewItemsCheckboxLabel">Conditions about promotions clearly shown</span></div></div></div></label></div>
Here I want to extract the label text "Conditions about promotions clearly shown",
and I also need to tick the checkbox, because answering it is compulsory before going on to the next page.
To click on the checkboxes I tried:
# Collect every element carrying the checkbox-option wrapper class
# (the outermost <div> of the markup shown above).
btn_check_boxes = driver.find_elements_by_class_name(
"freebirdFormviewerViewItemsCheckboxChoice"
)
print("btn_check_boxes: ", btn_check_boxes)
# Click only the first match, then leave the loop.
for btn_check_box in btn_check_boxes:
btn_check_box.click()
break
But it doesn't work.
My whole code is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common import exceptions
import pickle
import config
# Timeout, in seconds, handed to every WebDriverWait during the login flow.
WDWTIME = 20
# Google account credentials, read from the local `config` module.
USER = config.username
PWD = config.password
def setup_chromedriver():
    """Start headless Chrome, sign in to the Google account and return the driver.

    Some of the Google Forms need a login, so the account credentials
    (``USER`` / ``PWD``) are submitted on the Google accounts page before any
    form is opened.

    Returns:
        The Selenium Chrome driver, with page-load and script timeouts set
        to 30 seconds.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # The options object must be handed to the driver, otherwise the
    # "--headless" flag is silently ignored. The path is a raw string because
    # "\P" and "\c" are not valid escape sequences.
    driver = webdriver.Chrome(
        r"C:\Programs\chromedriver.exe", options=chrome_options
    )

    url = 'https://www.google.com/accounts/'
    driver.get(url)

    # Find login field
    login_field = WebDriverWait(driver, WDWTIME).until(
        EC.presence_of_element_located((By.ID, 'identifierId')))
    login_field.send_keys(USER)
    # Click next button
    driver.find_element_by_id('identifierNext').click()

    # Find password field
    # NOTE(review): the fixed pause presumably covers the transition between
    # the identifier and password steps -- confirm it is still needed.
    time.sleep(4)
    driver.set_page_load_timeout(50)
    driver.set_script_timeout(50)
    password_field = WebDriverWait(driver, WDWTIME).until(
        EC.presence_of_element_located((By.ID, 'password')))
    # The element with id "password" is a wrapper; the <input> is inside it.
    password_field = password_field.find_element_by_tag_name('input')
    password_field.send_keys(PWD)
    # Click next button
    driver.find_element_by_id('passwordNext').click()

    driver.set_page_load_timeout(30)
    driver.set_script_timeout(30)
    return driver
def load_data():
    """Load the CSV export that lists the questionnaire links.

    The file is read from the current working directory. Spaces that follow
    a delimiter are dropped, so a header written as ``Link, Task`` yields the
    columns ``Link`` and ``Task`` rather than ``Link`` and ``" Task"``.

    Returns:
        tuple: ``(df, filter_col)`` where ``df`` is the loaded DataFrame and
        ``filter_col`` is the list ``["Link"]``.
    """
    df = pd.read_csv(
        "research_assistant_intern_recruitment_an.csv",
        skipinitialspace=True,
    )
    filter_col = ["Link"]
    return df, filter_col
# Class (compound, as accepted by find_elements_by_class_name) of the label
# inside the navigation buttons.
# *warning*: the "request edit access" button is also caught by it.
_NEXT_BUTTON_CLASS = "appsMaterialWizButtonPaperbuttonContent.exportButtonContent"
# Class of the card that wraps one question and its answer options.
_QUESTION_CONTAINER_CLASS = "freebirdFormviewerViewNumberedItemContainer"
# aria-labels of text inputs that must be left untouched.
_SKIPPED_INPUT_LABELS = (
    "Document title",
    "Titre du document",
    "Adresse e-mail valide",
)
# Filler typed into short text / date inputs so the page can be validated.
_DEFAULT_ANSWER = "10102015"


def _find_next_buttons():
    """Return the navigation-button labels currently present on the page."""
    return driver.find_elements_by_class_name(_NEXT_BUTTON_CLASS)


def _click_element(element):
    """Scroll *element* into view and click it, falling back to a JS click."""
    driver.execute_script(
        "arguments[0].scrollIntoView({block: 'center'});", element
    )
    try:
        element.click()
    except (ElementNotInteractableException,
            exceptions.ElementClickInterceptedException):
        # Something overlaps the control or it has no clickable area:
        # dispatch the click from the page itself instead.
        driver.execute_script("arguments[0].click();", element)


def _scrape_questions(questionnaire):
    """Store every question title and its option labels into *questionnaire*."""
    containers = driver.find_elements_by_class_name(_QUESTION_CONTAINER_CLASS)
    total = len(containers)
    for index, container in enumerate(containers, start=1):
        # NOTE(review): presumably gives the page time to settle -- confirm.
        time.sleep(1)
        print("len_containers: ", total - index)
        try:
            title = container.find_element_by_class_name(
                "freebirdFormviewerViewItemsItemItemTitle.exportItemTitle.freebirdCustomFont"
            ).text
            answers = [
                label.text
                for label in container.find_elements_by_class_name(
                    "docssharedWizToggleLabeledLabelText"
                )
            ]
        except NoSuchElementException:
            print("No question, NoSuchElementException")
            continue
        except exceptions.StaleElementReferenceException:
            print("No question, StaleElementReferenceException")
            continue
        questionnaire[title] = answers


def _fill_simple_text_inputs():
    """Type the filler answer into every "simple" text input not in the skip list."""
    fields = driver.find_elements_by_class_name(
        "quantumWizTextinputSimpleinputInput.exportInput"
    )
    for field in fields:
        label = field.get_attribute("aria-label")
        if label in _SKIPPED_INPUT_LABELS:
            print("We skip content_area.get_attribute(\"aria-label\"): ", label)
            continue
        print("We fill content_area.get_attribute(\"aria-label\"): ", label)
        field.send_keys(_DEFAULT_ANSWER)


def _answer_for_paper_input(field, label):
    """Pick the text to type into one "paper" input.

    Date inputs get the filler date, inputs with a ``max`` attribute get
    that maximum, the two-letter state field gets "CA", and anything else
    gets the filler answer.
    """
    if field.get_attribute("type") == "date":
        return _DEFAULT_ANSWER
    upper_bound = field.get_attribute("max")
    if upper_bound and not upper_bound.isspace():
        return upper_bound
    if label == "State (Two letter Abbreviation)":
        return "CA"
    return _DEFAULT_ANSWER


def _fill_paper_text_inputs():
    """Type exactly one answer into every "paper" text input not in the skip list.

    Each field is typed into once only: typing a type-specific value and then
    the filler again would leave values such as "CA10102015" in the field.
    """
    fields = driver.find_elements_by_class_name(
        "quantumWizTextinputPaperinputInput.exportInput"
    )
    for field in fields:
        label = field.get_attribute("aria-label")
        print("content_area.get_attribute(\"aria-label\"): ", label)
        if label in _SKIPPED_INPUT_LABELS:
            continue
        field.send_keys(_answer_for_paper_input(field, label))


def _click_radio_options():
    """Click every radio option on the page, ignoring the ones that cannot be clicked."""
    options = driver.find_elements_by_css_selector(
        ".appsMaterialWizToggleRadiogroupElContainer"
    )
    for option in options:
        try:
            driver.execute_script('arguments[0].scrollIntoView(true);', option)
            option.click()
        except (ElementNotInteractableException,
                exceptions.ElementClickInterceptedException):
            continue


def _fill_long_answers():
    """Type a canned review into every paragraph (textarea) question."""
    review = (
        "This restaurant is really good! Me and my boyfriend went there on our holiday "
        "we had dinner there at 3 of February food was 100% And the service vas 150% "
        "And i really want to thank "
        "\\Asie for a really good service as for his coworkers. We highly recommended "
        "this restaurant!"
    )
    text_areas = driver.find_elements_by_class_name(
        "quantumWizTextinputPapertextareaInput.exportTextarea"
    )
    for text_area in text_areas:
        text_area.send_keys(review)


def _tick_checkboxes():
    """Tick one checkbox in every checkbox question that has none ticked yet.

    The click goes to the element carrying ``role="checkbox"``: that is the
    control holding the click handler and the ``aria-checked`` state. Clicking
    the surrounding wrapper, label and label text one after the other can
    toggle the same box several times or hit a non-interactable area.
    """
    containers = driver.find_elements_by_class_name(_QUESTION_CONTAINER_CLASS)
    # Fall back to the whole page when no question card is found.
    for scope in containers or [driver]:
        try:
            boxes = scope.find_elements_by_css_selector(
                ".freebirdFormviewerViewItemsCheckboxChoice [role='checkbox']"
            )
            if not boxes:
                continue
            if any(box.get_attribute("aria-checked") == "true" for box in boxes):
                continue
            _click_element(boxes[0])
        except exceptions.StaleElementReferenceException:
            print("No check box, StaleElementReferenceException")
            continue


def get_published_questionnaire():
    """gets the questions and related answers of a google forms.

    Works on the published (viewer) version of the form that is currently
    loaded in the module-level ``driver``: it leaves the first page, scrapes
    the questions of the page that follows, answers the inputs found on that
    page (text fields, radio buttons, paragraphs, checkboxes) and clicks the
    last navigation button.

    Only one page is scraped and advanced: looping for as long as a
    navigation button exists never terminates on some forms, e.g.
    https://docs.google.com/forms/d/e/1FAIpQLScWOjVVIKX9Qis2d0vCVpo3RuYqgiZ9TkD4BZm_eTvgVdvGNg/formResponse

    Returns:
        dictionary: the dictionary of questions and answers successfully scraped.
    """
    print("published questionnaire")
    questionnaire = {}

    # Leave the first page by clicking the last navigation button, if any.
    next_btns = _find_next_buttons()
    if next_btns:
        next_btns[-1].click()

    if _find_next_buttons():
        _scrape_questions(questionnaire)
        # Answer whatever may be compulsory before moving on.
        _fill_simple_text_inputs()
        _fill_paper_text_inputs()
        _click_radio_options()
        _fill_long_answers()
        _tick_checkboxes()
        next_btns = _find_next_buttons()
        if next_btns:
            next_btns[-1].click()

    print("questionnaire: ", questionnaire)
    return questionnaire
def get_backend_questionnaire():
    """Scrape questions and answers from the editor ("backend") view of a form.

    Reads the page currently loaded in the module-level ``driver``. Cards
    without a question-title textarea are reported and skipped.

    Returns:
        dictionary: question title -> list of the ``data-initial-value``
        attribute of each answer input found in the same card.
    """
    print("backend questionnaire")
    scraped = {}

    # Every card holds one question together with its answers.
    cards = driver.find_elements_by_class_name(
        "freebirdFormeditorViewItemContentWrapper"
    )
    driver.set_page_load_timeout(30)
    driver.set_script_timeout(30)

    for card in cards:
        # The question title is the textarea labelled with the (French)
        # editor caption.
        try:
            title_box = card.find_element_by_css_selector(
                ".exportTextarea[aria-label='Intitulé de la question']"
            )
        except NoSuchElementException:
            print("NoSuchElementException in " + str(card))
            continue

        # The answers are the simple text inputs of the same card.
        answer_inputs = card.find_elements_by_css_selector(
            ".quantumWizTextinputSimpleinputInput.exportInput"
        )
        initial_values = []
        for answer_input in answer_inputs:
            initial_values.append(
                answer_input.get_attribute("data-initial-value")
            )
        scraped[title_box.text] = initial_values

        driver.set_page_load_timeout(30)
        driver.set_script_timeout(30)

    print("questionnaire backend: ", scraped)
    return scraped
def extract(driver, df, survey):
    """Scrape every Google Form listed for *survey*.

    Rows whose ``Task`` equals *survey* and whose ``Link`` points to
    docs.google.com are opened one by one. A form showing the navigation
    header button is scraped as a published form, any other as an editor
    ("backend") form.

    Args:
        driver: Selenium driver used to open each link. The two scraper
            functions read the module-level ``driver`` instead of taking a
            parameter, so this must be that same instance.
        df: DataFrame with ``Link`` and ``Task`` columns.
        survey: task name selecting the rows to process.

    Returns:
        list: one ``{str(position): questionnaire}`` dict per non-empty
        questionnaire, where ``position`` is the 1-based rank of the form
        among the Google Forms visited for this survey.
    """
    visited = 0  # Google Forms opened for this survey
    non_empty = 0  # of which produced at least one question
    result = []
    print("survey: ", survey)

    for location, task in zip(df.Link, df.Task):
        if task != survey:
            continue
        print("location: ", location)
        if "docs.google.com" not in str(location):
            continue

        # Count each form exactly once, so that the ratio printed below and
        # the keys of the result are not skewed.
        visited += 1
        driver.get(location)

        # test if it is a published version
        try:
            ask_access_btn = driver.find_elements_by_class_name(
                "freebirdFormviewerViewNavigationHeaderButtonContent"
            )
        except exceptions.UnexpectedAlertPresentException:
            # The page cannot be inspected: skip it rather than reuse the
            # answer from a previous form (or crash on an unbound name).
            print("UnexpectedAlertPresentException")
            continue

        if ask_access_btn:
            questionnaire = get_published_questionnaire()
        else:
            questionnaire = get_backend_questionnaire()

        # Answers are stored as lists, so a lone blank question shows up as
        # {'': []}; {'': ''} is kept for backward compatibility.
        if questionnaire not in ({}, {'': ''}, {'': []}):
            non_empty += 1
            result.append({str(visited): questionnaire})

    print("count_questionnaires: ", visited)
    if visited:
        print("count_not_empty/count_questionnaires: ", non_empty / visited)
    return result
if __name__ == '__main__':
    # Need to log on to the google account to access certain questionnaires.
    # Also set up chromedriver to run in headless state.
    # `driver` has to stay a module-level name: the questionnaire scrapers
    # read it as a global.
    driver = setup_chromedriver()
    try:
        published_questionnaires = []  # tracking published ones

        # Load CSV download of Google Sheet
        df, columns = load_data()

        surveys = ['Hotel ABC', "Airline XYZ", "The Ministry of Tourism of France"]
        for survey in surveys:
            result = extract(driver, df, survey)
            file_stem = survey.replace(" ", "_")
            # The context manager closes the file even if pickling fails.
            with open("applicant" + file_stem + "_c.p", "wb") as pickle_out:
                pickle.dump(result, pickle_out)

        print("published_questionnaires: ", published_questionnaires)
    finally:
        # Always shut the browser down, otherwise the Chrome and chromedriver
        # processes outlive the script.
        driver.quit()
The csv I'm loading is:
Link, Task
https://docs.google.com/forms/d/1j0nk_Oo-_pfJBM4UcWITDPXT97-qX5mZpb3uVyKS3CA/edit?usp=sharing,Hotel ABC
Aucun commentaire:
Enregistrer un commentaire