mardi 16 juin 2020

Clicking on checkbox with Selenium

I would like to extract all the questions and answers from this questionnaire but I can't click on the checkboxes:

<div class="freebirdFormviewerViewItemsCheckboxChoice"><label class="docssharedWizToggleLabeledContainer freebirdFormviewerViewItemsCheckboxContainer"><div class="docssharedWizToggleLabeledLabelWrapper exportLabelWrapper"><div class="quantumWizTogglePapercheckboxEl appsMaterialWizTogglePapercheckboxCheckbox docssharedWizToggleLabeledControl freebirdThemedCheckbox freebirdThemedCheckboxDarkerDisabled freebirdFormviewerViewItemsCheckboxControl isCheckedNext" jscontroller="EcW08c" jsaction="keydown:I481le;dyRcpb:dyRcpb;click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc(preventMouseEvents=true|preventDefault=true); touchcancel:JMtRjd;" jsshadow="" jsname="FkQz1b" aria-label="Conditions about promotions clearly shown" tabindex="0" aria-describedby="  i198" role="checkbox" aria-checked="false"><div class="quantumWizTogglePapercheckboxInk exportInk"></div><div class="quantumWizTogglePapercheckboxInnerBox exportInnerBox"></div><div class="quantumWizTogglePapercheckboxCheckMarkContainer"><div class="quantumWizTogglePapercheckboxCheckMark"><div class="quantumWizTogglePapercheckboxShort exportCheck"></div><div class="quantumWizTogglePapercheckboxLong exportCheck"></div></div></div></div><div class="docssharedWizToggleLabeledContent"><div class="docssharedWizToggleLabeledPrimaryText"><span dir="auto" class="docssharedWizToggleLabeledLabelText exportLabel freebirdFormviewerViewItemsCheckboxLabel">Conditions about promotions clearly shown</span></div></div></div></label></div>

Here I want to extract Conditions about promotions clearly shown and I need to click on it as it is compulsory to go on the next page:

To click on them I tried:

            btn_check_boxes = driver.find_elements_by_class_name(
                "freebirdFormviewerViewItemsCheckboxChoice"
            )
            print("btn_check_boxes: ", btn_check_boxes)
            for btn_check_box in btn_check_boxes:
                btn_check_box.click()
                break

But it doesn't work.

My whole code is:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common import exceptions
import pickle

import config

WDWTIME = 20
USER = config.username
PWD = config.password

def setup_chromedriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome("C:\Programs\chromedriver.exe")
    """Some of the google forms need a login"""
    url = 'https://www.google.com/accounts/'
    driver.get(url)
    # Find login field
    login_field = WebDriverWait(driver, WDWTIME).until(
        EC.presence_of_element_located((By.ID, 'identifierId')))
    login_field.send_keys(USER)
    # Click next button
    driver.find_element_by_id('identifierNext').click()
    # Find password field
    time.sleep(4)
    driver.set_page_load_timeout(50)
    driver.set_script_timeout(50)
    password_field = WebDriverWait(driver, WDWTIME).until(
        EC.presence_of_element_located((By.ID, 'password')))
    password_field = password_field.find_element_by_tag_name('input')
    password_field.send_keys(PWD)
    # Click next button
    driver.find_element_by_id('passwordNext').click()
    driver.set_page_load_timeout(30)
    driver.set_script_timeout(30)
    return driver


def load_data():
    df = pd.read_csv("research_assistant_intern_recruitment_an.csv")
    filter_col = ["Link"]
    return df, filter_col


def get_published_questionnaire():
    """gets the questions and related answers of a google forms.

    Returns:
        dictionary: the dictionary of questions and answers successfully scraped.
    """
    print("published questionnaire")
    questionnaire = {}
    btns = driver.find_elements_by_css_selector(".appsMaterialWizButtonEl")
    # get "next" button, *warning* "request edit access" is also catched
    next_btns = driver.find_elements_by_class_name("appsMaterialWizButtonPaperbuttonContent.exportButtonContent")
    if next_btns:
        next_btns[-1].click()
        next_btns = driver.find_elements_by_class_name("appsMaterialWizButtonPaperbuttonContent.exportButtonContent")
        # we iterate to find questions and click on the next page while there is a button we can click on
        # *warning* for some google forms like
        # https://docs.google.com/forms/d/e/1FAIpQLScWOjVVIKX9Qis2d0vCVpo3RuYqgiZ9TkD4BZm_eTvgVdvGNg/formResponse
        # it creates an infinite loop
        while next_btns != []:
            containers = driver.find_elements_by_class_name(
                "freebirdFormviewerViewNumberedItemContainer"
            )
            len_containers = len(containers)
            for container in containers:
                time.sleep(0.5)
                len_containers -=1
                print("len_containers: ", len_containers)
                try:
                    time.sleep(0.5)
                    question = container.find_element_by_class_name(
                        "freebirdFormviewerViewItemsItemItemTitle.exportItemTitle.freebirdCustomFont"
                    )
                except NoSuchElementException:
                    print("No question, NoSuchElementException")
                    continue
                except exceptions.StaleElementReferenceException:
                    print("No question, StaleElementReferenceException")
                    continue

                responses = container.find_elements_by_class_name(
                    "docssharedWizToggleLabeledLabelText"
                )
                extracted_text = [response.text for response in responses]
                questionnaire[question.text] = extracted_text

                # writing when compulsory
                content_areas = driver.find_elements_by_class_name(
                    "quantumWizTextinputSimpleinputInput.exportInput"
                )
                for content_area in content_areas:
                    skip = ["Document title", "Titre du document", "Adresse e-mail valide"]
                    if content_area.get_attribute("aria-label") in skip and not content_area.get_attribute("aria-label").isspace():
                        print("We skip content_area.get_attribute(\"aria-label\"): ", content_area.get_attribute("aria-label"))
                    else:
                        print("We also skip content_area.get_attribute(\"aria-label\"): ", content_area.get_attribute("aria-label"))
                        content_area.send_keys("10102015")

                content_areas = driver.find_elements_by_class_name(
                    "quantumWizTextinputPaperinputInput.exportInput"
                )
                for content_area in content_areas:
                    if content_area.get_attribute("type") == "date" and not content_area.get_attribute("type").isspace():
                        condition = content_area.get_attribute("type")
                        if condition == "date":
                            content_area.send_keys("10102015")
                    elif content_area.get_attribute("max") and not content_area.get_attribute("max").isspace():
                        max = content_area.get_attribute("max")
                        content_area.send_keys(max)
                    elif content_area.get_attribute("aria-label") and not content_area.get_attribute("aria-label").isspace():
                        condition = content_area.get_attribute("aria-label")
                        print("content_area.get_attribute(\"aria-label\"): ", content_area.get_attribute("aria-label"))
                        if condition == "State (Two letter Abbreviation)":
                            content_area.send_keys("CA")
                        else:
                            content_area.send_keys("10102015")

                for content_area in content_areas:
                    skip = ["Document title", "Titre du document", "Adresse e-mail valide"]
                    if content_area.get_attribute("aria-label") in skip and not content_area.get_attribute("aria-label").isspace():
                        print("content_area.get_attribute(\"aria-label\"): ", content_area.get_attribute("aria-label"))
                    else:
                        print("content_area.get_attribute(\"aria-label\"): ", content_area.get_attribute("aria-label"))
                        content_area.send_keys("10102015")
                btns_answers = driver.find_elements_by_css_selector(".appsMaterialWizToggleRadiogroupElContainer")
                for btn_answer in btns_answers:
                    try:
                        driver.execute_script('arguments[0].scrollIntoView(true);', btn_answer)
                        btn_answer.click()
                    except ElementNotInteractableException:
                        pass
                    except exceptions.ElementClickInterceptedException:
                        continue

                # long answers
                content_areas = driver.find_elements_by_class_name(
                    "quantumWizTextinputPapertextareaInput.exportTextarea"
                )
                for content_area in content_areas:
                    content_area.send_keys("This restaurant is really good! Me and my boyfriend went there on our holiday \
                        we had dinner there at 3 of February food was 100% And the service vas 150% And i really want to thank "
                                           "\Asie for a really good service as for his coworkers. We highly recommended \
                                           this restaurant!")

                # check boxes
                btn_check_boxes = driver.find_elements_by_class_name(
                    "docssharedWizToggleLabeledContainer.freebirdFormviewerViewItemsCheckboxContainer"
                )
                for btn_check_box in btn_check_boxes:
                    btn_check_box.click()
                    break
                # btn_check_box[-1].click()

                # # other weird check boxes
                btn_check_boxes = driver.find_elements_by_class_name(
                    "docssharedWizToggleLabeledLabelText.exportLabel.freebirdFormviewerViewItemsCheckboxLabel"
                )
                for btn_check_box in btn_check_boxes:
                    btn_check_box.click()
                    break

                # Clicking on text. *warning* : don't work
                btn_check_boxes = driver.find_elements_by_class_name(
                    "freebirdFormviewerViewItemsCheckboxChoice"
                )
                print("btn_check_boxes: ", btn_check_boxes)
                for btn_check_box in btn_check_boxes:
                    btn_check_box.click()
                    break

                # btns[-1].click()
                next_btns = driver.find_elements_by_class_name(
                    "appsMaterialWizButtonPaperbuttonContent.exportButtonContent")
                if next_btns != []:
                    next_btns[-1].click()
                    next_btns = []
                else:
                    continue

    print("questionnaire: ", questionnaire)
    return questionnaire


def get_backend_questionnaire():
    print("backend questionnaire")
    # sometimes we start with something that looks like a published page with a "next" button
    # if driver.find_element_by_id('identifierNext'):
    #     driver.find_element_by_id('identifierNext').click()
    questionnaire = {}
    # I get all the cards with questions and answers inside
    containers = driver.find_elements_by_class_name(
        "freebirdFormeditorViewItemContentWrapper"
    )
    driver.set_page_load_timeout(30)
    driver.set_script_timeout(30)
    # for each card
    for container in containers:
        try:
            # Get the question
            # question = container.find_element_by_class_name(
            #     "appsMaterialWizTextinputTextareaInput.exportTextarea"
            # )
            question = container.find_element_by_css_selector(".exportTextarea[aria-label='Intitulé de la question']")
        except NoSuchElementException:
            print("NoSuchElementException in " + str(container))
            continue
        # Get the answers
        responses = container.find_elements_by_css_selector(
            ".quantumWizTextinputSimpleinputInput.exportInput"
        )
        extracted_responses = [response.get_attribute("data-initial-value") for response in responses]
        questionnaire[question.text] = extracted_responses

        driver.set_page_load_timeout(30)
        driver.set_script_timeout(30)

    print("questionnaire backend: ", questionnaire)
    return questionnaire


def extract(driver, df, survey):
    count_questionnaires = 0
    result = []
    count_not_empty = 0.0
    print("survey: ", survey)
    # df = pd.DataFrame({"Link":["https://docs.google.com/forms/d/1_iRBtfJANF5MGWqoIMQUxBdeuAa4ePMltdIsVRmdY5Y/edit?usp=sharing"],
    #                   "Task":["Hotel ABC"]}) # debugging StaleElementReferenceException
    for location, task in zip(df.Link, df.Task):
        if task == survey:
            print("location: ", location)
            questionnaire = {}
            if "docs.google.com" in str(location):
                count_questionnaires +=1.0
                driver.get(location)
                # test if it is a published version
                try:
                    ask_access_btn = driver.find_elements_by_class_name(
                        "freebirdFormviewerViewNavigationHeaderButtonContent"
                        )
                except exceptions.UnexpectedAlertPresentException:
                    print("UnexpectedAlertPresentException")
                    get_published_questionnaire
                if ask_access_btn:
                    questionnaire = get_published_questionnaire()
                else:
                    questionnaire = get_backend_questionnaire()
            if questionnaire not in [{}, {'': ''}]:
                count_not_empty += 1.0
            result.append({str(count_questionnaires): questionnaire})
            count_questionnaires += 1
    print("count_questionnaires: ", count_questionnaires)
    if count_questionnaires != 0:
        print("count_not_empty/count_questionnaires: ", count_not_empty/count_questionnaires)
    return result


if __name__ == '__main__':
    """ Need to log on to the google account to access certain questionaires. Also Setup chromedriver to run in 
    headless state """
    driver = setup_chromedriver()
    published_questionnaires = [] # tracking published ones
    """ Load CSV download of Google Sheet """
    df, columns = load_data()
    surveys = ['Hotel ABC', "Airline XYZ", "The Ministry of Tourism of France"]
    for survey in surveys:
        result = extract(driver, df, survey)
        survey = survey.replace(" ", "_")
        pickle_out = open("applicant" + survey + "_c.p", "wb")
        pickle.dump(result, pickle_out)
        pickle_out.close()
    print("published_questionnaires: ", published_questionnaires)

The csv I'm loading is:

Link, Task
https://docs.google.com/forms/d/1j0nk_Oo-_pfJBM4UcWITDPXT97-qX5mZpb3uVyKS3CA/edit?usp=sharing,Hotel ABC



Aucun commentaire:

Enregistrer un commentaire