dimanche 29 août 2021

Python - Check Box Status Program

I'm trying to write a program that scans PDFs (downloaded from a website, with selectable text) and highlights specific discrepancies. It already works for specific "bad words" and "good words", but I am stuck on how to make it find missing check boxes. The check boxes are no longer interactive form fields in the PDF: [image: enter image description here]

Here is my code for everything else so far:

import os

import fitz

# Folder that is scanned for PDF files.
source_folder = r"C:\Users\Sserb\Desktop\Test Files"
list_files = os.listdir(source_folder)

# Words that should be in every pdf file (not every page).
good_terms = [
    "trend",
    "decrease",
    "increase",
]
# Words whose occurrences get highlighted wherever they appear.
bad_terms = [
    "school",
    "academic",
    "homework",
]

# Keep only the directory entries whose name ends in ".pdf" (case-sensitive).
pdf_files = []
for entry in list_files:
    if entry.endswith(".pdf"):
        pdf_files.append(entry)

# One [file_name, pages_with_highlights] entry per processed PDF.
highlight_summary = []

# One [file_name, missing_good_terms] entry per PDF lacking a good term.
good_term_summary = []

# Scan every PDF: highlight each occurrence of a bad term, record which pages
# were highlighted, and record which good terms never appear in the document.
for file_name in pdf_files:
    # READ IN PDF
    full_filename = os.path.join(source_folder, file_name)
    doc = fitz.open(full_filename)

    # try/finally so the document is released on every path, including after
    # a save and when a search or save raises.
    try:
        # Good terms are checked per document: a term is dropped from this
        # list as soon as any page contains it.
        good_terms_not_found = good_terms.copy()

        list_hl_pages = []
        for page_num, page in enumerate(doc, 1):

            # SEARCH + HIGHLIGHT
            page_flagged = False
            for text in bad_terms:
                for inst in page.search_for(text):
                    # snake_case name, matching search_for above; the old
                    # camelCase alias addHighlightAnnot is deprecated.
                    highlight = page.add_highlight_annot(inst)
                    highlight.update()
                    page_flagged = True

            # Pages are visited once and in order, so a single append per
            # flagged page keeps the list duplicate-free.
            if page_flagged:
                list_hl_pages.append(page_num)

            # Search for good terms - all must be found somewhere in the file.
            good_terms_not_found = [
                good_word
                for good_word in good_terms_not_found
                if not page.search_for(good_word)
            ]

        highlight_summary.append([file_name, list_hl_pages.copy()])
        if good_terms_not_found:
            good_term_summary.append([file_name, good_terms_not_found.copy()])

        # OUTPUT: only documents that received highlights are written out.
        if list_hl_pages:
            errors_folder = os.path.join(source_folder, "output")
            # doc.save() does not create missing directories.
            os.makedirs(errors_folder, exist_ok=True)
            # Swap only the extension; str.replace would also rewrite a
            # ".pdf" that occurs earlier in the file name.
            out_file = os.path.splitext(file_name)[0] + "-errors.pdf"
            doc.save(os.path.join(errors_folder, out_file), garbage=4, deflate=True, clean=True)
    finally:
        doc.close()

# Report the PDFs that are missing good terms: echo the summary to the
# console and write the same text to outputfile.txt in the output folder.
#print(highlight_summary)
print(good_term_summary)
output_folder=r"C:\Users\Sserb\Desktop\Test Files\output"
# The folder may not exist yet if no annotated PDF was saved into it.
os.makedirs(output_folder, exist_ok=True)
new = os.path.join(output_folder,'outputfile.txt')
value = str(good_term_summary) + '\n'
# Context manager closes the handle even if the write fails; explicit UTF-8
# keeps non-ASCII file names from failing under the platform default encoding.
with open(new, 'w', encoding="utf-8") as file:
    file.write(value)



Aucun commentaire:

Enregistrer un commentaire