I'm trying to write a program that scans PDFs (downloaded from a website, with selectable text) and highlights specific discrepancies. I can make it work for specific "bad words" and "good words", but I am stuck on how to make it find missing check boxes. The check boxes are no longer interactive form fields in the PDF: [image: enter image description here]
Here is my code for everything else so far:
import os
import fitz
# Folder that holds the PDFs to scan.
source_folder = r"C:\Users\Sserb\Desktop\Test Files"

# Terms that must each occur at least once somewhere in every PDF file
# (anywhere in the file, not on every page).
good_terms = [
    "trend",
    "decrease",
    "increase",
]

# Terms that are highlighted wherever they occur.
bad_terms = [
    "school",
    "academic",
    "homework",
]

# Per-file results: [file_name, pages with highlights] and
# [file_name, good terms that were never found].
highlight_summary = []
good_term_summary = []

# Only directory entries whose name ends in ".pdf" are scanned.
list_files = os.listdir(source_folder)
pdf_files = [entry for entry in list_files if entry.endswith(".pdf")]
# Scan every PDF: highlight each occurrence of a bad term, record which pages
# were highlighted, and record which good terms never appear in the file.
for file_name in pdf_files:
    # READ IN PDF
    full_filename = os.path.join(source_folder, file_name)
    # The context manager closes the document on every path, including
    # after a save and when an exception is raised mid-scan.
    with fitz.open(full_filename) as doc:
        good_terms_not_found = good_terms.copy()
        list_hl_pages = []
        for page_num, page in enumerate(doc, 1):
            # SEARCH + HIGHLIGHT
            page_has_highlight = False
            for text in bad_terms:
                for inst in page.search_for(text):
                    # snake_case name, matching search_for; the camelCase
                    # alias addHighlightAnnot is gone from current PyMuPDF.
                    highlight = page.add_highlight_annot(inst)
                    highlight.update()
                    page_has_highlight = True
            if page_has_highlight:
                list_hl_pages.append(page_num)

            # Good terms must be found at least once per file: keep only
            # the ones that are still missing after this page.
            good_terms_not_found = [
                word for word in good_terms_not_found
                if not page.search_for(word)
            ]

        highlight_summary.append([file_name, list_hl_pages])
        if good_terms_not_found:
            good_term_summary.append([file_name, good_terms_not_found])

        # OUTPUT: only files that received highlights are written out.
        if list_hl_pages:
            output_dir = os.path.join(source_folder, "output")
            # save() does not create missing folders.
            os.makedirs(output_dir, exist_ok=True)
            # Touch only the extension, not a ".pdf" elsewhere in the name.
            stem, ext = os.path.splitext(file_name)
            out_file = f"{stem}-errors{ext}"
            doc.save(
                os.path.join(output_dir, out_file),
                garbage=4,
                deflate=True,
                clean=True,
            )
# Report the files that are missing one or more good terms, on the console
# and in a text file. (highlight_summary is collected but not reported;
# print it here when debugging.)
print(good_term_summary)

output_folder = r"C:\Users\Sserb\Desktop\Test Files\output"
# The folder may not exist yet, e.g. when no PDF needed highlighting.
os.makedirs(output_folder, exist_ok=True)
summary_path = os.path.join(output_folder, 'outputfile.txt')
# `with` closes the file even if the write fails; explicit UTF-8 keeps
# non-ASCII file names from failing under the platform default codec.
with open(summary_path, 'w', encoding="utf-8") as summary_file:
    summary_file.write(str(good_term_summary) + '\n')
Aucun commentaire:
Enregistrer un commentaire