Python 无法使用pymupdf搜索某些pdf_Python_Pymupdf

Python 无法使用pymupdf搜索某些pdf

python

Python 无法使用pymupdf搜索某些pdf,python,pymupdf,Python,Pymupdf,我写了一个小程序来监视一个文件夹，一旦一个.pdf文件被放入该文件夹，它就会在.pdf中搜索关键字，并输出一个新的.txt（列出页码）和一个新的pdf文件，其中只包含含有关键字的页面。它对大多数PDF都很有效，但有些PDF显示出异常行为。似乎有时它只搜索第一页，而不搜索其他页面。如果需要，我可以提供这些PDF的链接。这是我的代码： import fitz, glob, os, time from watchdog.observers.polling import PollingObser

我写了一个小程序来监视一个文件夹，一旦一个.pdf文件被放入该文件夹，它就会在.pdf中搜索关键字，并输出一个新的.txt（列出页码）和一个新的pdf文件，其中只包含含有关键字的页面。

它对大多数PDF都很有效，但有些PDF显示出异常行为。似乎有时它只搜索第一页，而不搜索其他页面。如果需要，我可以提供这些PDF的链接

这是我的代码：

import fitz, glob, os, time
from watchdog.observers.polling import PollingObserver
from watchdog.events import PatternMatchingEventHandler
# Work relative to the watched folder.
os.chdir("C:/test/")
# Keywords searched for in every incoming PDF.
s1 = ["Siphone"]

if __name__ == "__main__":
    # NOTE(review): the handler is only created when the file runs as a
    # script, but it is used unconditionally further down; importing this
    # file as a module would therefore fail with a NameError.
    patterns = ["*.pdf"]
    # Skip the "<name>_done.pdf" files the handler itself produces, so the
    # output is not picked up and processed again.
    ignore_patterns = ["*done.pdf"]
    ignore_directories = True
    case_sensitive = True
    # Keyword arguments: newer watchdog releases declare these parameters
    # keyword-only, and older releases accept keywords as well.
    my_event_handler = PatternMatchingEventHandler(
        patterns=patterns,
        ignore_patterns=ignore_patterns,
        ignore_directories=ignore_directories,
        case_sensitive=case_sensitive,
    )

def on_created(event):
    """Search a newly created PDF for the keywords in ``s1`` and write results.

    Two files are written next to the source file:

    * ``<src>.txt`` -- one line per keyword: ``keyword:`` followed by the
      1-based number of every page containing it, each followed by a comma.
    * ``<src>_done.pdf`` -- only the pages that contain at least one keyword,
      with every hit highlighted. This file is NOT written when no page
      matches, because PyMuPDF raises ``ValueError("cannot save with zero
      pages")`` for an empty document.

    Args:
        event: The watchdog event; only ``event.src_path`` is read.
    """
    print("on_created", event.src_path)
    # Presumably gives the program that created the file time to finish
    # writing it before it is opened here.
    time.sleep(2)

    txt = "%s.txt" % event.src_path
    output = "%s_done.pdf" % event.src_path

    pdf_document = fitz.open(event.src_path)
    try:
        # 0-based indices of all pages containing at least one keyword.
        hit_pages = set()
        with open(txt, "w") as f:
            for words in s1:
                f.write("%s:" % words)
                for page_index, page in enumerate(pdf_document):
                    # Search once per page and reuse the result.
                    if page.searchFor(words):
                        hit_pages.add(page_index)
                        # The report uses 1-based page numbers.
                        f.write("%i," % (page_index + 1))
                f.write("\n")

        test_list = sorted(hit_pages)
        print(test_list)

        if not test_list:
            # Nothing matched (e.g. a scanned PDF without a text layer):
            # saving an empty document would raise, so skip the PDF output.
            print("no keyword found, not writing", output)
            return

        doc = fitz.open()
        try:
            for p in test_list:
                doc.insertPDF(pdf_document, from_page=p, to_page=p)
            for page in doc:
                for keyword in s1:
                    for inst in page.searchFor(keyword):
                        page.addHighlightAnnot(inst)
            doc.save(output)
        finally:
            doc.close()
    finally:
        pdf_document.close()

# Route "file created" events to the PDF search handler.
my_event_handler.on_created = on_created

path = "C:/test/"
go_recursively = True
my_observer = PollingObserver()
my_observer.schedule(my_event_handler, path, recursive=go_recursively)
my_observer.start()
# Keep the main thread alive while the observer thread does the work.
# The try wraps the whole loop so that Ctrl-C actually leaves it; catching
# KeyboardInterrupt inside the loop without a break would keep looping
# forever after the observer has been stopped.
try:
    while True:
        time.sleep(5)
except KeyboardInterrupt:
    my_observer.stop()
my_observer.join()

以下错误出现在某些PDF上（我假设pymupdf无法正确读取文件，仅搜索第0页）：

该词在pdf中多次出现，但找不到

Exception in thread Thread-1:
Traceback (most recent call last):
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\threading.py", line 954, in _bootstrap_inner
   self.run()
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 199, in run
   self.dispatch_events(self.event_queue, self.timeout)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 372, in dispatch_events
   handler.dispatch(event)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 382, in dispatch
   super().dispatch(event)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 261, in dispatch
   {
 File "C:\all\pdf\final_pdf_suche.py", line 51, in on_created
   doc.save(output)
 File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\fitz\fitz.py", line 4206, in save
   raise ValueError("cannot save with zero pages")
ValueError: cannot save with zero pages