Python 无法使用pymupdf搜索某些pdf
我写了一个小程序来监视一个文件夹:一旦有.pdf文件被放入该文件夹,它就会在该.pdf中搜索关键字,并输出一个新的.txt(列出页码)和一个新的pdf文件,其中只包含含有关键字的页面。它对大多数PDF都很有效,但有些PDF表现出异常行为:似乎有时它只搜索第一页,而不搜索其他页面。如果需要,我可以提供这些PDF的链接。这是我的代码:Python 无法使用pymupdf搜索某些pdf,python,pymupdf,Python,Pymupdf,我写了一个小程序来监视一个文件夹:一旦有.pdf文件被放入该文件夹,它就会在该.pdf中搜索关键字,并输出一个新的.txt(列出页码)和一个新的pdf文件,其中只包含含有关键字的页面。它对大多数PDF都很有效,但有些PDF表现出异常行为:似乎有时它只搜索第一页,而不搜索其他页面。如果需要,我可以提供这些PDF的链接。这是我的代码: import fitz, glob, os, time from watchdog.observers.polling import PollingObser
import fitz, glob, os, time
from watchdog.observers.polling import PollingObserver
from watchdog.events import PatternMatchingEventHandler
# Make the watched folder the working directory.
os.chdir("C:/test/")
# Keywords that are searched for in every PDF dropped into the folder.
s1 = ["Siphone"]
if __name__ == "__main__":
    # Only react to PDF files ...
    patterns = ["*.pdf"]
    # ... but not to the "<name>_done.pdf" results that on_created writes
    # into the same folder, otherwise every result would trigger a new run.
    ignore_patterns = ["*done.pdf"]
    ignore_directories = True
    case_sensitive = True
    # Pass the options by keyword: this is equivalent on older watchdog
    # releases and still works where the parameters are keyword-only.
    my_event_handler = PatternMatchingEventHandler(
        patterns=patterns,
        ignore_patterns=ignore_patterns,
        ignore_directories=ignore_directories,
        case_sensitive=case_sensitive,
    )
def on_created(event):
    """Search a newly created PDF for the keywords in ``s1``.

    Two files are written next to the source file:

    * ``<src_path>.txt`` -- one line per keyword, listing the 1-based
      numbers of the pages on which the keyword was found.
    * ``<src_path>_done.pdf`` -- only the pages that contain at least one
      keyword, with every hit highlighted.

    If no page contains any keyword, the ``_done.pdf`` file is not written:
    PyMuPDF raises ``ValueError: cannot save with zero pages`` when asked to
    save an empty document.

    Args:
        event: watchdog file-system event; only ``event.src_path`` is read.
    """
    print("on_created", event.src_path)
    # NOTE(review): presumably this delay gives the producer time to finish
    # writing the file before it is opened -- confirm 2 s is enough for
    # large PDFs.
    time.sleep(2)

    txt = "%s.txt" % event.src_path
    output = "%s_done.pdf" % event.src_path

    # NOTE(review): the camelCase PyMuPDF names used below (searchFor,
    # insertPDF, addHighlightAnnot) are kept to match the installed version;
    # newer releases spell them search_for, insert_pdf, add_highlight_annot.
    pdf_document = fitz.open(event.src_path)
    try:
        # 0-based indices of pages that contain at least one keyword.
        matching_pages = set()
        with open(txt, "w") as report:
            for word in s1:
                report.write("%s:" % word)
                for page_number, page in enumerate(pdf_document):
                    if page.searchFor(word):
                        matching_pages.add(page_number)
                        # The report uses human-friendly 1-based numbers.
                        report.write("%i," % (page_number + 1))
                report.write("\n")

        test_list = sorted(matching_pages)
        print(test_list)
        if not test_list:
            # Nothing to extract; saving an empty document would raise.
            print("no keyword found in", event.src_path)
            return

        doc = fitz.open()
        try:
            for p in test_list:
                doc.insertPDF(pdf_document, from_page=p, to_page=p)
            # Highlight every occurrence of every keyword in the extract.
            for page in doc:
                for word in s1:
                    for inst in page.searchFor(word):
                        page.addHighlightAnnot(inst)
            doc.save(output)
        finally:
            doc.close()
    finally:
        pdf_document.close()
# Route "file created" events of the handler to on_created.
my_event_handler.on_created = on_created
path = "C:/test/"
go_recursively = True
my_observer = PollingObserver()
my_observer.schedule(my_event_handler, path, recursive=go_recursively)
my_observer.start()
try:
    # Events are dispatched on the observer's own thread; the main thread
    # only has to stay alive until the user interrupts the script.
    while True:
        time.sleep(5)
except KeyboardInterrupt:
    # Ctrl+C is the intended way to quit, so it is not treated as an error.
    pass
finally:
    # Stop outside the loop so the script actually terminates; with the
    # try/except inside the loop, the loop kept running after Ctrl+C.
    my_observer.stop()
    my_observer.join()
某些PDF会出现以下错误(我猜测是pymupdf无法正确读取文件,只搜索了第0页)。
关键字在这些PDF中多次出现,但程序却找不到:
Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\threading.py", line 954, in _bootstrap_inner
self.run()
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 199, in run
self.dispatch_events(self.event_queue, self.timeout)
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\observers\api.py", line 372, in dispatch_events
handler.dispatch(event)
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 382, in dispatch
super().dispatch(event)
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\watchdog\events.py", line 261, in dispatch
{
File "C:\all\pdf\final_pdf_suche.py", line 51, in on_created
doc.save(output)
File "C:\Users\mo\AppData\Local\Programs\Python\Python39\lib\site-packages\fitz\fitz.py", line 4206, in save
raise ValueError("cannot save with zero pages")
ValueError: cannot save with zero pages