Python:从多个 PowerPoint 文件中提取文本时,如何按文件分开结果
我有一个从 PowerPoint 文件中提取文本的函数。但是,它的输出是把所有 PowerPoint 文件的全部文本放进同一个大列表。如何把这些文本分开,使我提取的两个 PowerPoint 文件最终各自对应一个文本列表?(标签:python, python-3.7)
# Module-level accumulator: every call to pptx_collect() appends to this
# same list, so its contents persist between calls.
text_runs = []


def pptx_collect(x):
    """Collect the text of every run found in the presentations.

    This is the code as posted in the question, with its structure restored.

    NOTE: the parameter ``x`` is never used. The loop walks the module-level
    ``pptx_files`` on every call and appends to the module-level
    ``text_runs``, so each call returns the text of all files in one flat
    list.

    Args:
        x: Ignored by this version.

    Returns:
        The shared module-level ``text_runs`` list.
    """
    for file in pptx_files:
        prs = Presentation(file)
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only shapes that carry a text frame have paragraphs to read.
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        text_runs.append(run.text)
    return(text_runs)
def Powerpoint(pptx_files):
    """Record keyword matches and file metadata for each PowerPoint file.

    For every entry in ``pptx_files`` the text runs returned by
    ``pptx_collect`` are searched with the module-level patterns ``inp``,
    ``inp1`` and ``word_search``, and one value per file is appended to each
    list in the module-level ``file_dict``.

    Args:
        pptx_files: Iterable of path objects; each must provide ``.name``
            and ``.stat()`` (presumably ``pathlib.Path`` -- confirm with
            the caller).

    Returns:
        The module-level ``file_dict`` after the new entries were appended.
    """
    patterns = (inp, inp1, word_search)
    for name in pptx_files:
        IP_list = pptx_collect(name)

        # Accumulate matches over *all* runs of the file. Rebinding the
        # result on every iteration would keep only the last run's matches
        # and leave the names undefined for a file without any text runs.
        matches = []
        for item in IP_list:
            for pattern in patterns:
                matches.extend(re.findall(pattern, item))

        # One stat() call serves both timestamps.
        # NOTE: st_ctime is the creation time on Windows but the last
        # metadata change on Unix.
        stats = name.stat()

        file_dict['keyword'].append(matches)
        file_dict['name'].append(name.name)
        file_dict['created'].append(time.ctime(stats.st_ctime))
        file_dict['modified'].append(time.ctime(stats.st_mtime))
        file_dict['path'].append(name)
        # <--- This is where the problem is: the list stored here is
        # whatever pptx_collect() returned for this file.
        file_dict["content"].append(IP_list)
    return file_dict


Powerpoint(pptx_files)
我想得到:
['Billy’s ', 'pii', 'Just a test', '04/15/1991', '04.15.1991', '234-23-6456-billys ', 'SSN', 'Address: 58 bonnie ', 'rd', ', Boston, mass 07037', 'Text from second 2 ']
['Text from second ', 'powerpoint', ' ', '(second page)', 'Text from second 2 ', 'Text from second ', 'powerpoint', ' ', '(second page)', 'FOUO Test', 'Secret', 'This is a test to check ', 'for keywords']
你的 pptx_collect 函数在每次调用时都会遍历 pptx_files 中的所有文件,而不是只处理传入的参数 x。试试这个:
def pptx_collect(x):
    """Return the text of every run in a single PowerPoint file.

    Args:
        x: The presentation to read; passed unchanged to ``Presentation``.

    Returns:
        A new flat list of ``run.text`` strings in slide, shape, paragraph
        and run order.
    """
    # Start from a fresh list on every call. Appending to a module-level
    # list would make each call also return the text gathered by earlier
    # calls.
    text_runs = []
    prs = Presentation(x)
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes that carry a text frame have paragraphs to read.
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text_runs.extend(run.text for run in paragraph.runs)
    return text_runs
你的 pptx_collect 函数在每次调用时都会遍历 pptx_files 中的所有文件,而不是只处理传入的参数 x。试试这个:
def pptx_collect(x):
    """Return the text of every run in a single PowerPoint file.

    Args:
        x: The presentation to read; passed unchanged to ``Presentation``.

    Returns:
        A new flat list of ``run.text`` strings in slide, shape, paragraph
        and run order.
    """
    # Start from a fresh list on every call. Appending to a module-level
    # list would make each call also return the text gathered by earlier
    # calls.
    text_runs = []
    prs = Presentation(x)
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes that carry a text frame have paragraphs to read.
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text_runs.extend(run.text for run in paragraph.runs)
    return text_runs
我还建议在函数内部定义 text_runs,这样每次调用都会得到一个新的列表。
我还建议在函数内部定义 text_runs,这样每次调用都会得到一个新的列表。
def pptx_collect(x):
    """Return one list of run texts per file in the global ``pptx_files``.

    NOTE(review): ``x`` is kept so existing call sites keep working, but it
    is not used. The function iterates over the module-level ``pptx_files``.

    Args:
        x: Unused.

    Returns:
        A new list with one inner list per file, in ``pptx_files`` order.
        Each inner list holds that file's ``run.text`` strings.
    """
    # Built locally so repeated calls do not pile duplicate entries onto a
    # shared module-level list.
    text_runs = []
    for file in pptx_files:
        inner_list = []
        prs = Presentation(file)
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only shapes that carry a text frame have paragraphs to read.
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    inner_list.extend(run.text for run in paragraph.runs)
        # One inner list per file keeps each presentation's text separate.
        text_runs.append(inner_list)
    return text_runs