Amazon Web Services:AWS Textract - UnsupportedDocumentException(不支持的文档异常)
标签:amazon-web-services、amazon-textract(Amazon Web Services、Amazon Textract)
在使用 boto3 for Python 调用 AWS Textract 时遇到了问题。代码:(出错的调用见下方回溯)。下面是 AWS 的凭证和配置文件:
niranjan@niranjan:~$ cat ~/.aws/credentials
[default]
aws_access_key_id=my_access_key_id
aws_secret_access_key=my_secret_access_key
niranjan@niranjan:~$ cat ~/.aws/config
[default]
region=eu-west-1
我得到了如下异常:
---------------------------------------------------------------------------
UnsupportedDocumentException Traceback (most recent call last)
<ipython-input-11-f52c10e3f3db> in <module>
14
15 # Call Amazon Textract
---> 16 response = textract.detect_document_text(Document={'Bytes': imageBytes})
17
18 #print(response)
~/venv/lib/python3.7/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
314 "%s() only accepts keyword arguments." % py_operation_name)
315 # The "self" in this scope is referring to the BaseClient.
--> 316 return self._make_api_call(operation_name, kwargs)
317
318 _api_call.__name__ = str(py_operation_name)
~/venv/lib/python3.7/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
624 error_code = parsed_response.get("Error", {}).get("Code")
625 error_class = self.exceptions.from_code(error_code)
--> 626 raise error_class(parsed_response, operation_name)
627 else:
628 return parsed_response
UnsupportedDocumentException: An error occurred (UnsupportedDocumentException) when calling the DetectDocumentText operation: Request has unsupported document format
---------------------------------------------------------------------------
UnsupportedDocumentException Traceback (most recent call last)
<ipython-input-11-f52c10e3f3db> in <module>
14
15 # Call Amazon Textract
---> 16 response = textract.detect_document_text(Document={'Bytes': imageBytes})
17
18 #print(response)
~/venv/lib/python3.7/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
314 "%s() only accepts keyword arguments." % py_operation_name)
315 # The "self" in this scope is referring to the BaseClient.
--> 316 return self._make_api_call(operation_name, kwargs)
317
318 _api_call.__name__ = str(py_operation_name)
~/venv/lib/python3.7/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
624 error_code = parsed_response.get("Error", {}).get("Code")
625 error_class = self.exceptions.from_code(error_code)
--> 626 raise error_class(parsed_response, operation_name)
627 else:
628 return parsed_response
UnsupportedDocumentException: An error occurred (UnsupportedDocumentException) when calling the DetectDocumentText operation: Request has unsupported document format
我对 AWS Textract 还不太熟悉,任何帮助都将不胜感激。

回答:因为 Textract 的这个 API(DetectDocumentText)不支持 PDF 类型的文档,所以发送 PDF 时会遇到 UnsupportedDocumentException(不支持的文档格式异常)。请尝试改为发送图像文件。
如果您仍然想发送 PDF 文件,则必须使用 Textract 的异步 API。例如,使用 StartDocumentAnalysis API 开始分析,再使用 GetDocumentAnalysis API 获取分析后的文档。
DetectDocumentText:检测输入文档中的文本。Amazon Textract 可以检测文本行以及组成文本行的单词。输入文档必须是 JPEG 或 PNG 格式的图像。DetectDocumentText 在 Block 对象数组中返回检测到的文本。
请尝试此代码,并参考AWS的说明
---------------------------------------------------------------------------
UnsupportedDocumentException Traceback (most recent call last)
<ipython-input-11-f52c10e3f3db> in <module>
14
15 # Call Amazon Textract
---> 16 response = textract.detect_document_text(Document={'Bytes': imageBytes})
17
18 #print(response)
~/venv/lib/python3.7/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
314 "%s() only accepts keyword arguments." % py_operation_name)
315 # The "self" in this scope is referring to the BaseClient.
--> 316 return self._make_api_call(operation_name, kwargs)
317
318 _api_call.__name__ = str(py_operation_name)
~/venv/lib/python3.7/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
624 error_code = parsed_response.get("Error", {}).get("Code")
625 error_class = self.exceptions.from_code(error_code)
--> 626 raise error_class(parsed_response, operation_name)
627 else:
628 return parsed_response
UnsupportedDocumentException: An error occurred (UnsupportedDocumentException) when calling the DetectDocumentText operation: Request has unsupported document format
import boto3
import time
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Name (key) of the document inside that bucket.

    Returns:
        The "JobId" value of the start_document_text_detection response.
    """
    textract = boto3.client('textract')
    document_location = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        },
    }
    job = textract.start_document_text_detection(
        DocumentLocation=document_location,
    )
    return job["JobId"]
def isJobComplete(jobId):
    """Poll a Textract text-detection job until it leaves "IN_PROGRESS".

    The job is queried every 5 seconds (after an initial 5-second wait) and
    each observed status is printed.

    Args:
        jobId: Identifier of the job to poll.

    Returns:
        The last "JobStatus" string reported for the job. Note that this is
        the status itself, not a boolean.
    """
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    textract = boto3.client('textract')
    while True:
        job_status = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(job_status))
        if job_status != "IN_PROGRESS":
            return job_status
        time.sleep(5)
def getJobResults(jobId):
    """Collect every result page of a Textract text-detection job.

    Keeps requesting pages for as long as a response carries a non-empty
    'NextToken', printing the running page count after each one.

    Args:
        jobId: Identifier of the job whose results are fetched.

    Returns:
        A list of the raw get_document_text_detection responses, in the
        order they were received.
    """
    textract = boto3.client('textract')
    collected = []
    request = {'JobId': jobId}
    while True:
        page = textract.get_document_text_detection(**request)
        collected.append(page)
        print("Resultset page recieved: {}".format(len(collected)))
        token = page.get('NextToken')
        if not token:
            return collected
        # Ask for the page that follows the one just received.
        request['NextToken'] = token
# Document: the S3 object the asynchronous Textract job will read.
s3BucketName = "ki-textract-demo-docs"
documentName = "Amazon-Textract-Pdf.pdf"

jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete() returns the final status *string*, which is truthy even
# when the job failed, so the status is compared explicitly instead of being
# used as a boolean.
jobStatus = isJobComplete(jobId)
if jobStatus in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    response = getJobResults(jobId)
else:
    # No usable results: report the status and leave nothing to print.
    response = []
    print("Textract job {} ended with status: {}".format(jobId, jobStatus))

# Print detected text: one row per LINE block, wrapped in the ANSI escape
# sequences '\033[94m' ... '\033[0m'.
for resultPage in response:
    for item in resultPage["Blocks"]:
        if item["BlockType"] == "LINE":
            print('\033[94m' + item["Text"] + '\033[0m')