Amazon dynamodb 如何使用数据管道导出具有按需供应的DynamoDB表

Amazon dynamodb 如何使用数据管道导出具有按需供应的DynamoDB表,amazon-dynamodb,amazon-data-pipeline,Amazon Dynamodb,Amazon Data Pipeline,我曾经使用名为Export DynamoDB table to S3的数据管道模板将DynamoDB表导出到文件。我最近更新了我所有的DynamoDB表,使其具有按需供应,并且模板不再工作。我很确定这是因为旧模板指定了要使用的DynamoDB吞吐量的百分比,这与按需表无关 我尝试将旧模板导出到JSON,删除对吞吐量百分比消耗的引用,并创建一个新管道。然而,这是不成功的 有人能建议如何将具有吞吐量规定的旧式管道脚本转换为新的按需表脚本吗 以下是我的原始功能脚本: { "objects": [

我曾经使用名为
Export DynamoDB table to S3
的数据管道模板将DynamoDB表导出到文件。我最近将所有DynamoDB表都更新为按需(On-Demand)容量模式,此后该模板不再工作。我很确定这是因为旧模板指定了要使用的DynamoDB吞吐量百分比,而这对按需表没有意义。

我尝试将旧模板导出为JSON,删除其中对吞吐量百分比的引用,并创建了一个新管道。然而,这并没有成功。

有人能建议如何将使用预置吞吐量的旧式管道脚本转换为适用于按需表的新脚本吗?

以下是我原先可以正常运行的脚本:

{
  "objects": [
    {
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "name": "EmrClusterForBackup",
      "coreInstanceCount": "1",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-5.13.0",
      "masterInstanceType": "m3.xlarge",
      "id": "EmrClusterForBackup",
      "region": "#{myDDBRegion}",
      "type": "EmrCluster"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "maximumRetries": "2",
      "name": "TableBackupActivity",
      "step": "s3://dynamodb-emr-#{myDDBRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "EmrActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "Source DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "default": "us-east-1",
      "watermark": "us-east-1",
      "description": "Region of the DynamoDB table",
      "id": "myDDBRegion",
      "type": "String"
    }
  ],
  "values": {
    "myDDBRegion": "us-east-1",
    "myDDBTableName": "LIVE_Invoices",
    "myDDBReadThroughputRatio": "0.25",
    "myOutputS3Loc": "s3://company-live-extracts/"
  }
}
以下是我尝试更新后运行失败的脚本:

{
  "objects": [
    {
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "name": "EmrClusterForBackup",
      "coreInstanceCount": "1",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-5.13.0",
      "masterInstanceType": "m3.xlarge",
      "id": "EmrClusterForBackup",
      "region": "#{myDDBRegion}",
      "type": "EmrCluster"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "maximumRetries": "2",
      "name": "TableBackupActivity",
      "step": "s3://dynamodb-emr-#{myDDBRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName}",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "EmrActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "Source DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "default": "us-east-1",
      "watermark": "us-east-1",
      "description": "Region of the DynamoDB table",
      "id": "myDDBRegion",
      "type": "String"
    }
  ],
  "values": {
    "myDDBRegion": "us-east-1",
    "myDDBTableName": "LIVE_Invoices",
    "myOutputS3Loc": "s3://company-live-extracts/"
  }
}
下面是数据管道执行时的错误(堆栈跟踪节选):

at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:322) at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:198) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1341) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1338) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1836) at org.apache.hadoop.mapreduce.Job.submit(Job.java:1338) at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:575) at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:570) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1836) at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:570) at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java

我就这个问题向AWS提交了一张支持工单。他们的答复相当全面,我把它贴在下面。


感谢您就这个问题与我们联系

不幸的是,DynamoDB的数据管道导出/导入作业不支持DynamoDB新的按需(On-Demand)容量模式[1]。

使用按需容量的表没有预先定义的读/写容量单位,而数据管道在计算管道吞吐量时依赖于这一已定义的容量。

例如,如果您有100个RCU(读取容量单位),且管道吞吐量为0.25(25%),则有效管道吞吐量将为每秒25个读取单位(100*0.25)。 但是,在按需容量的情况下,RCU和WCU(写入容量单位)反映为0。无论管道吞吐量值如何,计算的有效吞吐量均为0

当有效吞吐量小于1时,管道将不执行

您是否需要将DynamoDB表导出到S3?

如果您导出这些表仅用于备份目的,我建议您使用DynamoDB的按需备份和恢复功能(其名称与按需容量相似)[2]。

请注意,按需备份不会影响表的吞吐量,只需几秒钟即可完成。您只需支付与备份相关的S3存储成本。 但是,客户无法直接访问这些表备份,只能将其还原到源表。如果您希望对备份数据执行分析,或将数据导入其他系统、帐户或表,则此备份方法不适用

如果您需要使用数据管道导出DynamoDB数据,那么唯一的方法是将表设置为预置(Provisioned)容量模式。

您可以手动执行此操作,或者使用AWS CLI命令将其作为活动包含在管道中[3]

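例如(按需模式也称为按请求付费模式,即PAY_PER_REQUEST;下面的容量数值仅为示例):

$ aws dynamodb update-table --table-name myTable --billing-mode PROVISIONED --provisioned-throughput ReadCapacityUnits=100,WriteCapacityUnits=100
-
$ aws dynamodb update-table --table-name myTable --billing-mode PAY_PER_REQUEST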

请注意,禁用按需容量模式后,需要等待24小时才能再次启用

=== 参考链接 ===

[1] DynamoDB按需容量(另请参阅关于不支持的服务/工具的说明):

[2] DynamoDB按需备份和恢复:


[3] DynamoDB“update-table”(更新表)的AWS CLI参考:

今年早些时候,DDB导出工具中增加了对按需表的支持:

我把该工具的较新版本上传到了S3,并更新了管道中的一些内容,使其能够正常工作:

{
  "objects": [
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "maximumRetries": "2",
      "name": "TableBackupActivity",
      "step": "s3://<your-tools-bucket>/emr-dynamodb-tools-4.11.0-SNAPSHOT.jar,org.apache.hadoop.dynamodb.tools.DynamoDBExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "EmrActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://<your-log-bucket>/",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "name": "EmrClusterForBackup",
      "coreInstanceCount": "1",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-5.26.0",
      "masterInstanceType": "m3.xlarge",
      "id": "EmrClusterForBackup",
      "region": "#{myDDBRegion}",
      "type": "EmrCluster",
      "terminateAfter": "1 Hour"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "Source DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "default": "us-east-1",
      "watermark": "us-east-1",
      "description": "Region of the DynamoDB table",
      "id": "myDDBRegion",
      "type": "String"
    }
  ],
  "values": {
    "myDDBRegion": "us-west-2",
    "myDDBTableName": "<your table name>",
    "myDDBReadThroughputRatio": "0.5",
    "myOutputS3Loc": "s3://<your-output-bucket>/"
  }
}
{
  "objects": [
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "maximumRetries": "2",
      "name": "TableBackupActivity",
      "step": "s3://<your-tools-bucket>/emr-dynamodb-tools-4.11.0-SNAPSHOT.jar,org.apache.hadoop.dynamodb.tools.DynamoDBExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "EmrActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://<your-log-bucket>/",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "name": "EmrClusterForBackup",
      "coreInstanceCount": "1",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-5.26.0",
      "masterInstanceType": "m3.xlarge",
      "id": "EmrClusterForBackup",
      "region": "#{myDDBRegion}",
      "type": "EmrCluster",
      "terminateAfter": "1 Hour"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "Source DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "default": "us-east-1",
      "watermark": "us-east-1",
      "description": "Region of the DynamoDB table",
      "id": "myDDBRegion",
      "type": "String"
    }
  ],
  "values": {
    "myDDBRegion": "us-west-2",
    "myDDBTableName": "<your table name>",
    "myDDBReadThroughputRatio": "0.5",
    "myOutputS3Loc": "s3://<your-output-bucket>/"
  }
}
主要变化:

  • EmrClusterForBackup 的发布标签(releaseLabel)更新为“emr-5.26.0”。这是获得 AWS SDK v1.11 所需要的(原文在此处被截断)
    $ aws dynamodb update-table --table-name myTable --billing-mode PAY_PER_REQUEST
    
    {
      "objects": [
        {
          "output": {
            "ref": "S3BackupLocation"
          },
          "input": {
            "ref": "DDBSourceTable"
          },
          "maximumRetries": "2",
          "name": "TableBackupActivity",
          "step": "s3://<your-tools-bucket>/emr-dynamodb-tools-4.11.0-SNAPSHOT.jar,org.apache.hadoop.dynamodb.tools.DynamoDBExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
          "id": "TableBackupActivity",
          "runsOn": {
            "ref": "EmrClusterForBackup"
          },
          "type": "EmrActivity",
          "resizeClusterBeforeRunning": "true"
        },
        {
          "failureAndRerunMode": "CASCADE",
          "resourceRole": "DataPipelineDefaultResourceRole",
          "role": "DataPipelineDefaultRole",
          "pipelineLogUri": "s3://<your-log-bucket>/",
          "scheduleType": "ONDEMAND",
          "name": "Default",
          "id": "Default"
        },
        {
          "readThroughputPercent": "#{myDDBReadThroughputRatio}",
          "name": "DDBSourceTable",
          "id": "DDBSourceTable",
          "type": "DynamoDBDataNode",
          "tableName": "#{myDDBTableName}"
        },
        {
          "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
          "name": "S3BackupLocation",
          "id": "S3BackupLocation",
          "type": "S3DataNode"
        },
        {
          "name": "EmrClusterForBackup",
          "coreInstanceCount": "1",
          "coreInstanceType": "m3.xlarge",
          "releaseLabel": "emr-5.26.0",
          "masterInstanceType": "m3.xlarge",
          "id": "EmrClusterForBackup",
          "region": "#{myDDBRegion}",
          "type": "EmrCluster",
          "terminateAfter": "1 Hour"
        }
      ],
      "parameters": [
        {
          "description": "Output S3 folder",
          "id": "myOutputS3Loc",
          "type": "AWS::S3::ObjectKey"
        },
        {
          "description": "Source DynamoDB table name",
          "id": "myDDBTableName",
          "type": "String"
        },
        {
          "default": "0.25",
          "watermark": "Enter value between 0.1-1.0",
          "description": "DynamoDB read throughput ratio",
          "id": "myDDBReadThroughputRatio",
          "type": "Double"
        },
        {
          "default": "us-east-1",
          "watermark": "us-east-1",
          "description": "Region of the DynamoDB table",
          "id": "myDDBRegion",
          "type": "String"
        }
      ],
      "values": {
        "myDDBRegion": "us-west-2",
        "myDDBTableName": "<your table name>",
        "myDDBReadThroughputRatio": "0.5",
        "myOutputS3Loc": "s3://<your-output-bucket>/"
      }
    }