Google cloud dataflow 云数据流-数据流如何实现并行性?

Google cloud dataflow 云数据流-数据流如何实现并行性?,google-cloud-dataflow,apache-beam,Google Cloud Dataflow,Apache Beam,我的问题是,在幕后,对于基于元素的Beam-DoFn(ParDo),云数据流如何并行工作负载?例如,在我的ParDO中,我为一个元素向外部服务器发送一个http请求。我用了30个工人,每个人有4vCPU 这是否意味着每个工作线程最多有4个线程 这是否意味着每个工作者只需要4个http连接,或者如果我让它们保持活动状态以获得最佳性能,则可以建立4个http连接 除了使用更多内核或更多工作线程,我如何调整并行级别 使用我当前的设置(30*4vCPU工作进程),我可以在http服务器上建立大约120个

我的问题是:在底层,对于逐元素处理的 Beam DoFn(ParDo),Cloud Dataflow 是如何将工作负载并行化的?例如,在我的 ParDo 中,我会为每个元素向外部服务器发送一个 http 请求。我使用了 30 个工作器(worker),每个工作器有 4 个 vCPU。

  • 这是否意味着每个工作器上最多只有 4 个线程?
  • 这是否意味着每个工作器只需要(或者说只能建立)4 个 http 连接——假设我让这些连接保持活动状态以获得最佳性能?
  • 除了使用更多内核或更多工作器之外,我还能如何调整并行度?
  • 使用我当前的设置(30 个 4 vCPU 的工作器),我可以在 http 服务器上建立大约 120 个 http 连接。但服务器和工作器的资源使用率都非常低。基本上,我希望通过每秒发送更多的请求来让它们更充分地工作。我该怎么办?
  • 下面的代码片段展示了我的实现:

    public class NewCallServerDoFn extends DoFn<PreparedRequest,KV<PreparedRequest,String>> {
    
    
    private static final Logger Logger = LoggerFactory.getLogger(ProcessReponseDoFn.class);
    
    private static PoolingHttpClientConnectionManager _ConnManager = null;
    private static CloseableHttpClient _HttpClient = null;
    private static HttpRequestRetryHandler _RetryHandler = null;
    private static  String[] _MapServers = MapServerBatchBeamApplication.CONFIG.getString("mapserver.client.config.server_host").split(",");
    
    @Setup
    public void setupHttpClient(){
    
        Logger.info("Setting up HttpClient");
    
       //Question: the value of maxConnection below is actually 10, but with 30 worker machines, I can only see 115 TCP connections established on the server side. So this setting doesn't really take effect as I expected.....
    
        int maxConnection = MapServerBatchBeamApplication.CONFIG.getInt("mapserver.client.config.max_connection");
        int timeout = MapServerBatchBeamApplication.CONFIG.getInt("mapserver.client.config.timeout");
    
        _ConnManager = new PoolingHttpClientConnectionManager();
    
        for (String mapServer : _MapServers) {
            HttpHost serverHost = new HttpHost(mapServer,80);
            _ConnManager.setMaxPerRoute(new HttpRoute(serverHost),maxConnection);
        }
    
        // config timeout
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(timeout)
                .setConnectionRequestTimeout(timeout)
                .setSocketTimeout(timeout).build();
    
        // config retry
        _RetryHandler = new HttpRequestRetryHandler() {
    
            public boolean retryRequest(
                    IOException exception,
                    int executionCount,
                    HttpContext context) {
    
                Logger.info(exception.toString());
                Logger.info("try request: " + executionCount);
    
                if (executionCount >= 5) {
                    // Do not retry if over max retry count
                    return false;
                }
                if (exception instanceof InterruptedIOException) {
                    // Timeout
                    return false;
                }
                if (exception instanceof UnknownHostException) {
                    // Unknown host
                    return false;
                }
                if (exception instanceof ConnectTimeoutException) {
                    // Connection refused
                    return false;
                }
                if (exception instanceof SSLException) {
                    // SSL handshake exception
                    return false;
                }
                return true;
            }
    
        };
    
        _HttpClient = HttpClients.custom()
                                .setConnectionManager(_ConnManager)
                                .setDefaultRequestConfig(requestConfig)
                                .setRetryHandler(_RetryHandler)
                                .build();
    
        Logger.info("Setting up HttpClient is done.");
    
    }
    
    @Teardown
    public void tearDown(){
        Logger.info("Tearing down HttpClient and Connection Manager.");
        try {
            _HttpClient.close();
            _ConnManager.close();
        }catch (Exception e){
            Logger.warn(e.toString());
        }
        Logger.info("HttpClient and Connection Manager have been teared down.");
    }
    
    
    
    
    @ProcessElement
    public void processElement(ProcessContext c) {
    
        PreparedRequest request = c.element();
    
        if(request == null)
            return;
    
        String response="{\"my_error\":\"failed to get response from map server with retries\"}";
    
    
        String chosenServer = _MapServers[request.getHardwareId() % _MapServers.length];
    
        String parameter;
        try {
            parameter = URLEncoder.encode(request.getRequest(),"UTF-8");
        } catch (UnsupportedEncodingException e) {
            Logger.error(e.toString());
    
            return;
        }
    
        StringBuilder sb = new StringBuilder().append(MapServerBatchBeamApplication.CONFIG.getString("mapserver.client.config.api_path"))
                .append("?coordinates=")
                .append(parameter);
    
        HttpGet getRequest = new HttpGet(sb.toString());
        HttpHost host = new HttpHost(chosenServer,80,"http");
        CloseableHttpResponse httpRes;
    
        try {
            httpRes = _HttpClient.execute(host,getRequest);
            HttpEntity entity = httpRes.getEntity();
            if(entity != null){
                try
                {
                    response = EntityUtils.toString(entity);
                }finally{
                    EntityUtils.consume(entity);
                    httpRes.close();
                }
            }
        }catch(Exception e){
            Logger.warn("failed by get response from map server with retries for " + request.getRequest());
        }
    
        c.output(KV.of(request, response));
    
    }
    }
    
    public类NewCallServerDoFn扩展了DoFn{
    私有静态最终记录器Logger=LoggerFactory.getLogger(ProcessReponseDoFn.class);
    私有静态池httpclientconnectionmanager_ConnManager=null;
    私有静态CloseableHttpClient _HttpClient=null;
    私有静态HttpRequestRetryHandler\u RetryHandler=null;
    私有静态字符串[]\u MapServers=MapServerBatchBeamApplication.CONFIG.getString(“mapserver.client.CONFIG.server\u host”).split(“,”);
    @设置
    public void setupHttpClient(){
    Logger.info(“设置HttpClient”);
    //问:下面maxConnection的值实际上是10,但对于30台工作计算机,我只能看到服务器端建立了115个TCP连接。因此,此设置并没有像我预期的那样真正生效。。。。。
    int maxConnection=MapServerBatchBeamApplication.CONFIG.getInt(“mapserver.client.CONFIG.max_connection”);
    int timeout=MapServerBatchBeamApplication.CONFIG.getInt(“mapserver.client.CONFIG.timeout”);
    _ConnManager=new-poollighttpclientconnectionmanager();
    对于(字符串映射服务器:\ u映射服务器){
    HttpHost serverHost=新的HttpHost(mapServer,80);
    _setMaxPerRoute(新的HttpRoute(服务器主机),maxConnection);
    }
    //配置超时
    RequestConfig RequestConfig=RequestConfig.custom()
    .setConnectTimeout(超时)
    .setConnectionRequestTimeout(超时)
    .setSocketTimeout(超时).build();
    //配置重试
    _RetryHandler=new-HttpRequestRetryHandler(){
    公共布尔返回请求(
    IOException异常,
    int executionCount,
    HttpContext(上下文){
    Logger.info(exception.toString());
    info(“try请求:+executionCount”);
    如果(执行次数>=5){
    //如果超过最大重试次数,请不要重试
    返回false;
    }
    if(中断的异常实例EDIoException){
    //超时
    返回false;
    }
    if(未知后异常的异常实例){
    //未知宿主
    返回false;
    }
    if(ConnectTimeoutException的异常实例){
    //拒绝连接
    返回false;
    }
    if(SSLexException的异常实例){
    //SSL握手异常
    返回false;
    }
    返回true;
    }
    };
    _HttpClient=HttpClients.custom()
    .setConnectionManager(_-ConnManager)
    .setDefaultRequestConfig(requestConfig)
    .setRetryHandler(_RetryHandler)
    .build();
    info(“设置HttpClient已完成”);
    }
    @拆卸
    公共无效拆卸(){
    info(“拆掉HttpClient和连接管理器”);
    试一试{
    _HttpClient.close();
    _ConnManager.close();
    }捕获(例外e){
    Logger.warn(例如toString());
    }
    info(“HttpClient和连接管理器已被删除”);
    }
    @过程元素
    公共void processElement(ProcessContext c){
    PreparedRequest请求=c.element();
    if(请求==null)
    返回;
    String response=“{\”我的错误\“:\”无法通过重试\“}”从映射服务器获取响应;
    字符串chosenServer=_-MapServers[request.getHardwareId()%_-MapServers.length];
    字符串参数;
    试一试{
    参数=URLEncoder.encode(request.getRequest(),“UTF-8”);
    }捕获(不支持的编码异常e){
    Logger.error(例如toString());
    返回;
    }
    StringBuilder sb=new StringBuilder().append(MapServerBatchBeamApplication.CONFIG.getString(“mapserver.client.CONFIG.api_path”))
    .append(“?坐标=”)
    .append(参数);
    HttpGet getRequest=newhttpget(sb.toString());
    HttpHost主机=新的HttpHost(chosenServer,80,“http”);
    可关闭的httpresponse httpresponse;
    试一试{
    httpRes=_HttpClient.execute(主机,getRequest);
    HttpEntity entity=httpRes.getEntity();
    如果(实体!=null){
    尝试
    {
    response=EntityUtils.toString(实体);
    }最后{
    EntityUtils.consume(实体);
    httpRes.close();
    }
    }
    }捕获(例外e){
    Logger.warn(“从映射服务器获取响应并重试”+request.getRequest())失败);
    }
    c、 输出(请求、响应的千伏);
    }
    }
    
  • 是的,依据这个回答。
  • 不,您可以建立更多连接。根据我的另一个回答,您可以使用异步 http 客户端来发起更多并发请求。正如该回答所述,您需要收集这些异步调用的结果,并在任意一个 @ProcessElement 或 @FinishBundle 中同步输出
  • 见第2条
  • 由于您的资源使用率较低,这表明工作器将大部分时间花在等待(请求响应)上