Google cloud dataflow 云数据流-数据流如何实现并行性?
我的问题是:在幕后,对于基于元素的 Beam DoFn(ParDo),Cloud Dataflow 如何并行处理工作负载?例如,在我的 ParDo 中,我为每个元素向外部服务器发送一个 HTTP 请求。我使用了 30 个工作器(worker),每个工作器有 4 个 vCPU。 Google cloud dataflow 云数据流-数据流如何实现并行性?,google-cloud-dataflow,apache-beam,Google Cloud Dataflow,Apache Beam, 我的问题是:在幕后,对于基于元素的 Beam DoFn(ParDo),Cloud Dataflow 如何并行处理工作负载?例如,在我的 ParDo 中,我为每个元素向外部服务器发送一个 HTTP 请求。我使用了 30 个工作器(worker),每个工作器有 4 个 vCPU。 这是否意味着每个工作器上最多有 4 个线程? 这是否意味着每个工作器只需要(或者说,在保持连接存活以获得最佳性能的情况下,只能建立)4 个 HTTP 连接? 除了使用更多内核或更多工作器之外,我还能如何调整并行级别? 使用我当前的设置(30 个 4 vCPU 的工作器),我可以在 HTTP 服务器上建立大约 120 个连接。
/**
 * Beam {@code DoFn} that issues one blocking HTTP GET per {@code PreparedRequest} against a map
 * server and emits the request paired with the response body.
 *
 * <p>The target server is picked from the configured, comma-separated host list by
 * {@code hardwareId mod serverCount}. When the call fails (after the retries allowed by the retry
 * handler) a fixed JSON error document is emitted instead, so every non-null input element whose
 * request could be URL-encoded produces exactly one output element.
 *
 * <p>The HTTP client and its connection pool are shared by all instances of this class inside one
 * JVM. They are created by the first {@code @Setup} call and closed by the last {@code @Teardown}
 * call; instances are counted under a lock so that one instance being torn down cannot close the
 * client that sibling instances are still using.
 *
 * <p>NOTE(review): {@code max_connection} is only an upper bound on pooled connections per route.
 * Because {@code execute} blocks the calling thread, the number of connections actually open at
 * once cannot exceed the number of threads concurrently running {@code processElement} in the
 * JVM. That presumably explains seeing fewer server-side connections than
 * {@code workers * max_connection}; confirm against the runner's threading model.
 */
public class NewCallServerDoFn extends DoFn<PreparedRequest,KV<PreparedRequest,String>> {
    private static final Logger LOG = LoggerFactory.getLogger(NewCallServerDoFn.class);

    /** Payload emitted when no response could be obtained from the map server. */
    private static final String FALLBACK_RESPONSE =
            "{\"my_error\":\"failed to get response from map server with retries\"}";

    /** Port used both for the per-route pool limits and for the actual requests. */
    private static final int MAP_SERVER_PORT = 80;

    /** A request is not retried once this many executions have been attempted. */
    private static final int MAX_EXECUTIONS = 5;

    /** Map server host names, read once from the application configuration. */
    private static final String[] MAP_SERVERS = MapServerBatchBeamApplication.CONFIG
            .getString("mapserver.client.config.server_host").split(",");

    /** Guards creation/disposal of the shared client and the instance counter. */
    private static final Object CLIENT_LOCK = new Object();

    /** Number of DoFn instances in this JVM that completed setup and were not yet torn down. */
    private static int activeInstances = 0;

    private static PoolingHttpClientConnectionManager connManager = null;
    private static CloseableHttpClient httpClient = null;

    /**
     * Creates the JVM-wide HTTP client on first use and registers this instance as a user of it.
     */
    @Setup
    public void setupHttpClient() {
        synchronized (CLIENT_LOCK) {
            if (httpClient == null) {
                LOG.info("Setting up HttpClient");
                int maxConnection = MapServerBatchBeamApplication.CONFIG
                        .getInt("mapserver.client.config.max_connection");
                int timeout = MapServerBatchBeamApplication.CONFIG
                        .getInt("mapserver.client.config.timeout");

                PoolingHttpClientConnectionManager manager =
                        new PoolingHttpClientConnectionManager();
                for (String mapServer : MAP_SERVERS) {
                    HttpHost serverHost = new HttpHost(mapServer, MAP_SERVER_PORT);
                    manager.setMaxPerRoute(new HttpRoute(serverHost), maxConnection);
                }
                // The pool-wide cap must be at least the sum of the per-route caps, otherwise
                // the per-route limits configured above can never be reached.
                manager.setMaxTotal(
                        Math.max(manager.getMaxTotal(), maxConnection * MAP_SERVERS.length));

                // The same timeout is used for connecting, leasing a pooled connection and
                // waiting for data on the socket.
                RequestConfig requestConfig = RequestConfig.custom()
                        .setConnectTimeout(timeout)
                        .setConnectionRequestTimeout(timeout)
                        .setSocketTimeout(timeout)
                        .build();

                CloseableHttpClient client = HttpClients.custom()
                        .setConnectionManager(manager)
                        .setDefaultRequestConfig(requestConfig)
                        .setRetryHandler(newRetryHandler())
                        .build();

                // Publish only after everything was built, so a failure above leaves no
                // half-initialized shared state behind.
                connManager = manager;
                httpClient = client;
                LOG.info("Setting up HttpClient is done.");
            }
            activeInstances++;
        }
    }

    /**
     * Unregisters this instance and closes the shared client and connection manager once no
     * instance is using them any more. Failures while closing are logged, never propagated.
     */
    @Teardown
    public void tearDown() {
        synchronized (CLIENT_LOCK) {
            if (activeInstances > 0) {
                activeInstances--;
            }
            if (activeInstances > 0) {
                // Other instances in this JVM still use the shared client.
                return;
            }
            LOG.info("Tearing down HttpClient and Connection Manager.");
            try {
                if (httpClient != null) {
                    httpClient.close();
                }
                if (connManager != null) {
                    connManager.close();
                }
            } catch (Exception e) {
                LOG.warn(e.toString());
            } finally {
                httpClient = null;
                connManager = null;
            }
            LOG.info("HttpClient and Connection Manager have been teared down.");
        }
    }

    /**
     * Sends the element's request to its map server and outputs {@code KV(request, body)}.
     *
     * <p>Null elements, and elements whose request cannot be URL-encoded, produce no output.
     * Any failure while calling the server is logged and the fallback error document is output.
     *
     * @param c the Beam process context providing the input element and the output receiver
     */
    @ProcessElement
    public void processElement(ProcessContext c) {
        PreparedRequest request = c.element();
        if (request == null) {
            return;
        }

        String parameter;
        try {
            parameter = URLEncoder.encode(request.getRequest(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOG.error(e.toString());
            return;
        }

        // floorMod keeps the index inside [0, length) even for negative hardware ids.
        String chosenServer =
                MAP_SERVERS[Math.floorMod(request.getHardwareId(), MAP_SERVERS.length)];
        String uri = MapServerBatchBeamApplication.CONFIG
                .getString("mapserver.client.config.api_path")
                + "?coordinates=" + parameter;
        HttpGet getRequest = new HttpGet(uri);
        HttpHost host = new HttpHost(chosenServer, MAP_SERVER_PORT, "http");

        String response = FALLBACK_RESPONSE;
        // try-with-resources closes the response on every path, including the one where the
        // server returned no entity.
        try (CloseableHttpResponse httpRes = httpClient.execute(host, getRequest)) {
            HttpEntity entity = httpRes.getEntity();
            if (entity != null) {
                try {
                    response = EntityUtils.toString(entity);
                } finally {
                    // Drain the body so the connection can go back to the pool for reuse.
                    EntityUtils.consume(entity);
                }
            }
        } catch (Exception e) {
            LOG.warn("failed by get response from map server with retries for "
                    + request.getRequest(), e);
        }
        c.output(KV.of(request, response));
    }

    /** Builds the retry policy: retry I/O failures except the non-recoverable kinds below. */
    private static HttpRequestRetryHandler newRetryHandler() {
        return new HttpRequestRetryHandler() {
            @Override
            public boolean retryRequest(
                    IOException exception,
                    int executionCount,
                    HttpContext context) {
                LOG.info(exception.toString());
                LOG.info("try request: " + executionCount);
                if (executionCount >= MAX_EXECUTIONS) {
                    // Do not retry if over max retry count
                    return false;
                }
                if (exception instanceof InterruptedIOException) {
                    // Timeouts; this also covers ConnectTimeoutException, which is a subclass
                    // of InterruptedIOException in HttpClient 4.x.
                    return false;
                }
                if (exception instanceof UnknownHostException) {
                    // Unknown host
                    return false;
                }
                if (exception instanceof SSLException) {
                    // SSL handshake exception
                    return false;
                }
                return true;
            }
        };
    }
}
public类NewCallServerDoFn扩展了DoFn{
私有静态最终记录器Logger=LoggerFactory.getLogger(ProcessReponseDoFn.class);
私有静态池httpclientconnectionmanager_ConnManager=null;
私有静态CloseableHttpClient _HttpClient=null;
私有静态HttpRequestRetryHandler\u RetryHandler=null;
私有静态字符串[]\u MapServers=MapServerBatchBeamApplication.CONFIG.getString(“mapserver.client.CONFIG.server\u host”).split(“,”);
@设置
public void setupHttpClient(){
Logger.info(“设置HttpClient”);
//问:下面maxConnection的值实际上是10,但对于30台工作计算机,我只能看到服务器端建立了115个TCP连接。因此,此设置并没有像我预期的那样真正生效。。。。。
int maxConnection=MapServerBatchBeamApplication.CONFIG.getInt(“mapserver.client.CONFIG.max_connection”);
int timeout=MapServerBatchBeamApplication.CONFIG.getInt(“mapserver.client.CONFIG.timeout”);
_ConnManager=new-poollighttpclientconnectionmanager();
对于(字符串映射服务器:\ u映射服务器){
HttpHost serverHost=新的HttpHost(mapServer,80);
_setMaxPerRoute(新的HttpRoute(服务器主机),maxConnection);
}
//配置超时
RequestConfig RequestConfig=RequestConfig.custom()
.setConnectTimeout(超时)
.setConnectionRequestTimeout(超时)
.setSocketTimeout(超时).build();
//配置重试
_RetryHandler=new-HttpRequestRetryHandler(){
公共布尔返回请求(
IOException异常,
int executionCount,
HttpContext(上下文){
Logger.info(exception.toString());
info(“try请求:+executionCount”);
如果(执行次数>=5){
//如果超过最大重试次数,请不要重试
返回false;
}
if(中断的异常实例EDIoException){
//超时
返回false;
}
if(未知后异常的异常实例){
//未知宿主
返回false;
}
if(ConnectTimeoutException的异常实例){
//拒绝连接
返回false;
}
if(SSLexException的异常实例){
//SSL握手异常
返回false;
}
返回true;
}
};
_HttpClient=HttpClients.custom()
.setConnectionManager(_-ConnManager)
.setDefaultRequestConfig(requestConfig)
.setRetryHandler(_RetryHandler)
.build();
info(“设置HttpClient已完成”);
}
@拆卸
公共无效拆卸(){
info(“拆掉HttpClient和连接管理器”);
试一试{
_HttpClient.close();
_ConnManager.close();
}捕获(例外e){
Logger.warn(例如toString());
}
info(“HttpClient和连接管理器已被删除”);
}
@过程元素
公共void processElement(ProcessContext c){
PreparedRequest请求=c.element();
if(请求==null)
返回;
String response=“{\”我的错误\“:\”无法通过重试\“}”从映射服务器获取响应;
字符串chosenServer=_-MapServers[request.getHardwareId()%_-MapServers.length];
字符串参数;
试一试{
参数=URLEncoder.encode(request.getRequest(),“UTF-8”);
}捕获(不支持的编码异常e){
Logger.error(例如toString());
返回;
}
StringBuilder sb=new StringBuilder().append(MapServerBatchBeamApplication.CONFIG.getString(“mapserver.client.CONFIG.api_path”))
.append(“?坐标=”)
.append(参数);
HttpGet getRequest=newhttpget(sb.toString());
HttpHost主机=新的HttpHost(chosenServer,80,“http”);
可关闭的httpresponse httpresponse;
试一试{
httpRes=_HttpClient.execute(主机,getRequest);
HttpEntity entity=httpRes.getEntity();
如果(实体!=null){
尝试
{
response=EntityUtils.toString(实体);
}最后{
EntityUtils.consume(实体);
httpRes.close();
}
}
}捕获(例外e){
Logger.warn(“从映射服务器获取响应并重试”+request.getRequest())失败);
}
c、 输出(请求、响应的千伏);
}
}
在 @ProcessElement 或 @FinishBundle 中同步输出