Multithreading 在 Node.js 模块中使用 cluster(集群)

Multithreading 在节点模块中使用集群,multithreading,node.js,cluster-computing,Multithreading,Node.js,Cluster Computing,更新:根据评论,即使这个特定场景不现实,我仍然对如何编写一个模块来利用集群而不必每次重新运行父进程感兴趣 我正在尝试编写一个名为mass request的Node.js模块,它通过将大量HTTP请求分发到子进程来加速这些请求 我的希望是,在外部,它是这样工作的 var mr = require("mass-request"), scraper = mr(); for (var i = 0; i < my_urls_to_visit.length; i += 1) { s

更新:根据评论,即使这个特定场景不现实,我仍然对如何编写一个模块来利用集群而不必每次重新运行父进程感兴趣


我正在尝试编写一个名为
mass-request
的Node.js模块,它通过将大量HTTP请求分发到子进程来加速这些请求

我的希望是,在外部,它是这样工作的

// Intended public usage of the mass-request module: create a scraper, then
// queue every URL together with a callback that receives its response.
var mr = require("mass-request"),
    scraper = mr();

for (var i = 0; i < my_urls_to_visit.length; i += 1) {
    scraper.add(my_urls_to_visit[i], function(resp) {
        // do something with response
    }); // closes the add() call (the original snippet left it open)
}
我希望看到“hello from mass-request!”被记录四次(实际上也确实如此)。令我惊讶的是,我还看到了四次“hello from test.js”。显然,我并不理解
cluster.fork()
是如何工作的。是不是重新运行整个过程,而不仅仅是第一次调用它的函数


如果是这样的话,如何在模块中使用集群而不让使用该模块的人感到混乱的多进程逻辑

虽然node.js的异步特性确实让它很棒,但它仍然在服务器上的单个线程中以单个事件循环运行。使用集群多线程处理node.js应用程序可以将应用程序的子进程分叉到它们自己的线程中,从而更好地利用多核服务器。不久前,我构建了一个游戏服务器体系结构,它使用集群和zmq(ZeroMQ)实现多线程,并使进程能够轻松地通过各种通道来回发送消息。我已将该体系结构简化为下面的示例,希望有助于说明如何将多线程node.js组合在一起。如果有点粗糙,我很抱歉,那是几年前的事了,当时我对node还比较陌生;)

理想情况下,您不希望在单个脚本中嵌套主/子脚本的所有内容,但我认为这是允许您复制/粘贴/运行的最简单方法:)

正如您在评论中提到的,我给出了一个很好的集群示例,但并不是一个适合您的特定用例的示例,因为它可以将所有内容分发给您。我没有太多的时间,所以我修改了我的示例,使其能够很快满足您的需要。试一试:

mass-request.js

var cluster = require('cluster');
var zmq = require('zmq');

module.exports = {
    _childId : null,
    _urls : [],
    _threadCount : 1,
    _readyThreads : 0,
    _callbacks : {},
    zmqReceive : null, //the socket we receive on for this thread
    zmqMaster : null, //the socket to the master
    zmqChildren : {}, //an object storing the sockets for the children
    setThreads : function( threadCount ) {
        this._threadCount = threadCount;
    },
    add : function( url , cb ) {
        this._urls.push( {url: url, cb : cb } );
    },
    run : function() {

        if( cluster.isMaster ) {

            this._masterThread();

        } else {

            this._childThread();

        }

    },
    _masterThread : function() {

        console.log( 'Master Process Starting Up' );

        this.zmqReceive = zmq.socket('pull').bindSync( 'ipc://master.ipc' );

        //bind handler for messages coming into this process using closure to allow us to access the massrequest object inside the callback
        ( function( massRequest ) {
            this.zmqReceive.on( 'message' , function( msg ) {

                msg = JSON.parse(msg);

                //was this an online notification?
                if( msg && msg.status == 'Online' ) {
                    massRequest._threadReady();
                    return; //we're done
                }
                if( msg && msg.html ) {
                    //this was a response from a child, call the callback for it
                    massRequest._callbacks[ msg.sender ].call( massRequest , msg.html );
                    //send the child another URL
                    massRequest._sendUrlToChild( msg.sender );
                }

            } );
        }).call( this , this );

        //fork 4 child processes and set up the sending sockets for them
        for( var i=0; i < this._threadCount; ++i ) {
            //set up the sending socket
            this.zmqChildren[i] = zmq.socket('push').connect( 'ipc://child_' + i + '.ipc' );
            //fork the process and pass it an id
            cluster.fork( {
                _childId:i
            } );
        }

    },
    _sendUrlToChild : function( child ) {
        //if there's no urls left, return (this would also be a good place to send a message to the child to exit gracefully)
        if( !this._urls.length ) return;
        //grab a url to process
        var item = this._urls.pop();
        //set the callback for the child
        this._callbacks[child] = item.cb;
        this.zmqChildren[child].send( JSON.stringify( { url:item.url } ) );
    },
    _processUrls : function() {
        for( var i=0; i < this._threadCount; ++i ) {
            this._sendUrlToChild( i );
        }
    },
    _threadReady : function() {
        if( ++this._readyThreads >= this._threadCount ) {
            //all threads are ready, send out urls to start the mayhem
            console.log( 'All threads online, starting URL processing' );
            this._processUrls();
        }
    },
    _childProcessUrl : function( url ) {
        console.log( 'Child Process ' + this.childId + ' Handling URL: ' + url );
        //do something here to scrape your content however you see fit
        var html = 'HTML';
        this.zmqMaster.send( JSON.stringify( { sender:this.childId, html:html } ) );
    },
    _childThread : function() {

        //get the child id that was passed from cluster
        this.childId = process.env._childId;

        console.log( 'Child Process ' + this.childId + ' Starting Up' );

        //bind the pull socket to receive messages to this process
        this.zmqReceive = zmq.socket('pull').bindSync( 'ipc://child_' + this.childId + '.ipc' );

        //bind the push socket to send to the master
        this.zmqMaster = zmq.socket('push').connect('ipc://master.ipc');

        //bind handler for messages coming into this process
        ( function( massRequest ) {
            this.zmqReceive.on( 'message' , function( msg ) {

                msg = JSON.parse(msg);

                console.log( 'Child ' + this.childId + ': ' + msg );

                //handle the url
                if( msg && msg.url ) massRequest._childProcessUrl( msg.url );

            } );
        }).call( this , this );

        //let the master know we're done setting up
        this.zmqMaster.send( JSON.stringify({sender:this.childId,status:'Online'}) );

    },
}
// demo.js: queue a handful of URLs on the mass-request module and start it.
var mr = require( './mass-request.js' );

mr.setThreads( 4 );

// Register the URLs in this order; each callback only logs that its URL finished.
[
    'http://foo.com',
    'http://bar.com',
    'http://alpha.com',
    'http://beta.com',
    'http://theta.com',
    'http://apples.com',
    'http://oranges.com'
].forEach( function( url ) {
    mr.add( url , function( resp ) {
        console.log( url + ' is done' );
    } );
} );

mr.run();
var cluster = require("cluster"),
  path = require("path"),
  numCPUs = require("os").cpus().length;

console.log("hello from mass-request!");
if (cluster.isMaster) {
  cluster.setupMaster({
    exec: path.join(__dirname, 'worker.js')
  });

  for (var i = 0; i < numCPUs; i += 1) {
    var worker = cluster.fork();
  }

  return {
    add: function (url, cb) {
    }
  }
} else {
  console.log("worker " + process.pid + " is born!");
}
// worker.js: announce this process on stdout.
var workerPid = process.pid;
console.log("worker " + workerPid + " is born!");
将它们放在同一文件夹中,然后运行
node demo.js

我还应该指出,由于这个基础是从我的另一个使用 0MQ(ZeroMQ)的项目中提取出来的,因此您需要先安装 0MQ,并一并安装它的 node.js 模块:
npm install zmq
显然还需要 cluster 模块。当然,你可以将 zmq 部分换成你想要的任何其他进程间通信方法。这恰好是我熟悉并使用过的方法

概述:主线程(即调用 run() 方法的脚本)会启动 X 个子线程(数量可以通过调用 setThreads 进行设置)。这些子线程完成初始化后,通过 ZeroMQ 套接字向主线程报告。一旦所有线程就绪,主脚本就将 URL 分派给子线程,以便它们运行并获取 HTML。子线程将 HTML 返回给主线程,主线程把它传递给该 URL 对应的回调函数,然后再向该子线程分派另一个 URL。不过这并不是一个完美的解决方案:回调函数仍然在主线程中执行,因此仍会在那里遇到瓶颈,因为您不能轻松地将它们转移到另一个线程。这些回调可能包含闭包/变量等,如果没有某种对象共享机制,它们可能无法在父线程之外正常工作

总之,如果你运行我的这个小演示,你会看到4个线程在“处理”URL(为了简单起见,它们实际上并不加载URL)


希望这能有所帮助;)

我相信你要找的是 cluster.setupMaster

从文档中:

cluster.setupMaster([settings])

  • settings 对象
    • exec 字符串:工作进程文件的路径。(默认值=process.argv[1])
    • args 数组:传递给 worker 的字符串参数。(默认值=process.argv.slice(2))
    • silent 布尔值:是否将输出发送到父进程的 stdio。(默认值=false)
setupMaster用于更改默认的“fork”行为。调用后,设置将显示在cluster.settings中

通过使用exec属性,可以从不同的模块启动工作人员

重要提示:正如文档所述,这只能调用一次。如果您的模块依赖于此行为,则调用者不能再使用
cluster
,否则整个模块将崩溃

例如:

index.js

var cluster = require('cluster');
var zmq = require('zmq');

module.exports = {
    _childId : null,
    _urls : [],
    _threadCount : 1,
    _readyThreads : 0,
    _callbacks : {},
    zmqReceive : null, //the socket we receive on for this thread
    zmqMaster : null, //the socket to the master
    zmqChildren : {}, //an object storing the sockets for the children
    setThreads : function( threadCount ) {
        this._threadCount = threadCount;
    },
    add : function( url , cb ) {
        this._urls.push( {url: url, cb : cb } );
    },
    run : function() {

        if( cluster.isMaster ) {

            this._masterThread();

        } else {

            this._childThread();

        }

    },
    _masterThread : function() {

        console.log( 'Master Process Starting Up' );

        this.zmqReceive = zmq.socket('pull').bindSync( 'ipc://master.ipc' );

        //bind handler for messages coming into this process using closure to allow us to access the massrequest object inside the callback
        ( function( massRequest ) {
            this.zmqReceive.on( 'message' , function( msg ) {

                msg = JSON.parse(msg);

                //was this an online notification?
                if( msg && msg.status == 'Online' ) {
                    massRequest._threadReady();
                    return; //we're done
                }
                if( msg && msg.html ) {
                    //this was a response from a child, call the callback for it
                    massRequest._callbacks[ msg.sender ].call( massRequest , msg.html );
                    //send the child another URL
                    massRequest._sendUrlToChild( msg.sender );
                }

            } );
        }).call( this , this );

        //fork 4 child processes and set up the sending sockets for them
        for( var i=0; i < this._threadCount; ++i ) {
            //set up the sending socket
            this.zmqChildren[i] = zmq.socket('push').connect( 'ipc://child_' + i + '.ipc' );
            //fork the process and pass it an id
            cluster.fork( {
                _childId:i
            } );
        }

    },
    _sendUrlToChild : function( child ) {
        //if there's no urls left, return (this would also be a good place to send a message to the child to exit gracefully)
        if( !this._urls.length ) return;
        //grab a url to process
        var item = this._urls.pop();
        //set the callback for the child
        this._callbacks[child] = item.cb;
        this.zmqChildren[child].send( JSON.stringify( { url:item.url } ) );
    },
    _processUrls : function() {
        for( var i=0; i < this._threadCount; ++i ) {
            this._sendUrlToChild( i );
        }
    },
    _threadReady : function() {
        if( ++this._readyThreads >= this._threadCount ) {
            //all threads are ready, send out urls to start the mayhem
            console.log( 'All threads online, starting URL processing' );
            this._processUrls();
        }
    },
    _childProcessUrl : function( url ) {
        console.log( 'Child Process ' + this.childId + ' Handling URL: ' + url );
        //do something here to scrape your content however you see fit
        var html = 'HTML';
        this.zmqMaster.send( JSON.stringify( { sender:this.childId, html:html } ) );
    },
    _childThread : function() {

        //get the child id that was passed from cluster
        this.childId = process.env._childId;

        console.log( 'Child Process ' + this.childId + ' Starting Up' );

        //bind the pull socket to receive messages to this process
        this.zmqReceive = zmq.socket('pull').bindSync( 'ipc://child_' + this.childId + '.ipc' );

        //bind the push socket to send to the master
        this.zmqMaster = zmq.socket('push').connect('ipc://master.ipc');

        //bind handler for messages coming into this process
        ( function( massRequest ) {
            this.zmqReceive.on( 'message' , function( msg ) {

                msg = JSON.parse(msg);

                console.log( 'Child ' + this.childId + ': ' + msg );

                //handle the url
                if( msg && msg.url ) massRequest._childProcessUrl( msg.url );

            } );
        }).call( this , this );

        //let the master know we're done setting up
        this.zmqMaster.send( JSON.stringify({sender:this.childId,status:'Online'}) );

    },
}
// demo.js: queue a handful of URLs on the mass-request module and start it.
var mr = require( './mass-request.js' );

mr.setThreads( 4 );

// Register the URLs in this order; each callback only logs that its URL finished.
[
    'http://foo.com',
    'http://bar.com',
    'http://alpha.com',
    'http://beta.com',
    'http://theta.com',
    'http://apples.com',
    'http://oranges.com'
].forEach( function( url ) {
    mr.add( url , function( resp ) {
        console.log( url + ' is done' );
    } );
} );

mr.run();
var cluster = require("cluster"),
  path = require("path"),
  numCPUs = require("os").cpus().length;

console.log("hello from mass-request!");
if (cluster.isMaster) {
  cluster.setupMaster({
    exec: path.join(__dirname, 'worker.js')
  });

  for (var i = 0; i < numCPUs; i += 1) {
    var worker = cluster.fork();
  }

  return {
    add: function (url, cb) {
    }
  }
} else {
  console.log("worker " + process.pid + " is born!");
}
// worker.js: announce this process on stdout.
var workerPid = process.pid;
console.log("worker " + workerPid + " is born!");
输出

node index.js 
hello from mass-request!
worker 38821 is born!
worker 38820 is born!
worker 38822 is born!
worker 38819 is born!

在子进程js线程中运行请求会有什么帮助?Http请求已经存在于js线程之外。请参阅有趣的内容。因此,让两个或多个进程分担许多URL调用的工作不会加快进程?一个线程进行所有调用,另一个线程处理响应如何?使用多个URL调用的唯一原因是如果js线程是瓶颈,那么js线程就是瓶颈。考虑到node.js的异步性质,当io也出现在图片中时,很少会出现这种情况。因此,只有在进行加密等cpu密集型工作时,将处理分给子线程才有意义。Mozilla persona就是一个很好的例子。请注意,传统上,fork()创建整个进程的副本。它不会“重新运行”整个进程,但会复制内存,并从返回fork()开始在两个进程中继续执行@generalhenry您提供的链接并不支持您最初的说法,即http stuff运行在一个独立的线程上,而不是在主线程上,您对此有更多的信息吗?谢谢!这是群集的一个很好的例子。不过,这不是我真正的问题。问题在于如何在一个模块中执行这类操作,而该模块需要使用不同的脚本,例如父脚本不必担心检查它是否是主脚本。请参阅origin post中的代码示例。干杯!@Ch