使用curl语法获取单页应用程序的生成源代码?
标签:curl, single-page-application
我一直在命令行中使用 curl 来请求 URL 并解析其标记。对于经过身份验证的页面,这很容易做到:进入 Chrome,加载该 URL,打开开发者工具(inspector),在网络历史记录的顶部找到该 URL,右键单击它,然后选择
Copy | Copy as Curl
我想对一个单页应用程序做同样的事情,它当然会运行大量的其他东西来呈现自己,比如javascript,或者其他任何东西
有没有工具可以让我轻松地将“curl”更改为其他内容,并下载生成的页面源代码
例如,通常情况下,如果不是单页应用程序,我会运行从 Chrome 复制的 curl 命令,以获取经过身份验证的页面的源代码
我想把它转换成其他的东西,它接收所有的头,最好是与curl完全相同的语法,并给我生成的源代码
downloadGeneratedSource 'https://mywebsite.com/singlePageApplication' \
-H 'Connection: keep-alive' \
-H 'Pragma: no-cache' \
-H 'Cache-Control: no-cache' \
-H 'Accept-Language: en,en-US;q=0.9' \
-H 'Cookie: session=XXX'
这在任何地方都存在吗?正如评论中所指出的,有一些专门设计用于模拟用户浏览行为的工具,可以让您通过简单的配置下载单页应用程序(SPA)的生成源代码
另一方面,您说得对:如果在脚本中使用 curl,也可以做类似的事情。在21世纪初,我曾将 wget 与 grep、awk、sed 和 perl 一起使用,通过动态创建的 URL 自动下载受访问控制的页面。这确实是一个与现在的单页应用程序(SPA)非常相似的场景
我选择了 wget 而不是 curl,因为通过管道处理它的输出更容易,但是有必要根据您的特定用例定制这样的脚本。如果您能熟练使用这些工具,只需要几分钟的时间,因为目标 URL 有一些我可以查找的语法——也许您也可以这样做
进一步阅读
- -两种工具的良好(历史)比较
- Python方法是否用于站点刮取
// curl-phantom.js -- grab the "rendered" HTML of a JavaScript-requiring web page.
// Runs under PhantomJS: it uses the `system` and `webpage` modules and the
// global `phantom` object.
// TBD:
// add '-' as magic filename to read from STDIN
// add more curl-like switches? --or just let curl do that and consume the output of curl?
// add a switch for page.render( URLdump); // "screenshot"

// Helper functions; they are assigned further down. `v` is declared here as
// well so that assigning it does not create an implicit global.
var argIs, getArg, v, d;
// Request headers collected from the -H switches, keyed by header name.
var customHeaders = {};
var system = require('system'); // var args = require('system').args;
var page = require('webpage').create();

// system.args[0] is the script itself, so a length of 1 means no URL was given.
if (system.args.length === 1) {
    console.log('Usage: curl-phantom.js <http://URL/path/file.ext>');
    console.log(system.args);
    // note: can also read "pages" from the local filesystem
    // NOTE(review): the statements below still follow this call in the file;
    // confirm whether phantom.exit() stops the script before they run.
    phantom.exit();
}

var URLarg = system.args[1];    // the page to load
// Status and URL of the two most recently received resources.
var theStatusCode = null;
var theStatusPrev = null;
var thePrevURL = '';
var theCurrURL = '';
var timestamp = Date.now();     // start time, used to report the loading time
var verbose = false;            // gates v()
var debug = true;               // gates d(); note it is already on by default
var full_page = false;          // when true the whole document is printed, not only <body>
// Only referenced by one verbose log line; they are not added to customHeaders.
var header_key = 'X-Forwarded-For';
var header_val = '3.1.20.13';
var requestTimeout = 5000;      // Default request timeout
// Report whether command-line argument number `i` starts with one of the
// given option names.
//   i    - index into system.args
//   name - first option name to test; any further names may follow as extra
//          arguments, e.g. argIs(i, '-H', '--header')
// Returns true on the first name that is a prefix of the argument, false
// when none is. Previously only `name` was checked and extra names were
// silently ignored.
argIs = function(i, name) {
    var candidate = system.args[i];
    var n;
    for (n = 1; n < arguments.length; n++) {
        if (candidate.indexOf(arguments[n]) === 0) {
            return true;
        }
    }
    return false;
};
// Fetch command-line argument number `i` with leading and trailing
// whitespace stripped.
getArg = function(i) {
    var raw = system.args[i];
    return raw.trim();
};
// Verbose logger: prints its arguments with console.log only while `verbose`
// is set. Most callers pass a single string; the second argument is forwarded
// only when it was actually supplied, so console.log is never handed an
// explicit `undefined` to print.
v = function(a, b) {
    if (!verbose) {
        return;
    }
    if (arguments.length > 1) {
        console.log(a, b);
    } else {
        console.log(a);
    }
};
// Debug logger: same contract as v(), gated on `debug` instead.
d = function(a, b) {
    if (!debug) {
        return;
    }
    if (arguments.length > 1) {
        console.log(a, b);
    } else {
        console.log(a);
    }
};
// Parse the command-line switches.
// system.args[1] is the URL (it is read into URLarg), so the switches start
// at index 2; starting at 1 reported the URL itself as an unknown parameter.
for (var i = 2; i < system.args.length; i++) {
    if (argIs(i, '--debug')) {
        debug = true;
        d('DEBUG: ' + getArg(i));
    }
    else if (argIs(i, '--full_page')) {
        full_page = true;
        d('PAGE: ' + getArg(i));
    }
    // Both spellings are tested explicitly so --header is recognised too.
    else if (argIs(i, '-H') || argIs(i, '--header')) {
        if (i + 1 >= system.args.length) {
            // The switch is the last argument: there is no value to consume.
            console.log('missing value for: ' + getArg(i));
            break;
        }
        var arg = getArg(++i);
        // Split at the FIRST colon only, so values that contain ':' themselves
        // (URLs, some cookies) are kept instead of being replaced by ''.
        var colonAt = arg.indexOf(':');
        var key = (colonAt === -1) ? arg : arg.substring(0, colonAt).trim();
        var value = (colonAt === -1) ? '' : arg.substring(colonAt + 1).trim();
        customHeaders[key] = value;
        d('HEADER:', [key, value]);
    }
    else if (argIs(i, '--verbose')) {
        verbose = true;
        v('VERBOSE: ' + getArg(i));
    }
    else if (argIs(i, '--timeout')) {
        if (i + 1 >= system.args.length) {
            console.log('missing value for: ' + getArg(i));
            break;
        }
        // Store a number rather than the raw string; an unparsable value keeps
        // the current timeout.
        var timeoutArg = parseInt(getArg(++i), 10);
        if (isNaN(timeoutArg)) {
            console.log('invalid timeout: ' + getArg(i));
        } else {
            requestTimeout = timeoutArg;
        }
        d('REQUEST_TIMEOUT', requestTimeout);
    }
    else {
        console.log('unknown param: ' + getArg(i));
    }
}
// Print the collected headers, then hand the headers and the timeout to the
// page object before it is opened.
[
    '################',
    'headers and values',
    JSON.stringify(customHeaders)
].forEach(function (line) {
    console.log(line);
});
page.customHeaders = customHeaders;
page.settings.resourceTimeout = requestTimeout;
// header_key/header_val are only reported here; the assignment that once used
// them is disabled:
//page.customHeaders = { header_key : header_val };
v('VERBOSE: ' + header_key +': '+ header_val);
// Callback for console messages reported by the page: each message is
// relayed through the debug logger.
page.onConsoleMessage = function (text) {
    var line = 'DEBUG: console.log message="' + text + '"';
    d(line);
};
// When a load finishes and debugging is on, report the outcome on stderr.
// The last recorded resource status code is shown when there is one,
// otherwise the `status` value passed to this callback.
page.onLoadFinished = function(status) {
    var shown;
    if ( !debug ) {
        return;
    }
    shown = theStatusCode ? theStatusCode : status;
    system.stderr.write('OnLoadFinished.Status: ' + shown + ' after onLoadFinished(' + status + ')\n');
};
// Track the status code and URL of the two most recently received resources
// and log them in verbose mode. Every resource is recorded; an earlier filter
// on the main URL / redirects is disabled.
page.onResourceReceived = function(response) {
    // shift current -> previous, then record the new resource
    theStatusPrev = theStatusCode;
    thePrevURL = theCurrURL;
    theStatusCode = response.status;
    theCurrURL = response.url;

    if ( response.status !== 200 ) {
        v('Status Code was: ' + theStatusPrev + ' for ' + thePrevURL );
        v('Status Code is : ' + theStatusCode + ' for ' + theCurrURL );
        return;
    }
    // don't usually log standard success
    v('VERBOSE status ' + response.status + ' for ' + response.url );
};
// Callback for URL changes: logs the new URL, and the requested URL as well
// when the two differ.
page.onUrlChanged = function (URLnew) {
    var unchanged = ( URLnew === URLarg );
    if ( !unchanged ) {
        v('DEBUG: old URL: ' + URLarg);
        v('DEBUG: new URL: ' + URLnew);
        return;
    }
    d('DEBUG: old/new URL: ' + URLnew + ' --onUrlChanged()');
};
// Error handler on the phantom object: prints the message followed by one
// line per trace entry (file or sourceURL, line number and, when present,
// the function name) via console.error.
phantom.onError = function(msg, trace) {
    var report = ['PHANTOM ERROR: ' + msg];
    if (trace) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            var where = frame.file || frame.sourceURL;
            var inFunction = frame.function ? ' (in function ' + frame.function + ')' : '';
            report.push(' -> ' + where + ': ' + frame.line + inFunction);
        });
    }
    console.error(report.join('\n'));
};
// A resource request timed out: report its error code and error string,
// then exit with code 1.
page.onResourceTimeout = function(request) {
    var reason = request.errorCode + ' - ' + request.errorString;
    console.error('Request timed out due to ' + reason);
    phantom.exit(1);
};
// Load the requested URL and print the rendered markup.
// The whole document is printed when --full_page was given or the body is
// (nearly) empty, otherwise only the body's innerHTML. The process exits
// with the status code of the last resource received.
page.open( URLarg, function () {
    // onLoadFinished executes here
    var page_content = page.content;
    var body_innerHTML = page.evaluate( function() {
        // guard against a document without a <body>
        return (document.body && document.body.innerHTML) ? document.body.innerHTML : '(empty)' ;
    });
    var title = page.evaluate(function() {return document.title; });
    // Treat a missing evaluate result like an empty body instead of throwing
    // on `.length`.
    var bodyLength = body_innerHTML ? body_innerHTML.length : 0;
    // Exits with the status code of the last resource received.
    var finish = function() {
        phantom.exit( theStatusCode);
    };
    // page.render( URLdump); // "screenshot"
    v('VERBOSE: Loading time '+ ( Date.now() - timestamp ) +' msec');
    d('DEBUG: Page title: ' + ((title==='') ? '(none)':title) );
    d('DEBUG: body_innerHTML.length='+ bodyLength);
    d(' ');
    if ( full_page || ( ! body_innerHTML ) || bodyLength < 9 ) {
        console.log( page_content ); // return all if body is empty
    } else {
        console.log( body_innerHTML );
    }
    if ( verbose ) {
        // The delayed status report only makes sense in verbose mode. The exit
        // now happens inside the timer callback; exiting right after
        // scheduling the timer meant the report could never run.
        setTimeout(function() {
            v('VERBOSE: status ' + theStatusPrev + ' for ' + thePrevURL + ' (b)');
            v('VERBOSE: status ' + theStatusCode + ' for ' + theCurrURL + ' (c)');
            finish();
        }, 1333 ) ; // delay in milliseconds
    } else {
        finish();
    }
}) ;
var argIs,getArg,d;
var customHeaders={};
//抓取需要JavaScript的网页的“呈现”HTML
//待定:
//添加“-”作为从标准输入读取的魔法文件名
//添加更多类似卷曲的开关--或者让curl这样做并消耗curl的输出?
//为page.render(URLdump)添加开关;//“截图”
var system=require('system');//var args=require('system')。args;
var page=require('webpage')。create();
if(system.args.length==1){
log('用法:curl-phantom.js');
console.log(system.args);
//注意:还可以从本地文件系统读取“页面”
phantom.exit();
};
var URLarg=system.args[1];
var theStatusCode=null;
var theStatusPrev=null;
var thePrevURL='';
var thecurrull='';
var timestamp=Date.now();
var verbose=false;
var debug=true;
var full_page=false;
var header_key='X-Forwarded-For';
风险值标题_val='3.1.20.13';
var requestTimeout=5000;//默认请求超时
argIs=函数(i,名称){
if(system.args[i].indexOf(name)==0){
返回true;
}
返回false;
}
getArg=函数(i){
返回系统.args[i].trim();
}
v=功能(a,b){
verbose&&console.log(a,b)
}
d=功能(a,b){
调试和控制台日志(a、b)
}
对于(变量i=1;i=300&&statuscode<400)){
statusprev=statuscode;
statuscode=resource.status;
thePrevURL=当前URL;
currURL=resource.url;
// }
如果(resource.status==200){
v('VERBOSE status'+resource.status+'表示'+resource.url);//通常不记录标准成功
}否则{
v('状态代码为:'+theStatusPrev+'表示'+thePrevURL');
v('状态代码为:'+theStatusCode+'表示'+theCurrURL');
}
};
page.onUrlChanged=函数(URLnew){//回调函数截获console.log消息
如果(URLnew==URLarg){
d('DEBUG:old/newurl:'+URLnew+'--onUrlChanged()');
}否则{
v('DEBUG:old URL:'+URLarg);
v('DEBUG:newurl:'+URLnew);
}
};
phantom.onError=函数(消息,跟踪){
var msgStack=['PHANTOM ERROR:'+msg];
如果(跟踪){
msgStack.push('TRACE:');
trace.forEach(函数(t){
msgStack.push('->'+(t.file | | t.sourceURL)+':'+t.line+(t.function?'(在函数'+t.function+'):'');
});
}
console.error(msgStack.join('\n'));
};
page.onResourceTimeout=函数(请求){
console.error('请求因'+Request.errorCode+'-'+Request.errorString'而超时);
幻影。出口(1);
}
page.open(URLarg,函数(){
//onLoadFinished在此执行
var page_content=page.content;
var body_innerHTML=page.evaluate(函数(){
返回document.body.innerHTML?document.body.innerHTML:'(空)';
});
var title=page.evaluate(函数(){return document.title;});
//page.render(URLdump);/“屏幕截图”
v('VERBOSE:Loading time'+(Date.now()-timestamp)+'msec');
d('DEBUG:Page title:'+((title=='')'(none)':title));
d('DEBUG:body_innerHTML.length='+body_innerHTML.length);
d(“”);
如果(完整页面| |(!body_innerHTML)| | body_innerHTML.length<9){
console.log(page_content);//如果正文为空,则返回all
}否则{
log(body_innerHTML);
}
setTimeout(函数(){
v('VERBOSE:status'+theStatusPrev+'表示'+thePrevURL+'(b)');
v('VERBOSE:status'+theStatusCode+'表示'+thecurrull+'(c)');
// curl-phantom.js -- grab the "rendered" HTML of a JavaScript-requiring web page.
// Runs under PhantomJS: it uses the `system` and `webpage` modules and the
// global `phantom` object.
// TBD:
// add '-' as magic filename to read from STDIN
// add more curl-like switches? --or just let curl do that and consume the output of curl?
// add a switch for page.render( URLdump); // "screenshot"

// Helper functions; they are assigned further down. `v` is declared here as
// well so that assigning it does not create an implicit global.
var argIs, getArg, v, d;
// Request headers collected from the -H switches, keyed by header name.
var customHeaders = {};
var system = require('system'); // var args = require('system').args;
var page = require('webpage').create();

// system.args[0] is the script itself, so a length of 1 means no URL was given.
if (system.args.length === 1) {
    console.log('Usage: curl-phantom.js <http://URL/path/file.ext>');
    console.log(system.args);
    // note: can also read "pages" from the local filesystem
    // NOTE(review): the statements below still follow this call in the file;
    // confirm whether phantom.exit() stops the script before they run.
    phantom.exit();
}

var URLarg = system.args[1];    // the page to load
// Status and URL of the two most recently received resources.
var theStatusCode = null;
var theStatusPrev = null;
var thePrevURL = '';
var theCurrURL = '';
var timestamp = Date.now();     // start time, used to report the loading time
var verbose = false;            // gates v()
var debug = true;               // gates d(); note it is already on by default
var full_page = false;          // when true the whole document is printed, not only <body>
// Only referenced by one verbose log line; they are not added to customHeaders.
var header_key = 'X-Forwarded-For';
var header_val = '3.1.20.13';
var requestTimeout = 5000;      // Default request timeout
// Report whether command-line argument number `i` starts with one of the
// given option names.
//   i    - index into system.args
//   name - first option name to test; any further names may follow as extra
//          arguments, e.g. argIs(i, '-H', '--header')
// Returns true on the first name that is a prefix of the argument, false
// when none is. Previously only `name` was checked and extra names were
// silently ignored.
argIs = function(i, name) {
    var candidate = system.args[i];
    var n;
    for (n = 1; n < arguments.length; n++) {
        if (candidate.indexOf(arguments[n]) === 0) {
            return true;
        }
    }
    return false;
};
// Fetch command-line argument number `i` with leading and trailing
// whitespace stripped.
getArg = function(i) {
    var raw = system.args[i];
    return raw.trim();
};
// Verbose logger: prints its arguments with console.log only while `verbose`
// is set. Most callers pass a single string; the second argument is forwarded
// only when it was actually supplied, so console.log is never handed an
// explicit `undefined` to print.
v = function(a, b) {
    if (!verbose) {
        return;
    }
    if (arguments.length > 1) {
        console.log(a, b);
    } else {
        console.log(a);
    }
};
// Debug logger: same contract as v(), gated on `debug` instead.
d = function(a, b) {
    if (!debug) {
        return;
    }
    if (arguments.length > 1) {
        console.log(a, b);
    } else {
        console.log(a);
    }
};
// Parse the command-line switches.
// system.args[1] is the URL (it is read into URLarg), so the switches start
// at index 2; starting at 1 reported the URL itself as an unknown parameter.
for (var i = 2; i < system.args.length; i++) {
    if (argIs(i, '--debug')) {
        debug = true;
        d('DEBUG: ' + getArg(i));
    }
    else if (argIs(i, '--full_page')) {
        full_page = true;
        d('PAGE: ' + getArg(i));
    }
    // Both spellings are tested explicitly so --header is recognised too.
    else if (argIs(i, '-H') || argIs(i, '--header')) {
        if (i + 1 >= system.args.length) {
            // The switch is the last argument: there is no value to consume.
            console.log('missing value for: ' + getArg(i));
            break;
        }
        var arg = getArg(++i);
        // Split at the FIRST colon only, so values that contain ':' themselves
        // (URLs, some cookies) are kept instead of being replaced by ''.
        var colonAt = arg.indexOf(':');
        var key = (colonAt === -1) ? arg : arg.substring(0, colonAt).trim();
        var value = (colonAt === -1) ? '' : arg.substring(colonAt + 1).trim();
        customHeaders[key] = value;
        d('HEADER:', [key, value]);
    }
    else if (argIs(i, '--verbose')) {
        verbose = true;
        v('VERBOSE: ' + getArg(i));
    }
    else if (argIs(i, '--timeout')) {
        if (i + 1 >= system.args.length) {
            console.log('missing value for: ' + getArg(i));
            break;
        }
        // Store a number rather than the raw string; an unparsable value keeps
        // the current timeout.
        var timeoutArg = parseInt(getArg(++i), 10);
        if (isNaN(timeoutArg)) {
            console.log('invalid timeout: ' + getArg(i));
        } else {
            requestTimeout = timeoutArg;
        }
        d('REQUEST_TIMEOUT', requestTimeout);
    }
    else {
        console.log('unknown param: ' + getArg(i));
    }
}
// Print the collected headers, then hand the headers and the timeout to the
// page object before it is opened.
[
    '################',
    'headers and values',
    JSON.stringify(customHeaders)
].forEach(function (line) {
    console.log(line);
});
page.customHeaders = customHeaders;
page.settings.resourceTimeout = requestTimeout;
// header_key/header_val are only reported here; the assignment that once used
// them is disabled:
//page.customHeaders = { header_key : header_val };
v('VERBOSE: ' + header_key +': '+ header_val);
// Callback for console messages reported by the page: each message is
// relayed through the debug logger.
page.onConsoleMessage = function (text) {
    var line = 'DEBUG: console.log message="' + text + '"';
    d(line);
};
// When a load finishes and debugging is on, report the outcome on stderr.
// The last recorded resource status code is shown when there is one,
// otherwise the `status` value passed to this callback.
page.onLoadFinished = function(status) {
    var shown;
    if ( !debug ) {
        return;
    }
    shown = theStatusCode ? theStatusCode : status;
    system.stderr.write('OnLoadFinished.Status: ' + shown + ' after onLoadFinished(' + status + ')\n');
};
// Track the status code and URL of the two most recently received resources
// and log them in verbose mode. Every resource is recorded; an earlier filter
// on the main URL / redirects is disabled.
page.onResourceReceived = function(response) {
    // shift current -> previous, then record the new resource
    theStatusPrev = theStatusCode;
    thePrevURL = theCurrURL;
    theStatusCode = response.status;
    theCurrURL = response.url;

    if ( response.status !== 200 ) {
        v('Status Code was: ' + theStatusPrev + ' for ' + thePrevURL );
        v('Status Code is : ' + theStatusCode + ' for ' + theCurrURL );
        return;
    }
    // don't usually log standard success
    v('VERBOSE status ' + response.status + ' for ' + response.url );
};
// Callback for URL changes: logs the new URL, and the requested URL as well
// when the two differ.
page.onUrlChanged = function (URLnew) {
    var unchanged = ( URLnew === URLarg );
    if ( !unchanged ) {
        v('DEBUG: old URL: ' + URLarg);
        v('DEBUG: new URL: ' + URLnew);
        return;
    }
    d('DEBUG: old/new URL: ' + URLnew + ' --onUrlChanged()');
};
// Error handler on the phantom object: prints the message followed by one
// line per trace entry (file or sourceURL, line number and, when present,
// the function name) via console.error.
phantom.onError = function(msg, trace) {
    var report = ['PHANTOM ERROR: ' + msg];
    if (trace) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            var where = frame.file || frame.sourceURL;
            var inFunction = frame.function ? ' (in function ' + frame.function + ')' : '';
            report.push(' -> ' + where + ': ' + frame.line + inFunction);
        });
    }
    console.error(report.join('\n'));
};
// A resource request timed out: report its error code and error string,
// then exit with code 1.
page.onResourceTimeout = function(request) {
    var reason = request.errorCode + ' - ' + request.errorString;
    console.error('Request timed out due to ' + reason);
    phantom.exit(1);
};
// Load the requested URL and print the rendered markup.
// The whole document is printed when --full_page was given or the body is
// (nearly) empty, otherwise only the body's innerHTML. The process exits
// with the status code of the last resource received.
page.open( URLarg, function () {
    // onLoadFinished executes here
    var page_content = page.content;
    var body_innerHTML = page.evaluate( function() {
        // guard against a document without a <body>
        return (document.body && document.body.innerHTML) ? document.body.innerHTML : '(empty)' ;
    });
    var title = page.evaluate(function() {return document.title; });
    // Treat a missing evaluate result like an empty body instead of throwing
    // on `.length`.
    var bodyLength = body_innerHTML ? body_innerHTML.length : 0;
    // Exits with the status code of the last resource received.
    var finish = function() {
        phantom.exit( theStatusCode);
    };
    // page.render( URLdump); // "screenshot"
    v('VERBOSE: Loading time '+ ( Date.now() - timestamp ) +' msec');
    d('DEBUG: Page title: ' + ((title==='') ? '(none)':title) );
    d('DEBUG: body_innerHTML.length='+ bodyLength);
    d(' ');
    if ( full_page || ( ! body_innerHTML ) || bodyLength < 9 ) {
        console.log( page_content ); // return all if body is empty
    } else {
        console.log( body_innerHTML );
    }
    if ( verbose ) {
        // The delayed status report only makes sense in verbose mode. The exit
        // now happens inside the timer callback; exiting right after
        // scheduling the timer meant the report could never run.
        setTimeout(function() {
            v('VERBOSE: status ' + theStatusPrev + ' for ' + thePrevURL + ' (b)');
            v('VERBOSE: status ' + theStatusCode + ' for ' + theCurrURL + ' (c)');
            finish();
        }, 1333 ) ; // delay in milliseconds
    } else {
        finish();
    }
}) ;