使用curl语法获取单页应用程序的生成源代码?

使用curl语法获取单页应用程序的生成源代码?,curl,single-page-application,Curl,Single Page Application,我一直在命令行中使用curl来请求url并解析其标记 对于经过身份验证的页面,我很容易做到这一点,方法是进入Chrome,加载url,然后打开inspector,在网络历史记录的顶部找到url,右键单击它,然后选择Copy | Copy as Curl 我想对一个单页应用程序做同样的事情,它当然会运行大量的其他东西来呈现自己,比如javascript,或者其他任何东西 有没有工具可以让我轻松地将“curl”更改为其他内容,并下载生成的页面源代码 e、 g.通常,如果不是单页应用程序(从Chrom

我一直在命令行中使用curl来请求url并解析其标记

对于经过身份验证的页面,我很容易做到这一点,方法是进入Chrome,加载url,然后打开inspector,在网络历史记录的顶部找到url,右键单击它,然后选择
Copy | Copy as Curl

我想对一个单页应用程序做同样的事情,它当然会运行大量的其他东西来呈现自己,比如javascript,或者其他任何东西

有没有工具可以让我轻松地将“curl”更改为其他内容,并下载生成的页面源代码

例如,对于非单页应用程序,我通常会运行从Chrome复制的curl命令,以获取经过身份验证的页面的源代码

我想把它转换成其他的东西,它接收所有的头,最好是与curl完全相同的语法,并给我生成的源代码

downloadGeneratedSource 'https://mywebsite.com/singlePageApplication' \
  -H 'Connection: keep-alive' \
  -H 'Pragma: no-cache' \
  -H 'Cache-Control: no-cache' \
  -H 'Accept-Language: en,en-US;q=0.9' \
  -H 'Cookie: session=XXX'
这在任何地方都存在吗?

正如和在他们的评论中指出的,或者是设计用于模拟浏览用户行为的奇特工具,从而允许您以轻松配置的方式下载(SPA)的源代码

另一方面,如果在脚本中使用,您可以做类似的事情,这是对的。在21世纪初,我与grep、awk、sed和perl一起使用,通过使用创建的动态url自动下载访问控制页面。这确实是一个与现在的SPA(单页应用程序)非常相似的场景

我选择了
wget
而不是
curl
,因为管道处理它的输出更容易,但是有必要根据您的特定用例定制这样的脚本。如果你能流利地使用,那就需要几分钟的时间,因为目标URL有一些我可以查找的语法-也许你也可以这样做

进一步阅读
  • -两种工具的良好(历史)比较
  • Python方法是否用于站点刮取

我在谷歌上搜索了一下,在phantomjs中找到了一个类似的实现,并试图修改它以适应这个用例,尽管它似乎不起作用。我似乎找不到我以此为基础的要点-(但我在这里把它作为解决方案的一个裂缝;-)

旁注:我只是在做更多的谷歌搜索时发现了这一点

下载生成源

// Forward declarations for the helper functions assigned further down.
// `v` is listed too: it is assigned later without `var`, which would
// otherwise create an implicit global (a ReferenceError in strict mode).
var argIs, getArg, v, d;
// Header name -> value map, filled from the -H command-line options.
var customHeaders = {};

// grab the "rendered" HTML of a JavaScript-requiring web page

// TBD:
// add '-' as magic filename to read from STDIN
// add more curl-like switches? --or just let curl do that and consume the output of curl?
// add a switch for page.render( URLdump); // "screenshot"

var system = require('system'); // var args = require('system').args;
var page = require('webpage').create();

// Only one entry in system.args means no URL was supplied: show the usage
// text together with the raw argument list, then ask PhantomJS to exit.
if (system.args.length === 1) {
    console.log('Usage: curl-phantom.js <http://URL/path/file.ext>');
    console.log(system.args);
    // note: "pages" can also be read from the local filesystem
    phantom.exit();
}

// --- target ---------------------------------------------------------------
var URLarg = system.args[1];   // the URL is always taken from argument 1
var timestamp = Date.now();    // start time, used for the load-time log

// --- status/URL of the two most recently received resources ----------------
var theStatusCode = null;
var theStatusPrev = null;
var theCurrURL = '';
var thePrevURL = '';

// --- output switches --------------------------------------------------------
// NOTE(review): debug starts out true, so the --debug flag cannot change it.
var debug = true;
var verbose = false;
var full_page = false;

// --- request settings -------------------------------------------------------
var requestTimeout = 5000;     // default request timeout
var header_key = 'X-Forwarded-For';
var header_val = '3.1.20.13';

/**
 * Tell whether command-line argument number `i` starts with one of the given
 * option names.
 *
 * The two-parameter form keeps working; any further names are treated as
 * aliases, so argIs(i, '-H', '--header') checks both spellings instead of
 * silently ignoring the second one.
 *
 * @param {number} i     index into system.args
 * @param {string} name  option name to look for; more names may follow
 * @returns {boolean} true when the argument begins with any of the names;
 *                    false otherwise, or when there is no argument at `i`
 */
argIs = function(i, name) {
  var candidate = system.args[i];
  var k;
  if (typeof candidate !== 'string') {
    return false;
  }
  // arguments[0] is the index; everything after it is an option name.
  for (k = 1; k < arguments.length; k++) {
    if (candidate.indexOf(arguments[k]) === 0) {
      return true;
    }
  }
  return false;
};

/**
 * Fetch command-line argument number `i` with surrounding whitespace removed.
 *
 * @param {number} i  index into system.args
 * @returns {string} the trimmed argument
 */
getArg = function(i) {
  var raw = system.args[i];
  return raw.trim();
};

/**
 * Verbose logger: prints through console.log only while `verbose` is truthy.
 *
 * Exactly the arguments received are forwarded, so the common one-argument
 * call no longer hands an explicit `undefined` to console.log as well.
 *
 * @param {*} a    first value to print
 * @param {*} [b]  optional second value to print
 */
v = function(a, b) {
  if (verbose) {
    console.log.apply(console, arguments);
  }
};

/**
 * Debug logger: prints through console.log only while `debug` is truthy.
 *
 * Exactly the arguments received are forwarded, so the common one-argument
 * call no longer hands an explicit `undefined` to console.log as well.
 *
 * @param {*} a    first value to print
 * @param {*} [b]  optional second value to print
 */
d = function(a, b) {
  if (debug) {
    console.log.apply(console, arguments);
  }
};

// Parse the curl-like options that follow the URL.
// system.args[1] is the URL (it is read into URLarg), so options start at
// index 2; starting at 1 reported the URL itself as an "unknown param".
for (var i = 2; i < system.args.length; i++) {
  if (argIs(i, '--debug')) {
    debug = true;
    d('DEBUG: ' + getArg(i));
  }
  else if (argIs(i, '--full_page')) {
    full_page = true;
    d('PAGE: ' + getArg(i));
  }
  // Both spellings are tested with separate calls so this works whether or
  // not argIs() understands more than one option name.
  else if (argIs(i, '-H') || argIs(i, '--header')) {
    if (i + 1 >= system.args.length) {
      console.error('missing value for ' + getArg(i));
      break;
    }
    var headerArg = getArg(++i);
    // Split at the FIRST colon only: values such as URLs or cookies may
    // contain further colons and used to be replaced by an empty string.
    var colonAt = headerArg.indexOf(':');
    var key = (colonAt === -1) ? headerArg : headerArg.slice(0, colonAt).trim();
    var value = (colonAt === -1) ? '' : headerArg.slice(colonAt + 1).trim();
    customHeaders[key] = value;

    d('HEADER:', [key, value]);
  }
  else if (argIs(i, '--verbose')) {
    verbose = true;
    v('VERBOSE: ' + getArg(i));
  }
  else if (argIs(i, '--timeout')) {
    if (i + 1 >= system.args.length) {
      console.error('missing value for ' + getArg(i));
      break;
    }
    // Store a number rather than the raw argument string; keep the default
    // when the value is not numeric.
    var timeoutArg = parseInt(getArg(++i), 10);
    if (isNaN(timeoutArg)) {
      console.error('ignoring non-numeric --timeout value: ' + getArg(i));
    } else {
      requestTimeout = timeoutArg;
    }
    d('REQUEST_TIMEOUT', requestTimeout);
  }
  else {
    console.log('unknown param: ' + getArg(i));
  }
}
// Print the collected headers. This is written with console.log
// unconditionally, independent of the debug/verbose switches.
[
  '################',
  'headers and values',
  JSON.stringify(customHeaders)
].forEach(function (line) {
  console.log(line);
});

// Hand the parsed command-line settings over to the page object.
page.settings.resourceTimeout = requestTimeout;
page.customHeaders = customHeaders;

// NOTE(review): header_key/header_val are only logged here; the assignment
// that would have sent them ({ header_key : header_val }) was commented out.
v('VERBOSE: ' + header_key +': '+ header_val);

// Callback that intercepts console.log messages and relays each one to the
// debug logger.
page.onConsoleMessage = function (msg) {
    var line = 'DEBUG: console.log message="' + msg + '"';
    d(line);
};

// When debugging, report the load result on stderr. The last recorded
// status code is shown if there is one, otherwise the status string passed in.
page.onLoadFinished = function(status) {
  if ( !debug ) {
    return;
  }
  var shown = theStatusCode ? theStatusCode : status;
  system.stderr.write('OnLoadFinished.Status: ' + shown + ' after onLoadFinished(' + status + ')\n');
};

// Track the status code and URL of the latest received resource, keeping the
// previous pair as well, and report them through the verbose logger.
// NOTE(review): a filter limiting this to the requested URL / redirects was
// commented out, so every received resource updates the state.
page.onResourceReceived = function(resource) {
    // shift "current" into "previous", then record the new resource
    thePrevURL    = theCurrURL;
    theStatusPrev = theStatusCode;
    theCurrURL    = resource.url;
    theStatusCode = resource.status;

    if ( resource.status !== 200 ) {
        v('Status Code was: ' + theStatusPrev   + ' for ' + thePrevURL );
        v('Status Code is : ' + theStatusCode   + ' for ' + theCurrURL );
        return;
    }
    // plain success: a single line is enough
    v('VERBOSE status ' + resource.status + ' for ' + resource.url );
};

// Callback for URL changes: compares the new URL with the requested one and
// logs either a single debug line (same URL) or the old/new pair (different).
page.onUrlChanged = function (URLnew) {
    if ( URLnew !== URLarg ) {
      v('DEBUG: old URL: ' + URLarg);
      v('DEBUG: new URL: ' + URLnew);
      return;
    }
    d('DEBUG: old/new URL: ' + URLnew + ' --onUrlChanged()');
};

// Error handler: writes the message, plus one line per trace entry when a
// trace is supplied, to console.error as a single newline-joined string.
phantom.onError = function(msg, trace) {
    var lines = ['PHANTOM ERROR: ' + msg];
    if (trace) {
        lines.push('TRACE:');
        trace.forEach(function(frame) {
            var where = frame.file || frame.sourceURL;
            var suffix = '';
            if (frame.function) {
                suffix = ' (in function ' + frame.function + ')';
            }
            lines.push(' -> ' + where + ': ' + frame.line + suffix);
        });
    }
    console.error(lines.join('\n'));
};

// A resource request timed out: report its error code and error string on
// console.error, then exit with code 1.
page.onResourceTimeout = function(request) {
    var reason = request.errorCode + ' - ' + request.errorString;
    console.error('Request timed out due to ' + reason);
    phantom.exit(1);
};

// Load the requested URL and print the rendered markup with console.log.
//
// Output: body.innerHTML by default; the complete page content when
// --full_page was given or the body is (nearly) empty.
// Exit code: 1 when the page could not be opened, otherwise the last
// recorded status code (theStatusCode).
page.open( URLarg, function (status) {
    // onLoadFinished executes here

    // The callback receives the load result; anything but 'success' means
    // there is no rendered page to print.
    if ( status !== 'success' ) {
      console.error('Unable to load ' + URLarg + ' (status: ' + status + ')');
      phantom.exit(1);
      return;
    }

    var page_content = page.content;
    var body_innerHTML = page.evaluate( function() {
      // document.body may be null; treat that like an empty body
      return ( document.body && document.body.innerHTML ) ? document.body.innerHTML : '(empty)' ;
    });
    var title = page.evaluate(function() { return document.title; });

    // page.render( URLdump); // "screenshot"
    v('VERBOSE: Loading time '+ ( Date.now() - timestamp ) +' msec');
    d('DEBUG: Page title: ' + ((title==='') ? '(none)':title) );
    d('DEBUG: body_innerHTML.length='+ body_innerHTML.length);
    d(' ');

    // The '(empty)' placeholder is 7 characters long, so the "< 9" test also
    // sends it down the full-page branch.
    if ( full_page  || ( ! body_innerHTML ) || body_innerHTML.length < 9 ) {
      console.log( page_content   ); // return all if body is empty
    } else {
      console.log( body_innerHTML );
    }

    // Exit from inside the timer callback. Calling phantom.exit() right after
    // scheduling the timer ended the run before the callback could fire, so
    // the two verbose status lines below were never printed.
    setTimeout(function() {
        v('VERBOSE: status ' + theStatusPrev   + ' for ' + thePrevURL + ' (b)');
        v('VERBOSE: status ' + theStatusCode   + ' for ' + theCurrURL + ' (c)');
        phantom.exit( theStatusCode );
      }, 1333 ) ; // delay in milliseconds
});

var argIs,getArg,d;
var customHeaders={};
//抓取需要JavaScript的网页的“呈现”HTML
//待定:
//添加“-”作为从标准输入读取的魔法文件名
//添加更多类似卷曲的开关--或者让curl这样做并消耗curl的输出?
//为page.render(URLdump)添加开关;//“截图”
var system=require('system');//var args=require('system')。args;
var page=require('webpage')。create();
if(system.args.length==1){
log('用法:curl-phantom.js');
console.log(system.args);
//注意:还可以从本地文件系统读取“页面”
phantom.exit();
};
var URLarg=system.args[1];
var theStatusCode=null;
var theStatusPrev=null;
var thePrevURL='';
var thecurrull='';
var timestamp=Date.now();
var verbose=false;
var debug=true;
var full_page=false;
var header_key='X-Forwarded-For';
风险值标题_val='3.1.20.13';
var requestTimeout=5000;//默认请求超时
argIs=函数(i,名称){
if(system.args[i].indexOf(name)==0){
返回true;
}
返回false;
}
getArg=函数(i){
返回系统.args[i].trim();
}
v=功能(a,b){
verbose&&console.log(a,b)
}
d=功能(a,b){
调试和控制台日志(a、b)
}
对于(变量i=1;i=300&&statuscode<400)){
statusprev=statuscode;
statuscode=resource.status;
thePrevURL=当前URL;
currURL=resource.url;
// }
如果(resource.status==200){
v('VERBOSE status'+resource.status+'表示'+resource.url);//通常不记录标准成功
}否则{
v('状态代码为:'+theStatusPrev+'表示'+thePrevURL');
v('状态代码为:'+theStatusCode+'表示'+theCurrURL');
}
};
page.onUrlChanged=函数(URLnew){//回调函数截获console.log消息
如果(URLnew==URLarg){
d('DEBUG:old/newurl:'+URLnew+'--onUrlChanged()');
}否则{
v('DEBUG:old URL:'+URLarg);
v('DEBUG:newurl:'+URLnew);
}
};
phantom.onError=函数(消息,跟踪){
var msgStack=['PHANTOM ERROR:'+msg];
如果(跟踪){
msgStack.push('TRACE:');
trace.forEach(函数(t){
msgStack.push('->'+(t.file | | t.sourceURL)+':'+t.line+(t.function?'(在函数'+t.function+'):'');
});
}
console.error(msgStack.join('\n'));
};
page.onResourceTimeout=函数(请求){
console.error('请求因'+Request.errorCode+'-'+Request.errorString'而超时);
幻影。出口(1);
}
page.open(URLarg,函数(){
//onLoadFinished在此执行
var page_content=page.content;
var body_innerHTML=page.evaluate(函数(){
返回document.body.innerHTML?document.body.innerHTML:'(空)';
});
var title=page.evaluate(函数(){return document.title;});
//page.render(URLdump);/“屏幕截图”
v('VERBOSE:Loading time'+(Date.now()-timestamp)+'msec');
d('DEBUG:Page title:'+((title=='')'(none)':title));
d('DEBUG:body_innerHTML.length='+body_innerHTML.length);
d(“”);
如果(完整页面| |(!body_innerHTML)| | body_innerHTML.length<9){
console.log(page_content);//如果正文为空,则返回all
}否则{
log(body_innerHTML);
}
setTimeout(函数(){
v('VERBOSE:status'+theStatusPrev+'表示'+thePrevURL+'(b)');
v('VERBOSE:status'+theStatusCode+'表示'+thecurrull+'(c)');
// Forward declarations for the helper functions assigned further down.
// `v` is listed too: it is assigned later without `var`, which would
// otherwise create an implicit global (a ReferenceError in strict mode).
var argIs, getArg, v, d;
// Header name -> value map, filled from the -H command-line options.
var customHeaders = {};

// grab the "rendered" HTML of a JavaScript-requiring web page

// TBD:
// add '-' as magic filename to read from STDIN
// add more curl-like switches? --or just let curl do that and consume the output of curl?
// add a switch for page.render( URLdump); // "screenshot"

var system = require('system'); // var args = require('system').args;
var page = require('webpage').create();

// Only one entry in system.args means no URL was supplied: show the usage
// text together with the raw argument list, then ask PhantomJS to exit.
if (system.args.length === 1) {
    console.log('Usage: curl-phantom.js <http://URL/path/file.ext>');
    console.log(system.args);
    // note: "pages" can also be read from the local filesystem
    phantom.exit();
}

// --- target ---------------------------------------------------------------
var URLarg = system.args[1];   // the URL is always taken from argument 1
var timestamp = Date.now();    // start time, used for the load-time log

// --- status/URL of the two most recently received resources ----------------
var theStatusCode = null;
var theStatusPrev = null;
var theCurrURL = '';
var thePrevURL = '';

// --- output switches --------------------------------------------------------
// NOTE(review): debug starts out true, so the --debug flag cannot change it.
var debug = true;
var verbose = false;
var full_page = false;

// --- request settings -------------------------------------------------------
var requestTimeout = 5000;     // default request timeout
var header_key = 'X-Forwarded-For';
var header_val = '3.1.20.13';

/**
 * Tell whether command-line argument number `i` starts with one of the given
 * option names.
 *
 * The two-parameter form keeps working; any further names are treated as
 * aliases, so argIs(i, '-H', '--header') checks both spellings instead of
 * silently ignoring the second one.
 *
 * @param {number} i     index into system.args
 * @param {string} name  option name to look for; more names may follow
 * @returns {boolean} true when the argument begins with any of the names;
 *                    false otherwise, or when there is no argument at `i`
 */
argIs = function(i, name) {
  var candidate = system.args[i];
  var k;
  if (typeof candidate !== 'string') {
    return false;
  }
  // arguments[0] is the index; everything after it is an option name.
  for (k = 1; k < arguments.length; k++) {
    if (candidate.indexOf(arguments[k]) === 0) {
      return true;
    }
  }
  return false;
};

/**
 * Fetch command-line argument number `i` with surrounding whitespace removed.
 *
 * @param {number} i  index into system.args
 * @returns {string} the trimmed argument
 */
getArg = function(i) {
  var raw = system.args[i];
  return raw.trim();
};

/**
 * Verbose logger: prints through console.log only while `verbose` is truthy.
 *
 * Exactly the arguments received are forwarded, so the common one-argument
 * call no longer hands an explicit `undefined` to console.log as well.
 *
 * @param {*} a    first value to print
 * @param {*} [b]  optional second value to print
 */
v = function(a, b) {
  if (verbose) {
    console.log.apply(console, arguments);
  }
};

/**
 * Debug logger: prints through console.log only while `debug` is truthy.
 *
 * Exactly the arguments received are forwarded, so the common one-argument
 * call no longer hands an explicit `undefined` to console.log as well.
 *
 * @param {*} a    first value to print
 * @param {*} [b]  optional second value to print
 */
d = function(a, b) {
  if (debug) {
    console.log.apply(console, arguments);
  }
};

// Parse the curl-like options that follow the URL.
// system.args[1] is the URL (it is read into URLarg), so options start at
// index 2; starting at 1 reported the URL itself as an "unknown param".
for (var i = 2; i < system.args.length; i++) {
  if (argIs(i, '--debug')) {
    debug = true;
    d('DEBUG: ' + getArg(i));
  }
  else if (argIs(i, '--full_page')) {
    full_page = true;
    d('PAGE: ' + getArg(i));
  }
  // Both spellings are tested with separate calls so this works whether or
  // not argIs() understands more than one option name.
  else if (argIs(i, '-H') || argIs(i, '--header')) {
    if (i + 1 >= system.args.length) {
      console.error('missing value for ' + getArg(i));
      break;
    }
    var headerArg = getArg(++i);
    // Split at the FIRST colon only: values such as URLs or cookies may
    // contain further colons and used to be replaced by an empty string.
    var colonAt = headerArg.indexOf(':');
    var key = (colonAt === -1) ? headerArg : headerArg.slice(0, colonAt).trim();
    var value = (colonAt === -1) ? '' : headerArg.slice(colonAt + 1).trim();
    customHeaders[key] = value;

    d('HEADER:', [key, value]);
  }
  else if (argIs(i, '--verbose')) {
    verbose = true;
    v('VERBOSE: ' + getArg(i));
  }
  else if (argIs(i, '--timeout')) {
    if (i + 1 >= system.args.length) {
      console.error('missing value for ' + getArg(i));
      break;
    }
    // Store a number rather than the raw argument string; keep the default
    // when the value is not numeric.
    var timeoutArg = parseInt(getArg(++i), 10);
    if (isNaN(timeoutArg)) {
      console.error('ignoring non-numeric --timeout value: ' + getArg(i));
    } else {
      requestTimeout = timeoutArg;
    }
    d('REQUEST_TIMEOUT', requestTimeout);
  }
  else {
    console.log('unknown param: ' + getArg(i));
  }
}
// Print the collected headers. This is written with console.log
// unconditionally, independent of the debug/verbose switches.
[
  '################',
  'headers and values',
  JSON.stringify(customHeaders)
].forEach(function (line) {
  console.log(line);
});

// Hand the parsed command-line settings over to the page object.
page.settings.resourceTimeout = requestTimeout;
page.customHeaders = customHeaders;

// NOTE(review): header_key/header_val are only logged here; the assignment
// that would have sent them ({ header_key : header_val }) was commented out.
v('VERBOSE: ' + header_key +': '+ header_val);

// Callback that intercepts console.log messages and relays each one to the
// debug logger.
page.onConsoleMessage = function (msg) {
    var line = 'DEBUG: console.log message="' + msg + '"';
    d(line);
};

// When debugging, report the load result on stderr. The last recorded
// status code is shown if there is one, otherwise the status string passed in.
page.onLoadFinished = function(status) {
  if ( !debug ) {
    return;
  }
  var shown = theStatusCode ? theStatusCode : status;
  system.stderr.write('OnLoadFinished.Status: ' + shown + ' after onLoadFinished(' + status + ')\n');
};

// Track the status code and URL of the latest received resource, keeping the
// previous pair as well, and report them through the verbose logger.
// NOTE(review): a filter limiting this to the requested URL / redirects was
// commented out, so every received resource updates the state.
page.onResourceReceived = function(resource) {
    // shift "current" into "previous", then record the new resource
    thePrevURL    = theCurrURL;
    theStatusPrev = theStatusCode;
    theCurrURL    = resource.url;
    theStatusCode = resource.status;

    if ( resource.status !== 200 ) {
        v('Status Code was: ' + theStatusPrev   + ' for ' + thePrevURL );
        v('Status Code is : ' + theStatusCode   + ' for ' + theCurrURL );
        return;
    }
    // plain success: a single line is enough
    v('VERBOSE status ' + resource.status + ' for ' + resource.url );
};

// Callback for URL changes: compares the new URL with the requested one and
// logs either a single debug line (same URL) or the old/new pair (different).
page.onUrlChanged = function (URLnew) {
    if ( URLnew !== URLarg ) {
      v('DEBUG: old URL: ' + URLarg);
      v('DEBUG: new URL: ' + URLnew);
      return;
    }
    d('DEBUG: old/new URL: ' + URLnew + ' --onUrlChanged()');
};

// Error handler: writes the message, plus one line per trace entry when a
// trace is supplied, to console.error as a single newline-joined string.
phantom.onError = function(msg, trace) {
    var lines = ['PHANTOM ERROR: ' + msg];
    if (trace) {
        lines.push('TRACE:');
        trace.forEach(function(frame) {
            var where = frame.file || frame.sourceURL;
            var suffix = '';
            if (frame.function) {
                suffix = ' (in function ' + frame.function + ')';
            }
            lines.push(' -> ' + where + ': ' + frame.line + suffix);
        });
    }
    console.error(lines.join('\n'));
};

// A resource request timed out: report its error code and error string on
// console.error, then exit with code 1.
page.onResourceTimeout = function(request) {
    var reason = request.errorCode + ' - ' + request.errorString;
    console.error('Request timed out due to ' + reason);
    phantom.exit(1);
};

// Load the requested URL and print the rendered markup with console.log.
//
// Output: body.innerHTML by default; the complete page content when
// --full_page was given or the body is (nearly) empty.
// Exit code: 1 when the page could not be opened, otherwise the last
// recorded status code (theStatusCode).
page.open( URLarg, function (status) {
    // onLoadFinished executes here

    // The callback receives the load result; anything but 'success' means
    // there is no rendered page to print.
    if ( status !== 'success' ) {
      console.error('Unable to load ' + URLarg + ' (status: ' + status + ')');
      phantom.exit(1);
      return;
    }

    var page_content = page.content;
    var body_innerHTML = page.evaluate( function() {
      // document.body may be null; treat that like an empty body
      return ( document.body && document.body.innerHTML ) ? document.body.innerHTML : '(empty)' ;
    });
    var title = page.evaluate(function() { return document.title; });

    // page.render( URLdump); // "screenshot"
    v('VERBOSE: Loading time '+ ( Date.now() - timestamp ) +' msec');
    d('DEBUG: Page title: ' + ((title==='') ? '(none)':title) );
    d('DEBUG: body_innerHTML.length='+ body_innerHTML.length);
    d(' ');

    // The '(empty)' placeholder is 7 characters long, so the "< 9" test also
    // sends it down the full-page branch.
    if ( full_page  || ( ! body_innerHTML ) || body_innerHTML.length < 9 ) {
      console.log( page_content   ); // return all if body is empty
    } else {
      console.log( body_innerHTML );
    }

    // Exit from inside the timer callback. Calling phantom.exit() right after
    // scheduling the timer ended the run before the callback could fire, so
    // the two verbose status lines below were never printed.
    setTimeout(function() {
        v('VERBOSE: status ' + theStatusPrev   + ' for ' + thePrevURL + ' (b)');
        v('VERBOSE: status ' + theStatusCode   + ' for ' + theCurrURL + ' (c)');
        phantom.exit( theStatusCode );
      }, 1333 ) ; // delay in milliseconds
});