PhantomJS在本地主机上返回状态200,但在实时服务器上返回状态403

PhantomJS在本地主机上返回状态200,但在实时服务器上返回状态403,phantomjs,http-status-code-403,akamai,Phantomjs,Http Status Code 403,Akamai,我必须从给定的url中删除HTML文档。在我的本地主机上,Phantom JS脚本返回url。但在实时服务器上,我得到了403禁止状态 scraper.js var system = require('system'); var page = require('webpage').create(); $url = system.args[1]; page.open($url, function(status) { if (status == "success") {

我需要从给定的URL抓取HTML文档。在我的本地主机上,PhantomJS脚本可以正常返回该URL的内容。但在线上服务器上,我得到了403 Forbidden状态

scraper.js

// scraper.js - print the HTML content of the page at the URL given on the
// command line.
// Usage: phantomjs scraper.js <url>
var system = require('system');
var page = require('webpage').create();

// system.args[0] is the script name; the target URL is the first real argument.
// Declared with `var` so it is no longer an implicit global.
var url = system.args[1];

if (!url) {
    // No URL supplied: explain the expected invocation instead of opening `undefined`.
    system.stderr.writeLine('Usage: phantomjs scraper.js <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            // Report the failed load on stderr and signal it through the exit
            // code, rather than exiting silently with code 0.
            system.stderr.writeLine('Failed to load ' + url + ' (status: ' + status + ')');
            phantom.exit(1);
            return;
        }

        // Page loaded: dump its current HTML to stdout.
        console.log(page.content);
        phantom.exit();
    });
}
PhantomJS命令:

phantomjs scraper.js http://www.submarino.com.br/produto/126862765/
这个抓取脚本在其他页面上工作正常。但对域名www.submarino.com.br和www.americanas.com.br不起作用。我知道这与Akamai有关。出错时的响应输出为:

Response (#1, stage "start"): {"body":"","bodySize":300,"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.540Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.541Z","url":"http://www.submarino.com.br/produto/126862765/"}
当它工作正常时,它会返回:

Response (#1, stage "start"): {"body":"","bodySize":30076,"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.388Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.390Z","url":"http://www.submarino.com.br/produto/126862765/"}

我尝试通过hurl.it和其他cURL服务请求此站点,它们都可以访问该URL。有什么我能做的吗?这快把我逼疯了

很可能是地理位置限制或可疑IP段限制。我刚刚尝试打开该URL,同样被拒绝访问;随后通过美国代理访问该页面,就能够打开了。只需使用美国或巴西的代理即可

此外,在抓取时,尽可能接近真实的浏览器行为也很重要,因此我建议您在脚本中添加useragent和视口模拟:

// Identify as a desktop Chrome browser on Windows.
page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
// Render with a 1280x800 viewport.
page.viewportSize = {
    width: 1280,
    height: 800
};
还要确保订阅错误和控制台消息,以了解目标页面中的任何错误和消息

// Echo each console message received from the page, prefixed with "CONSOLE: ",
// so it can be told apart from the script's own output.
page.onConsoleMessage = function forwardConsoleMessage(pageMessage) {
  var line = 'CONSOLE: ' + pageMessage;
  console.log(line);
};

// Log an error reported for the page: the message first, then one
// "file : line" entry per stack frame.
//   msg   - the error message
//   trace - array of stack frames, each read for `file` and `line`;
//           treated as optional because it may be missing or empty
page.onError = function (msg, trace) {
    console.log(msg);

    // Guard against a missing trace so the error handler itself cannot throw.
    if (trace && trace.length) {
        trace.forEach(function (item) {
            console.log(' ', item.file, ':', item.line);
        });
    }
};

您使用了哪种代理服务?我尝试将自己的电脑用作代理,但不起作用…无论您使用什么程序,您的电脑仍然使用Akamai不接受的同一个IP地址。例如,可以尝试在纽约开一台DigitalOcean Droplet(云主机),并建立一条SSH隧道作为代理。实际上,我甚至无法在本地主机上设置代理。cURL和phantomjs在我的本地主机上其实都运行得很好,我只是想在不必先付费购买代理的情况下进行测试。在琢磨该怎么做时,我偶然发现了这个链接。也许DigitalOcean的IP段也被屏蔽了?如果您在使用代理时遇到问题,可以另外提一个问题。至于DigitalOcean或其他提供商是否也被屏蔽,除了实际尝试和检查之外,没有其他办法。这是一张可以给你10美元的优惠券,你可以用它试试DigitalOcean。您可以反复销毁并重新创建Droplet,以获得新的IP地址。