Javascript 通过脚本或其他方式下载文件_Javascript_Jquery_Html_Web Scraping

Javascript 通过脚本或其他方式下载文件

javascript jquery html web-scraping

Javascript 通过脚本或其他方式下载文件,javascript,jquery,html,web-scraping,Javascript,Jquery,Html,Web Scraping,在一个网站，而不是我的，有一个搜索结果 <a href="show?file=191719&token=r1j"> <a href="show?file=191720&token=gh5"> <a href="show?file=191721&token=98j"> ..... <a href="show?file=191733&token=ty0"> ..... 点击其中一个页面后，我进入一个页面，填写表格，

在一个不属于我的网站上，有一个搜索结果页面，其中包含如下链接：

<a href="show?file=191719&token=r1j">
<a href="show?file=191720&token=gh5">
<a href="show?file=191721&token=98j">
.....
<a href="show?file=191733&token=ty0">


.....

点击其中一个链接后，我会进入一个页面并填写表单；进入下载页面后，再点击如下链接：

<a href="download?file=191719&token=r1j">

我必须对 150 个文件手动重复这一过程，这太耗时了。

我想要的是通过使用脚本或其他东西，我直接下载所有文件，方法是在结果页面中获取文件id并将其放入下载链接。

您可以使用excel生成链接，将其保存为txt文件，并使用wget和-i参数下载

使用此javascript片段，其中

http://www.that-website.com/

是该网站的 URL。如果文件太多，不要一次下载所有文件，而应通过指定起始和结束文件编号，每次下载几十个文件。请注意，浏览器的弹出窗口拦截器会阻止此操作，因此您需要在拦截器设置中允许该网页弹出窗口。

JS:

// Open the download URL of every file id in the inclusive range
// [start, finish], one browser window per file.
var start = 191719;
var finish = 191729;
var fileNumber = start;
while (fileNumber <= finish) {
    window.open("http://www.that-website.com/download?file=" + fileNumber);
    fileNumber += 1;
}

等等，几十个

更新2: 请参见本文中的示例


文件
// Once the DOM is ready, convert every "show" link on the page into the
// matching "download" URL and open it in a new window.
// NOTE(review): the substrings passed to replace() are reconstructed from the
// link formats shown in the question ("show?file=..." -> "download?file=...");
// confirm against the real page.
$(document).ready(function () {
    $('a').each(function () {
        var showLink = $(this).attr('href');
        // Anchors without an href yield undefined; skip them instead of throwing.
        if (!showLink) return;
        var downloadLink = showLink.replace("show?file", "download?file");
        window.open("http://www.example.com/" + downloadLink);
    });
});

将上述代码保存为您计算机上的一个 HTML 页面，从该网站的搜索结果页面复制若干原始链接，例如：

<a href="show?file=191719&token=r1j">

粘贴到您的本地页面中并运行它，但强烈建议您每次只粘贴 10–30 个链接。

您可以使用 XMLHttpRequest 以 blob 形式并行下载文件，然后使用

<a download>

启动下载行为。不过，这同样会受到同源策略（same-origin policy）的限制。

总体思路是

// fetch: request the target file and receive it as a Blob
var xhr = new XMLHttpRequest();
xhr.addEventListener('load', function () {
    var uri = URL.createObjectURL(this.response); // generate URI to access Blob
    // write, see below (the "write" steps belong here, where `uri` is in scope)
});
xhr.open('GET', target_file_href);
xhr.responseType = 'blob'; // state we want the target as a blob/file
xhr.send(); // send the request
// ---------------

// write: hand the Blob URI to the browser as a file download
var a = document.createElement('a');
a.href = uri;
// setAttribute requires both a name and a value; an empty value still makes
// this a download link rather than a page change.
a.setAttribute('download', '');
document.body.appendChild(a);
a.click();
// cleanup a, uri
document.body.removeChild(a);
URL.revokeObjectURL(uri);

这是我在ES5中编写的一个并行文件下载程序，它限制了并发下载的数量

/**
 * Queue-based downloader: fetches URLs as Blobs through XMLHttpRequest and
 * hands each response to the browser as a file download, keeping at most
 * `max_parallel` requests in flight at once.
 *
 * @constructor
 * @param {number} [max_parallel] Maximum number of concurrent requests;
 *     any falsy value falls back to 5.
 * @param {boolean} [retry_on_error] When truthy, a URL whose request fires
 *     an `error` event is put back at the front of the queue. The number of
 *     retries is not limited, so a permanently failing URL retries forever.
 */
function ParallelDownloader(max_parallel, retry_on_error) {
    this.links = [];   // URLs waiting to be requested
    this.current = 0;  // number of requests currently in flight
    this.max_parallel = max_parallel || 5;
    this.retry_on_error = !!retry_on_error;
}
// The prototype does not inherit from Object.prototype.
ParallelDownloader.prototype = Object.create(null);

/**
 * Queue one URL or a list of URLs and start downloading if a slot is free.
 *
 * @param {string|Array} url A single URL, or an array (or array-like object
 *     exposing `splice` and `length`) of URLs.
 */
ParallelDownloader.prototype.add = function (url) {
    // The `in` operator throws a TypeError on primitives such as strings, so
    // only apply the duck-typing check to real objects.
    var isList = Array.isArray(url) ||
        (url !== null && typeof url === 'object' &&
            'splice' in url && 'length' in url);
    if (isList)
        this.links.push.apply(this.links, url);
    else
        this.links.push(url);
    this.downloadNext();
};

/**
 * Start queued downloads until either the queue is empty or `max_parallel`
 * requests are in flight. Called automatically by `add` and after every
 * finished or failed request.
 */
ParallelDownloader.prototype.downloadNext = (function () {
    // Extract the filename parameter of a Content-Disposition header value,
    // accepting both quoted and unquoted forms. Returns null when absent.
    function filenameFromContentDisposition(header) {
        var m;
        if (!header) return null;
        m = header.match(/;\s*filename\s*=\s*(?:"([^"]*)"|([^;]+))/i);
        if (!m) return null;
        return m[1] !== undefined ? m[1] : m[2].trim();
    }

    // Take the first slash-preceded path segment that runs up to a "?" or the
    // end of the URL. Returns null when the URL has no such segment.
    function filenameFromUrl(url) {
        var m = String(url).match(/\/([^/]+?(?=\?|$))/);
        return m ? m[1] : null;
    }

    // `load` handler; `this` is the XMLHttpRequest that finished.
    function load() {
        var downloader = this.__parallelDownloader,
            a,
            uri,
            filename;
        try {
            a = document.createElement('a');
            uri = URL.createObjectURL(this.response);
            filename = filenameFromContentDisposition(
                this.getResponseHeader('Content-Disposition'));
            if (filename === null) filename = filenameFromUrl(this.__url);
            // setAttribute requires a value; an empty one lets the browser
            // choose the file name.
            a.setAttribute('download', filename === null ? '' : filename);
            a.setAttribute('href', uri);
            document.body.appendChild(a);
            a.click();
            document.body.removeChild(a);
            URL.revokeObjectURL(uri);
        } finally {
            // Always free the slot, even if saving the file failed, so the
            // queue cannot stall.
            --downloader.current;
            downloader.downloadNext();
        }
    }

    // `error` handler; `this` is the XMLHttpRequest that failed.
    function error() {
        var downloader = this.__parallelDownloader;
        --downloader.current;
        if (downloader.retry_on_error) {
            console.warn('Will retry', this.__url);
            // Re-queue on the pending list (the downloader itself has no
            // unshift method).
            downloader.links.unshift(this.__url);
        }
        downloader.downloadNext();
    }

    return function () {
        var url, xhr;
        while (this.current < this.max_parallel && this.links.length > 0) {
            url = this.links.shift();
            ++this.current;
            xhr = new XMLHttpRequest();
            // Stash context on the request so the shared handlers can find it.
            xhr.__parallelDownloader = this;
            xhr.__url = url;
            xhr.addEventListener('load', load);
            xhr.addEventListener('error', error);
            xhr.open('GET', url);
            xhr.responseType = 'blob';
            xhr.send();
        }
    };
}());

要使用它，你可以这样做

// Create a downloader that keeps at most 10 requests in flight at once.
var pd = new ParallelDownloader(10);

// Several URLs can be queued in a single call by passing an array...
pd.add(['/path1.txt', '/path2.pub', '/path3.pdf']);

// ...or queued one at a time by passing a single URL.
pd.add('/path4.txt');
pd.add('/path5.txt');
// etc

一旦添加了链接并且有空闲的下载槽位，下载就会立即开始。（如果您启用了

retry_on_error

请注意我没有限制重试次数，因此可能会陷入无限循环。）

如何使用excel生成链接？是的，这很好，但我忘了说查询中还有其他参数查看我编辑的问题。。如果ID没有排序，我想首先用脚本获取ID和另一个参数，然后将它们取下来。由于令牌通常是随机的，在这种情况下它们也不能用于循环，因此请键入10行

window.open（）

每行带有其文件号和特殊的随机令牌，我将更新答案以显示一个示例Yes，但没有在html代码中搜索ID和令牌的方法？因为手动操作150次，我不会从你的解决方案中赢得任何东西。你成功逃脱表单填充150次：），但是你可以保存页面，并获取每个链接的文件和令牌值，但这取决于网页的DOM结构，可能需要一些技巧来添加一个类名，例如，用jquery来提取所有的类名，所以保存页面并在该页面中显示这些链接的HTML行我在我的问题中显示了HTML代码中有几行

链接。页面是什么？我会为你准备一个例子you@Andr埃斯佩雷斯·阿尔贝拉。你必须从这个链接开始搜索，因为这是一个帖子链接，你能解释一下吗？请提供更多详细信息，我不知道那些在哪里，谢谢你，但是我有一个

post

url而不是

get

，我首先得到了html代码，所以用

post

是不可能的。不幸的是。

/**
 * Queue-based downloader: fetches URLs as Blobs through XMLHttpRequest and
 * hands each response to the browser as a file download, keeping at most
 * `max_parallel` requests in flight at once.
 *
 * @constructor
 * @param {number} [max_parallel] Maximum number of concurrent requests;
 *     any falsy value falls back to 5.
 * @param {boolean} [retry_on_error] When truthy, a URL whose request fires
 *     an `error` event is put back at the front of the queue. The number of
 *     retries is not limited, so a permanently failing URL retries forever.
 */
function ParallelDownloader(max_parallel, retry_on_error) {
    this.links = [];   // URLs waiting to be requested
    this.current = 0;  // number of requests currently in flight
    this.max_parallel = max_parallel || 5;
    this.retry_on_error = !!retry_on_error;
}
// The prototype does not inherit from Object.prototype.
ParallelDownloader.prototype = Object.create(null);

/**
 * Queue one URL or a list of URLs and start downloading if a slot is free.
 *
 * @param {string|Array} url A single URL, or an array (or array-like object
 *     exposing `splice` and `length`) of URLs.
 */
ParallelDownloader.prototype.add = function (url) {
    // The `in` operator throws a TypeError on primitives such as strings, so
    // only apply the duck-typing check to real objects.
    var isList = Array.isArray(url) ||
        (url !== null && typeof url === 'object' &&
            'splice' in url && 'length' in url);
    if (isList)
        this.links.push.apply(this.links, url);
    else
        this.links.push(url);
    this.downloadNext();
};

/**
 * Start queued downloads until either the queue is empty or `max_parallel`
 * requests are in flight. Called automatically by `add` and after every
 * finished or failed request.
 */
ParallelDownloader.prototype.downloadNext = (function () {
    // Extract the filename parameter of a Content-Disposition header value,
    // accepting both quoted and unquoted forms. Returns null when absent.
    function filenameFromContentDisposition(header) {
        var m;
        if (!header) return null;
        m = header.match(/;\s*filename\s*=\s*(?:"([^"]*)"|([^;]+))/i);
        if (!m) return null;
        return m[1] !== undefined ? m[1] : m[2].trim();
    }

    // Take the first slash-preceded path segment that runs up to a "?" or the
    // end of the URL. Returns null when the URL has no such segment.
    function filenameFromUrl(url) {
        var m = String(url).match(/\/([^/]+?(?=\?|$))/);
        return m ? m[1] : null;
    }

    // `load` handler; `this` is the XMLHttpRequest that finished.
    function load() {
        var downloader = this.__parallelDownloader,
            a,
            uri,
            filename;
        try {
            a = document.createElement('a');
            uri = URL.createObjectURL(this.response);
            filename = filenameFromContentDisposition(
                this.getResponseHeader('Content-Disposition'));
            if (filename === null) filename = filenameFromUrl(this.__url);
            // setAttribute requires a value; an empty one lets the browser
            // choose the file name.
            a.setAttribute('download', filename === null ? '' : filename);
            a.setAttribute('href', uri);
            document.body.appendChild(a);
            a.click();
            document.body.removeChild(a);
            URL.revokeObjectURL(uri);
        } finally {
            // Always free the slot, even if saving the file failed, so the
            // queue cannot stall.
            --downloader.current;
            downloader.downloadNext();
        }
    }

    // `error` handler; `this` is the XMLHttpRequest that failed.
    function error() {
        var downloader = this.__parallelDownloader;
        --downloader.current;
        if (downloader.retry_on_error) {
            console.warn('Will retry', this.__url);
            // Re-queue on the pending list (the downloader itself has no
            // unshift method).
            downloader.links.unshift(this.__url);
        }
        downloader.downloadNext();
    }

    return function () {
        var url, xhr;
        while (this.current < this.max_parallel && this.links.length > 0) {
            url = this.links.shift();
            ++this.current;
            xhr = new XMLHttpRequest();
            // Stash context on the request so the shared handlers can find it.
            xhr.__parallelDownloader = this;
            xhr.__url = url;
            xhr.addEventListener('load', load);
            xhr.addEventListener('error', error);
            xhr.open('GET', url);
            xhr.responseType = 'blob';
            xhr.send();
        }
    };
}());

// Create a downloader that keeps at most 10 requests in flight at once.
var pd = new ParallelDownloader(10);

// Several URLs can be queued in a single call by passing an array...
pd.add(['/path1.txt', '/path2.pub', '/path3.pdf']);

// ...or queued one at a time by passing a single URL.
pd.add('/path4.txt');
pd.add('/path5.txt');
// etc