PHP: copying images from a live server to local
I have about 600k image URLs in different tables, and I am downloading all the images with the code below. It works fine. (I know FTP is the best option, but somehow I can't use it.)
$queryRes = mysql_query("select url from tablName limit 50000"); //every time I am using limit
while ($row = mysql_fetch_object($queryRes)) {
    $info = pathinfo($row->url);
    $fileName = $info['filename'];
    $fileExtension = $info['extension'];
    try {
        copy("http:".$row->url, "img/$fileName"."_".$row->id.".".$fileExtension);
    } catch (Exception $e) {
        echo "<br/>\n unable to copy '$fileName'. Error: $e";
    }
}
The problem is that after some time the script starts failing with a 503 error.
I hope I have explained it well.

I wouldn't use copy myself; I'd use file_get_contents, which works fine with remote servers.
Edit:
file_get_contents also returns false on failure, so:
if( false === file_get_contents(...) )
trigger_error(...);
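A minimal sketch of that check, reusing the variables from the question's loop ($row, $fileName, and $fileExtension are assumed from there):

//file_get_contents returns false on failure, so the error can actually be caught
$data = file_get_contents('http:' . $row->url);
if (false === $data)
{
    trigger_error("unable to fetch {$row->url}", E_USER_WARNING);
}
else
{
    file_put_contents("img/{$fileName}_{$row->id}.{$fileExtension}", $data);
}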
503 is a fairly generic error, and in this case it probably means something timed out. That could be your web server, a proxy somewhere along the way, or even PHP. You need to identify which component is timing out. If it is PHP, you can use set_time_limit.

Another option may be to break the work up so that each request only processes one file (or a small batch), then redirect back to the same script to continue processing the rest. You would have to somehow maintain a list between invocations of which files have already been processed, or process them in order of database id and pass the last-used id to the script when redirecting.
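A rough sketch of that redirect idea (the last_id parameter, batch size, and column names are only illustrative):

//hypothetical: process one small batch per request, then redirect to the next batch
$lastId = isset($_GET['last_id']) ? (int) $_GET['last_id'] : 0;
$queryRes = mysql_query("select id, url from tablName where id > $lastId order by id limit 50");
$processedAny = false;
while ($row = mysql_fetch_object($queryRes))
{
    // ... download the image as before ...
    $lastId = $row->id;
    $processedAny = true;
}
if ($processedAny)
{
    //hand the remaining rows to a fresh request so no single request times out
    header('Location: ' . $_SERVER['PHP_SELF'] . '?last_id=' . $lastId);
    exit;
}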
A flag-based batching version:
//only fetch 50 urls each time
$queryRes = mysql_query("select id, url from tablName where flag=1 limit 50");
//just prefer absolute path; note that '.' (not '+') is PHP's concatenation operator
$imgDirPath = dirname(__FILE__) . '/';
while ($row = mysql_fetch_object($queryRes))
{
    $info = pathinfo($row->url);
    $fileName = $info['filename'];
    $fileExtension = $info['extension'];
    //url in the table is like //www.example.com???
    $result = fetchUrl("http:" . $row->url,
        $imgDirPath . "img/$fileName" . "_" . $row->id . "." . $fileExtension);
    if ($result !== true)
    {
        echo "<br/>\n unable to copy '$fileName'. Error:$result";
        //update flag to 3 (failed); implement set_row_flag() yourself
        set_row_flag(3, $row->id);
    }
    else
    {
        //update flag to 2 (fetched)
        set_row_flag(2, $row->id);
    }
}
function fetchUrl($url, $saveto)
{
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 3);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 7);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    $raw = curl_exec($ch);
    $error = false;
    if (curl_errno($ch))
    {
        //transport-level failure (DNS, timeout, connection refused, ...)
        $error = curl_error($ch);
    }
    else
    {
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($httpCode != 200)
        {
            $error = 'HTTP code not 200: ' . $httpCode;
        }
    }
    curl_close($ch);
    if ($error)
    {
        return $error;
    }
    file_put_contents($saveto, $raw);
    return true;
}
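The snippet above leaves set_row_flag() to the reader ("implement it yourself"); a minimal version, assuming a numeric flag column on tablName, could be:

//hypothetical helper: mark a row as fetched (2) or failed (3)
function set_row_flag($flag, $id)
{
    mysql_query("update tablName set flag=" . (int) $flag . " where id=" . (int) $id);
}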
First... copy does not throw any exceptions... so you are not doing any error handling... that's why your script just keeps running.

Second... you should use file_get_contents or, better, cURL.

For example, you can try this function (shown further below)... (I know... it opens and closes cURL every time... it's just an example I found here)

Or even... try curl_multi_exec and load the files in parallel, that will be much faster.

See here:
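The link from that answer was not preserved here, but a rough sketch of the curl_multi_exec idea could look like this (fetchParallel and its parameters are made-up names, for illustration only):

//hypothetical: fetch a batch of image URLs in parallel
function fetchParallel(array $urls, $saveDir)
{
    $mh = curl_multi_init();
    $handles = array();
    foreach ($urls as $i => $url)
    {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        curl_multi_add_handle($mh, $ch);
        $handles[$i] = $ch;
    }
    //drive all transfers until none are still running
    $running = null;
    do
    {
        curl_multi_exec($mh, $running);
        curl_multi_select($mh); //wait for activity instead of busy-looping
    } while ($running > 0);
    //write the successful downloads to disk
    foreach ($handles as $i => $ch)
    {
        if (curl_errno($ch) === 0 && curl_getinfo($ch, CURLINFO_HTTP_CODE) == 200)
        {
            file_put_contents($saveDir . '/' . basename($urls[$i]), curl_multi_getcontent($ch));
        }
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);
}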
Edit:
To keep track of which files failed to download, you need to do the following:
$queryRes = mysql_query("select url from tablName limit 50000"); //every time I am using limit
while ($row = mysql_fetch_object($queryRes)) {
    $info = pathinfo($row->url);
    $fileName = $info['filename'];
    $fileExtension = $info['extension'];
    if (!@copy("http:".$row->url, "img/$fileName"."_".$row->id.".".$fileExtension)) {
        $errors = error_get_last();
        echo "COPY ERROR: ".$errors['type'];
        echo "<br />\n".$errors['message'];
        //you can add whatever code you want here... output to console, log to a file, or call exit() to stop downloading...
    }
}
More info: mysql_fetch_object
Here is the function mentioned above:
function getimg($url) {
    $headers[] = 'Accept: image/gif, image/x-bitmap, image/jpeg, image/pjpeg';
    $headers[] = 'Connection: Keep-Alive';
    $headers[] = 'Content-type: application/x-www-form-urlencoded;charset=UTF-8';
    $user_agent = 'php';
    $process = curl_init($url);
    curl_setopt($process, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($process, CURLOPT_HEADER, 0);
    curl_setopt($process, CURLOPT_USERAGENT, $user_agent); //was $useragent, an undefined variable
    curl_setopt($process, CURLOPT_TIMEOUT, 30);
    curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($process, CURLOPT_FOLLOWLOCATION, 1);
    $return = curl_exec($process);
    curl_close($process);
    return $return;
}
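Usage could look like this (the URL and file name are placeholders):

//hypothetical usage of getimg(): fetch one image and write it to disk
$img = getimg('http://www.example.com/images/photo.jpg');
if ($img !== false)
{
    file_put_contents('img/photo.jpg', $img);
}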
Another answer suggests simply checking copy()'s return value and flushing progress output:
$queryRes = mysql_query("SELECT id, url FROM tablName ORDER BY id");
while (($row = mysql_fetch_object($queryRes)) !== false) {
    $info = pathinfo($row->url);
    $fn = $info['filename'];
    if (copy(
        'http:' . $row->url,
        "img/{$fn}_{$row->id}.{$info['extension']}"
    )) {
        echo "success: $fn\n";
    } else {
        echo "fail: $fn\n";
    }
    flush();
}
A different approach: keep the URLs in a status table and run a small self-reloading scraper. First the table:
CREATE TABLE IF NOT EXISTS `images` (
  `id` int(60) NOT NULL AUTO_INCREMENT,
  `link` varchar(1024) NOT NULL,
  `status` enum('not fetched','fetched') NOT NULL DEFAULT 'not fetched',
  `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
);
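To seed this table from the question's existing URLs, a one-off statement like this could be run (assuming the question's tablName and the $DBH PDO handle set up in the script below):

//hypothetical one-off seed: copy the existing image URLs into the images table
$DBH->exec("INSERT INTO images (link) SELECT url FROM tablName");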
<?php
// how many images to download in one go?
$limit = 100;
/* if set to true, the scraper reloads itself. Good for running on localhost
   without cron job support. Just keep the browser open and the script runs
   by itself (javascript is needed) */
$reload = false;
// to prevent php timeout
set_time_limit(0);
// db connection (you need pdo enabled)
try {
    $host = 'localhost';
    $dbname = 'mydbname';
    $user = 'root';
    $pass = '';
    $DBH = new PDO("mysql:host=$host;dbname=$dbname", $user, $pass);
}
catch (PDOException $e) {
    echo $e->getMessage();
}
$DBH->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
// get n number of images that are not fetched
$query = $DBH->prepare("SELECT * FROM images WHERE status = 'not fetched' LIMIT {$limit}");
$query->execute();
$files = $query->fetchAll();
// if no result, don't run
if (empty($files)) {
    echo 'All files have been fetched!!!';
    die();
}
// where to save the images?
$savepath = dirname(__FILE__) . '/scrapped/';
// fetch 'em!
foreach ($files as $file) {
    // get_url_content uses curl. Function defined later-on
    $content = get_url_content($file['link']);
    // get the file name from the url. You can use a random name too.
    $url_parts_array = explode('/', $file['link']);
    /* assuming the image url is http://abc.com/images/myimage.png, exploding
       the string by / leaves the filename as the last element of the array */
    $filename = $url_parts_array[count($url_parts_array) - 1];
    // save fetched image
    file_put_contents($savepath . $filename, $content);
    // did the image save? (check the path we actually wrote, not the raw link)
    if (file_exists($savepath . $filename)) {
        // yes? Okay, let's save the status
        $query = $DBH->prepare("update images set status = 'fetched' WHERE id = " . $file['id']);
        // output the name of the file that just got downloaded
        echo $file['link'];
        echo '<br/>';
        $query->execute();
    }
}
// function definition get_url_content()
function get_url_content($url) {
    // ummm let's make our bot look like a human
    $agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322)';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_VERBOSE, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_URL, $url);
    return curl_exec($ch);
}
// reload enabled? Reload!
if ($reload)
    echo '<script>location.reload(true);</script>';