Perl解析csv文件并迭代curl_Csv_Perl_Parsing_Curl

Perl解析csv文件并迭代curl

csv perl parsing curl

Perl解析csv文件并迭代curl,csv,perl,parsing,curl,Csv,Perl,Parsing,Curl,我试图解析一个csv文件，并使用curl对其进行迭代。以下是我的数据集：我遵循了这个Stackoverflow问题：基本上为我的数据集创建哈希。这是我的密码： #!/usr/bin/perl use strict; use warnings; use Text::CSV_XS; use IO::File; use WWW::Curl::Easy; my $url = "https://elibrary.judiciary.gov.ph/thebookshelf/docmont

我试图解析一个csv文件，并使用curl对其进行迭代。以下是我的数据集：

我参考了这个 Stack Overflow 问题，基本上是为我的数据集创建一个哈希。这是我的代码：

#!/usr/bin/perl
use strict;
use warnings;

use Text::CSV_XS;
use IO::File;

use WWW::Curl::Easy;

# Harvest script: for every row of the CSV, fetch the month index page built
# from the row's remaining column(s), locate the link that belongs to the row
# key on that page, fetch the linked document, extract the sourcing text from
# it and append "text|{key}" to the output file.

my $url = "https://elibrary.judiciary.gov.ph/thebookshelf/docmonth/";
#my $filestoprocess = 'list_acts.csv';

# Browser-like User-Agent sent with every request.
my $user_agent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20140319 Firefox/24.0 Iceweasel/24.4.0";

# File that harvested records are appended to.
my $output_file = 'ogsourcingharvested.txt';

# Usage example:
my $hash_ref = csv_file_hashref('toharvest_og_sourcing.csv');

# One handle serves every transfer; only the URL and the write target change
# per request (see _fetch_page).
my $curl = WWW::Curl::Easy->new;
$curl->setopt(CURLOPT_HEADER, 1);    # response headers are included in the body
$curl->setopt(CURLOPT_USERAGENT, $user_agent);
$curl->setopt(CURLOPT_FOLLOWLOCATION, 1);
# NOTE(review): peer verification is disabled, so the server certificate is
# not checked. Re-enable (set to 1) once the CA bundle on this host is known
# to validate the site.
$curl->setopt(CURLOPT_SSL_VERIFYPEER, 0);

KEY:
foreach my $key (sort keys %{$hash_ref}) {

    # Remaining CSV columns (space-joined) are appended to the base URL.
    my $urlcomplete = $url . "@{ $hash_ref->{$key} }";

    my $curledurldate = _fetch_page($curl, $urlcomplete);
    next KEY if !defined $curledurldate;

    # Capture the link that belongs to THIS key:
    #  - [^']* keeps the capture inside a single href attribute;
    #  - (?:(?!a href=).)*? may not cross another anchor, so the capture is
    #    the nearest link before the key instead of the first on the page;
    #  - \Q...\E matches the key literally ("." in "No." is not a wildcard);
    #  - (?!\d) stops "Act No. 2" from matching "Act No. 21".
    my ($issuancelink) = $curledurldate
        =~ m{a href='(https[^']*)'>(?:(?!a href=).)*?<STRONG>\Q$key\E(?!\d)}s;

    if (!defined $issuancelink) {
        warn "No link found for '$key' in $urlcomplete\n";
        next KEY;
    }

    my $curledsource = _fetch_page($curl, $issuancelink);
    next KEY if !defined $curledsource;

    my ($ogsourcing) = $curledsource =~ /<br>\s+(\w+.*?)\s+?<CENTER>.*?H2/s;
    if (!defined $ogsourcing) {
        # Still record the key, with an empty text field, so the gap is visible.
        warn "No sourcing text found for '$key' in $issuancelink\n";
        $ogsourcing = '';
    }

    _append_record($output_file, $ogsourcing, $key);
}

# _fetch_page($curl, $page_url)
# Performs a GET of $page_url on the given WWW::Curl::Easy handle.
# Returns the response (headers + body, because CURLOPT_HEADER is set) as a
# string, or nothing (undef in scalar context) after printing a diagnostic
# when the transfer fails or the server answers with an HTTP error status.
sub _fetch_page {
    my ($handle, $page_url) = @_;

    # Fresh buffer per request; WWW::Curl appends the response to it.
    my $response_body;
    $handle->setopt(CURLOPT_URL, $page_url);
    $handle->setopt(CURLOPT_WRITEDATA, \$response_body);

    # Starts the actual request
    my $retcode = $handle->perform;
    if ($retcode != 0) {
        # Error code, type of error, error message
        warn "An error happened: $retcode " . $handle->strerror($retcode) . " " . $handle->errbuf . "\n";
        return;
    }

    my $response_code = $handle->getinfo(CURLINFO_HTTP_CODE);
    if ($response_code >= 400) {
        warn "HTTP $response_code returned for $page_url\n";
        return;
    }

    return defined $response_body ? $response_body : '';
}

# _append_record($filename, $text, $key)
# Appends one "text|{key}" line to $filename. The file is opened and closed
# per record so that every record reaches disk even if a later request dies.
# Dies when the file cannot be opened or the write cannot be completed.
sub _append_record {
    my ($filename, $text, $key) = @_;

    open(my $fh, '>>', $filename) or die("Could not open file. $!");
    # The literal braces around the key are part of the existing output format.
    print {$fh} $text . "|" . "{$key}\n";
    close($fh) or die("Could not close file. $!");

    return;
}

# Implementation:
# csv_file_hashref($filename)
# Reads the CSV file $filename and returns a reference to a hash that maps the
# first column of each row to an array reference holding the remaining columns
# of that row.
#  - Rows whose first column is missing or empty (e.g. blank lines) are skipped.
#  - Hash keys are unique: when the first column repeats, the last row wins
#    (as before), and a warning names the key that was overwritten.
#  - Dies when the file cannot be opened; warns when parsing stops before the
#    end of the file, because the returned hash is then incomplete.
sub csv_file_hashref {
   my ($filename) = @_;

   my $csv_fh = IO::File->new($filename, 'r')
      or die "Could not open CSV file '$filename': $!\n";

   # binary => 1 lets fields contain non-ASCII characters and embedded newlines.
   my $csv = Text::CSV_XS->new({ binary => 1 });

   my %output_hash;

   while (my $colref = $csv->getline($csv_fh))
   {
      my $key = shift @{$colref};
      next if !defined $key || $key eq '';

      warn "Duplicate key '$key' in '$filename': earlier row is overwritten\n"
         if exists $output_hash{$key};

      $output_hash{$key} = $colref;
   }

   # getline returns false both at end of file and on a parse error; only the
   # former is a clean finish.
   if (!$csv->eof) {
      warn "CSV parse error in '$filename': " . $csv->error_diag . "\n";
   }

   $csv_fh->close;

   return \%output_hash;
}

#/usr/bin/perl
严格使用；
使用警告；
使用Text:：csvxs；
使用IO：：文件；
使用WWW：：Curl：：Easy；
我的$url=”https://elibrary.judiciary.gov.ph/thebookshelf/docmonth/";
#my$filestoprocess='list_acts.csv'；
#用法示例：
我的$hash_ref=csv_file_hashref（'toharvest_og_sourcing.csv'）；
foreach my$key（排序键%{$hash\u ref}）{
my$urlcomplete=“$url”。@{$hash_ref->{$key}”；
#开始卷曲
my$user_agent=“Mozilla/5.0（X11；Linux i686；rv:24.0）Gecko/20140319 Firefox/24.0 Iceweasel/24.4.0”；
my$curl=WWW:：curl:：Easy->new；
$curl->setopt（CURLOPT_头，1）；
$curl->setopt（CURLOPT\u USERAGENT，$user\u agent）；
$curl->setopt（CURLOPT_FOLLOWLOCATION，1）；
#$curl->setopt（CURLOPT\u SSL\u VERIFYPEER，1L）；
#$curl->curl\u easy\u setopt（curl，CURLOPT\u SSL\u VERIFYPEER，1L）；
$curl->setopt（CURLOPT\u SSL\u VERIFYPEER，0）；
$curl->setopt（CURLOPT_URL，$urlcomplete）；
#这里可以使用文件句柄、标量引用或typeglob引用。
我的身体；
$curl->setopt（CURLOPT\u WRITEDATA，\$response\u body）；
#启动实际请求
my$retcode=$curl->perform；
#看看结果。。。
如果（$retcode==0）{
我的$response\u code=$curl->getinfo（CURLINFO\u HTTP\u code）；
my$curledurldate=$response\u body；
我们的（$issuancelink）=$curledurldate=~/a href='（https.*？）>.*？$key/s；
#打印“$issuancelink\n”；
如果（定义为$issuancelink）{
my$user_agent=“Mozilla/5.0（X11；Linux i686；rv:24.0）Gecko/20140319 Firefox/24.0 Iceweasel/24.4.0”；
#my$curl=WWW:：curl:：Easy->new；
$curl->setopt（CURLOPT_头，1）；
$curl->setopt（CURLOPT\u USERAGENT，$user\u agent）；
$curl->setopt（CURLOPT_FOLLOWLOCATION，1）；
#$curl->setopt（CURLOPT\u SSL\u VERIFYPEER，1L）；
#$curl->curl\u easy\u setopt（curl，CURLOPT\u SSL\u VERIFYPEER，1L）；
$curl->setopt（CURLOPT\u SSL\u VERIFYPEER，0）；
$curl->setopt（CURLOPT_URL，$issuancelink）；
#这里可以使用文件句柄、标量引用或typeglob引用。
我的身体；
$curl->setopt（CURLOPT\u WRITEDATA，\$response\u body）；
#启动实际请求
my$retcode=$curl->perform；
#看看结果。。。
如果（$retcode==0）{
#打印（“传输正常”\n）；
我的$response\u code=$curl->getinfo（CURLINFO\u HTTP\u code）；
my$curledsource=$response\u body；
我们的（$ogsourcing）=$curledsource=~/
\s+（\w+.*？\s+.*H2/s；
我的$filename='ogsourcingharved.txt'；
打开（FH，“>>”，$filename）或死亡（“无法打开文件。$！”；
#打印“错误处理”。$fh。“$\n”；
打印FH$ogsourcing.|“{$key}\n”；
关闭（FH）；
}
否则{
#错误代码、错误类型、错误消息
打印（“发生错误：$retcode”。$curl->strerror（$retcode）。”。$curl->errbuf。“\n”）；
}
}否则{
#错误代码、错误类型、错误消息
打印（“发生错误：$retcode”。$curl->strerror（$retcode）。”。$curl->errbuf。“\n”）；
}
}
}
#实施：
子csv\u文件\u hashref{
我的（$filename）=@；
我的$csv_fh=IO:：File->new（$filename，'r'）；
我的$csv=Text:：csv_XS->new（）；
我的%output\u散列；
而（my$colref=$csv->getline（$csv\u fh））
{
$output_hash{shift@{$colref}}=$colref；
}
返回\%output\u散列；
}

基本上，代码遍历第二列，将其追加到 URL 的末尾，然后用 curl 请求该 URL。接着在抓取到的页面内容中搜索特定内容：

our ($issuancelink) = $curledurldate =~ /a href='(https.*?)'>.*?<STRONG>$key/s;

our（$issuancelink）=$curledurldate=~/a href='（https.*？）>.*？$key/s；
当搜索匹配到该链接时，它会被存入变量 $issuancelink，随后再用 curl 请求这个链接。接着在抓取到的内容中搜索特定文本，捕获后保存到文本文件中。如果第二列（本例中为9月/1900/28日、10月/1900/28日）不重复，我的代码运行正常。但如果它重复出现，问题就来了：被捕获的似乎总是第一次迭代的结果。在我的例子中，第3号法案（Act 3）的链接与第2号法案（Act 2）来自同一个原始 URL（），结果捕获到的却是第2号法案的链接。提前谢谢！
但是，如果第二列（本例中为9月/1900/28日、10月/1900/28日）不重复，则我的代码是好的
在哈希中存储值时，哈希的键是唯一的。这意味着，如果出现相同的键名，后存入的值会覆盖先前的值。
这部分代码：

while(my $colref = $csv->getline ($csv_fh)) { $output_hash{shift @{$colref}} = $colref; }
似乎就是问题所在。您可以把值保存在数组中，而不是标量中（在本例中，即保存在数组引用中）。
我会这样做：

while (my $colref = $csv->getline($csv_fh)) {
    my ($key, $value) = @$colref;
    push @{ $output_hash{$key} }, $value;   # store values in array
}
这样做的另一个好处是值被复制了一份。在您的代码中，被复制的只是数组引用。变量 `my $colref` 的作用域很小，因此这里不会出问题；但一般来说，复制这些值本身可以避免此类问题。
要访问数组值，可能需要循环每个哈希键。差不多

for my $key (sort keys %$hash_ref) { for my $values (@{$hash_ref{$key}}) { # do stuff...

代码太多了。请编辑您的问题并正确排版，现在很难阅读。

嗨 TLP！谢谢你的回答。我正在尝试您的代码，但得到了这个错误：Global symbol "%hash_ref" requires explicit package name (did you forget to declare "my %hash_ref"?) at test2.pl line 17。我不知道该用什么替换 %hash_ref。

@schnydszch 好吧，这很简单。您试图使用一个未声明的哈希变量。您可能声明的是 `my $hash_ref`，却尝试使用 `$hash_ref{foo}`。后者指的是哈希 `%hash_ref` 中的一个值。您需要使用 `$hash_ref->{foo}`，这才是访问哈希引用的正确方法。
for my $key (sort keys %$hash_ref) { for my $values (@{$hash_ref{$key}}) { # do stuff...