Php 从pdf文档中读取和计算单词
我一直在做这个不同文件扩展名的文本提取项目, 但是我对pdf和powerpoint最痛苦,这里是pdf的代码 这里的任何人都知道如何使用任何工具或库tcpdf、xpdf或fpdfi从现有pdf文档中读取文本,因为我还没有看到任何从pdf或ppt读取文本的精确解决方案,但请不要使用zend解决方案Php 从pdf文档中读取和计算单词,php,pdf,powerpoint,Php,Pdf,Powerpoint,我一直在做这个不同文件扩展名的文本提取项目, 但是我对pdf和powerpoint最痛苦,这里是pdf的代码 这里的任何人都知道如何使用任何工具或库tcpdf、xpdf或fpdfi从现有pdf文档中读取文本,因为我还没有看到任何从pdf或ppt读取文本的精确解决方案,但请不要使用zend解决方案 function pdf2txt($filename){ $data = getFileData($filename); // grab objects and then grab
function pdf2txt($filename){
$data = getFileData($filename);
// grab objects and then grab their contents (chunks)
$a_obj = getDataArray($data,"obj","endobj");
foreach($a_obj as $obj){
$a_filter = getDataArray($obj,"<<",">>");
if (is_array($a_filter)){
$j++;
$a_chunks[$j]["filter"] = $a_filter[0];
$a_data = getDataArray($obj,"stream\r\n","endstream");
if (is_array($a_data)){
$a_chunks[$j]["data"] = substr($a_data[0],strlen("stream\r\n"),strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
}
}
}
// decode the chunks
foreach($a_chunks as $chunk){
// look at each chunk and decide how to decode it - by looking at the contents of the filter
$a_filter = split("/",$chunk["filter"]);
if ($chunk["data"]!=""){
// look at the filter to find out which encoding has been used
if (substr($chunk["filter"],"FlateDecode")!==false){
$data =@ gzuncompress($chunk["data"]);
if (trim($data)!=""){
$result_data .= ps2txt($data);
} else {
//$result_data .= "x";
}
}
}
}
return $result_data;
}
// Function : ps2txt()
// Arguments : $ps_data - postscript data you want to convert to plain text
// Description : Does a very basic parse of postscript data to
// : return the plain text
// Author : Jonathan Beckett, 2005-05-02
function ps2txt($ps_data){
$result = "";
$a_data = getDataArray($ps_data,"[","]");
if (is_array($a_data)){
foreach ($a_data as $ps_text){
$a_text = getDataArray($ps_text,"(",")");
if (is_array($a_text)){
foreach ($a_text as $text){
$result .= substr($text,1,strlen($text)-2);
}
}
}
} else {
// the data may just be in raw format (outside of [] tags)
$a_text = getDataArray($ps_data,"(",")");
if (is_array($a_text)){
foreach ($a_text as $text){
$result .= substr($text,1,strlen($text)-2);
}
}
}
return $result;
}
// Function : getFileData()
// Arguments : $filename - filename you want to load
// Description : Reads data from a file into a variable
// and passes that data back
// Author : Jonathan Beckett, 2005-05-02
function getFileData($filename){
$handle = fopen($filename,"rb");
$data = fread($handle, filesize($filename));
fclose($handle);
return $data;
}
// Function : getDataArray()
// Arguments : $data - data you want to chop up
// $start_word - delimiting characters at start of each chunk
// $end_word - delimiting characters at end of each chunk
// Description : Loop through an array of data and put all chunks
// between start_word and end_word in an array
// Author : Jonathan Beckett, 2005-05-02
function getDataArray($data,$start_word,$end_word){
$start = 0;
$end = 0;
unset($a_result);
while ($start!==false && $end!==false){
$start = strpos($data,$start_word,$end);
if ($start!==false){
$end = strpos($data,$end_word,$start);
if ($end!==false){
// data is between start and end
$a_result[] = substr($data,$start,$end-$start+strlen($end_word));
}
}
}
return $a_result;
}
this one is for powerpoint i found here some where but that isnt working also
function parsePPT($filename) {
// This approach uses detection of the string "chr(0f).Hex_value.chr(0x00).chr(0x00).chr(0x00)" to find text strings, which are then terminated by another NUL chr(0x00). [1] Get text between delimiters [2]
$fileHandle = fopen($filename, "r");
$line = @fread($fileHandle, filesize($filename));
$lines = explode(chr(0x0f),$line);
$outtext = '';
foreach($lines as $thisline) {
if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
$text_line = substr($thisline, 4);
$end_pos = strpos($text_line, chr(0x00));
$text_line = substr($text_line, 0, $end_pos);
$text_line = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$text_line);
if(substr($text_line,0,20)!="Click to edit Master")
if (strlen($text_line) > 1) {
$outtext.= substr($text_line, 0, $end_pos)."\n<br>";
}
}
}
return $outtext;
}
函数pdf2txt($filename){
$data=getFileData($filename);
//抓取对象,然后抓取其内容(块)
$a_obj=getDataArray($data,“obj”,“endobj”);
foreach($a_obj作为$obj){
$a_filter=getDataArray($obj,“”);
if(is_数组($a_过滤器)){
$j++;
$a_块[$j][“过滤器”]=$a_过滤器[0];
$a_data=getDataArray($obj,“stream\r\n”,“endstream”);
if(is_数组($a_数据)){
$a_chunks[$j][“data”]=substr($a_data[0],strlen(“stream\r\n”),strlen($a_data[0])-strlen(“stream\r\n”)-strlen(“endstream”);
}
}
}
//解码块
foreach($a_chunk作为$chunk){
//查看每个块并决定如何解码-通过查看过滤器的内容
$a_filter=split(“/”,$chunk[“filter”]);
如果($chunk[“data”]!=“”){
//查看过滤器以找出使用了哪种编码
if(substr($chunk[“filter”],“flateCode”)!==false){
$data=@gzuncompress($chunk[“data”]);
如果(修剪($data)!=“”){
$result_data.=ps2text($data);
}否则{
//$result_data.=“x”;
}
}
}
}
返回$result\u数据;
}
//函数:ps2text()
//参数:$ps_data-要转换为纯文本的postscript数据
//描述:对postscript数据进行非常基本的解析,以
//:返回纯文本
//作者:乔纳森·贝克特,2005-05-02
函数ps2text($ps\u数据){
$result=“”;
$a_data=getDataArray($ps_data,“[”,“]”);
if(is_数组($a_数据)){
foreach($a\u数据作为$ps\u文本){
$a_text=getDataArray($ps_text,“(”,”);
if(is_数组($a_文本)){
foreach($a_text作为$text){
$result.=substr($text,1,strlen($text)-2);
}
}
}
}否则{
//数据可能只是原始格式(在[]标记之外)
$a_text=getDataArray($ps_data,“(”,”);
if(is_数组($a_文本)){
foreach($a_text作为$text){
$result.=substr($text,1,strlen($text)-2);
}
}
}
返回$result;
}
//函数:getFileData()
//参数:$filename-要加载的文件名
//描述:将文件中的数据读入变量
//然后把数据传回去
//作者:乔纳森·贝克特,2005-05-02
函数getFileData($filename){
$handle=fopen($filename,“rb”);
$data=fread($handle,filesize($filename));
fclose($handle);
返回$data;
}
//函数:getDataArray()
//参数:$data-要切碎的数据
//$start_word-在每个块的开头分隔字符
//$end_word-在每个块的末尾分隔字符
//描述:循环遍历数据数组并放置所有块
//在数组中的起始字和结束字之间
//作者:乔纳森·贝克特,2005-05-02
函数getDataArray($data、$start\u word、$end\u word){
$start=0;
$end=0;
未设置($a_结果);
while($start!==false&&$end!==false){
$start=strpos($data,$start\u word,$end);
如果($start!==false){
$end=strpos($data,$end\u word,$start);
如果($end!==false){
//数据介于开始和结束之间
$a_result[]=substr($data,$start,$end-$start+strlen($end_word));
}
}
}
返回$a_结果;
}
这是我在这里找到的powerpoint,但它也不起作用
函数parsePPT($filename){
//这种方法使用对字符串“chr(0f).Hex_value.chr(0x00).chr(0x00).chr(0x00)”的检测来查找文本字符串,然后这些字符串被另一个NUL chr(0x00)终止。[1]获取分隔符之间的文本[2]
$fileHandle=fopen($filename,“r”);
$line=@fread($fileHandle,filesize($filename));
$lines=分解(chr(0x0f),$line);
$outtext='';
foreach($行作为$thisline){
if(strpos($thisline,chr(0x00).chr(0x00).chr(0x00))=1{
$text_line=substr($thisline,4);
$end_pos=strpos($text_行,chr(0x00));
$text\u line=substr($text\u line,0,$end\u pos);
$text\u line=preg\u replace(“/[^a-zA-Z0-9\s\,\.-\n\r\t@/\\\(\)]/”,“,”,$text\u line);
如果(substr($text_line,0,20)!=“单击以编辑主控形状”)
如果(strlen($text_line)>1){
$outtext.=substr($text\u line,0,$end\u pos)。“\n”; } } } 返回$outtext; }
你为什么要重新发明轮子?您可以使用ie.xpdf或类似工具提取PDF中的文本数据,然后处理该操作产生的纯文本文件。几乎任何包含文本的文件格式都可以使用相同的方法(即,首先转换为纯文本版本,然后再进行处理) 如果你选择这个解决方案,这可能是一本有趣的书