Php 在使用DomDocument解析html时,是否有方法保持实体完整?
我用下面这个函数来确保每个 img 标签的 src 都是绝对 URL(标签:php、domdocument、entities):
/**
 * Rewrite the src attribute of every <img> in an HTML fragment to an absolute URL.
 *
 * The fragment is wrapped in a minimal document whose <meta> tag announces
 * $encoding to the parser, loaded into a DOMDocument, and the wrapper is
 * stripped again from the serialized output.
 *
 * Sources that are already absolute (any "scheme:" prefix such as http:,
 * https:, data:, or a protocol-relative "//host/...") are left untouched, and
 * <img> elements with an empty or missing src are skipped.
 *
 * @param string $html     HTML fragment to process.
 * @param string $encoding Character set declared in the wrapper's <meta> tag.
 * @param string $baseUrl  Prefix applied to relative sources.
 * @return string The rewritten fragment, or $html unchanged when loadHTML() fails.
 */
function absoluteSrc($html, $encoding = 'utf-8', $baseUrl = 'http://my.server/')
{
    $dom = new DOMDocument();
    // Workaround to use proper encoding: declare the charset in a wrapper document.
    $prehtml = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset={$encoding}\"></head><body>";
    $posthtml = "</body></html>";

    if (!$dom->loadHTML($prehtml . trim($html) . $posthtml)) {
        return $html;
    }

    // Normalise the prefix once so the join below never produces "//" or a missing "/".
    $prefix = rtrim($baseUrl, '/') . '/';

    foreach ($dom->getElementsByTagName('img') as $img) {
        if (!($img instanceof DOMElement)) {
            continue;
        }
        $src = $img->getAttribute('src');
        if ($src === '') {
            // Nothing to resolve; prefixing would point the image at the site root.
            continue;
        }
        // A leading "scheme:" or "//" marks a reference that is already absolute.
        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $src)) {
            continue;
        }
        $img->setAttribute('src', $prefix . ltrim($src, '/'));
    }

    $serialized = $dom->saveHTML();

    // Remove remains of workaround / DomDocument additions by locating the
    // wrapper markers instead of relying on fixed offsets.
    $bodyOpen = strpos($serialized, '<body>');
    $bodyClose = strrpos($serialized, $posthtml);
    if ($bodyOpen === false || $bodyClose === false || $bodyClose < $bodyOpen) {
        // Unexpected serialization shape: return it whole rather than cut blindly.
        return $serialized;
    }
    $bodyOpen += strlen('<body>');

    return substr($serialized, $bodyOpen, $bodyClose - $bodyOpen);
}
/**
 * Rewrite the src attribute of every <img> in an HTML fragment to an absolute URL.
 *
 * The fragment is wrapped in a minimal document whose <meta> tag announces
 * $encoding to the parser, loaded into a DOMDocument, and the wrapper is
 * stripped again from the serialized output.
 *
 * Sources that are already absolute (any "scheme:" prefix such as http:,
 * https:, data:, or a protocol-relative "//host/...") are left untouched, and
 * <img> elements with an empty or missing src are skipped.
 *
 * @param string $html     HTML fragment to process.
 * @param string $encoding Character set declared in the wrapper's <meta> tag.
 * @param string $baseUrl  Prefix applied to relative sources.
 * @return string The rewritten fragment, or $html unchanged when loadHTML() fails.
 */
function absoluteSrc($html, $encoding = 'utf-8', $baseUrl = 'http://my.server/')
{
    $dom = new DOMDocument();
    // Workaround to use proper encoding: declare the charset in a wrapper document.
    $prehtml = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset={$encoding}\"></head><body>";
    $posthtml = "</body></html>";

    if (!$dom->loadHTML($prehtml . trim($html) . $posthtml)) {
        return $html;
    }

    // Normalise the prefix once so the join below never produces "//" or a missing "/".
    $prefix = rtrim($baseUrl, '/') . '/';

    foreach ($dom->getElementsByTagName('img') as $img) {
        if (!($img instanceof DOMElement)) {
            continue;
        }
        $src = $img->getAttribute('src');
        if ($src === '') {
            // Nothing to resolve; prefixing would point the image at the site root.
            continue;
        }
        // A leading "scheme:" or "//" marks a reference that is already absolute.
        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $src)) {
            continue;
        }
        $img->setAttribute('src', $prefix . ltrim($src, '/'));
    }

    $serialized = $dom->saveHTML();

    // Remove remains of workaround / DOMDocument additions by locating the
    // wrapper markers instead of relying on fixed offsets.
    $bodyOpen = strpos($serialized, '<body>');
    $bodyClose = strrpos($serialized, $posthtml);
    if ($bodyOpen === false || $bodyClose === false || $bodyClose < $bodyOpen) {
        // Unexpected serialization shape: return it whole rather than cut blindly.
        return $serialized;
    }
    $bodyOpen += strlen('<body>');

    return substr($serialized, $bodyOpen, $bodyClose - $bodyOpen);
}
它工作正常,但返回的结果中,实体已被解码成了 Unicode 字符:
// Sample fragment with two relative image sources and two named entities
// (&copy; and &amp;) in the footer; the entities are what the question is about.
$html = <<< EOHTML
<p><img src="images/lorem.jpg" alt="lorem" align="left">
Lorem ipsum dolor sit amet consectetuer Nullam felis laoreet
Cum magna. Suscipit sed vel tincidunt urna.<br>
Vel consequat pretium Curabitur faucibus justo adipiscing elit.
<img src="others/ipsum.png" alt="ipsum" align="right"></p>
<center>&copy; Dr Jekyll &amp; Mr Hyde</center>
EOHTML;
// Print the fragment after passing it through absoluteSrc().
echo absoluteSrc($html);
我也想知道这个问题的答案。
我最终的做法是:在解析之前,把 &…; 形式的实体转换成 **ENTITY-…-ENTITY** 形式的占位符,
并在解析完成后再把它们转换回来。以下代码似乎有效:
$dom= new DOMDocument('1.0', 'UTF-8');
$dom->loadHTML($this->htmlentities2stringcode(rawurldecode($content)) );
$dom->preserveWhiteSpace = true;
$innerHTML = str_replace("<html></html><html><body>", "",
str_replace("</body></html>", "",
str_replace("+","%2B",str_replace("<p></p>", "", $this->getInnerHTML( $dom )))));
return $this->stringcode2htmlentities($innerHTML));
}
// ----------------------------------------------------------
/**
 * Replace every key of the table returned by getHTMLEntityStringCodeArray()
 * with its value, i.e. with the caret-delimited pseudo string such as ^Alpha^.
 *
 * @param string $string Text to convert.
 * @return string $string with each table key replaced by its placeholder.
 */
function htmlentities2stringcode($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $search = array_keys($codeTable);
    $placeholders = array_values($codeTable);

    return str_replace($search, $placeholders, $string);
}
// ----------------------------------------------------------
/**
 * Reverse of htmlentities2stringcode(): replace every value of the table
 * returned by getHTMLEntityStringCodeArray() (a caret-delimited pseudo string
 * such as ^Alpha^) with the key it was derived from.
 *
 * @param string $string Text containing pseudo strings.
 * @return string $string with each placeholder replaced by its table key.
 */
function stringcode2htmlentities ($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $placeholders = array_values($codeTable);
    $originals = array_keys($codeTable);

    return str_replace($placeholders, $originals, $string);
}
// -------------------------------------------------------------
/**
 * Build the table that maps named HTML entity references to caret-delimited
 * placeholders, e.g. "&Alpha;" => "^Alpha^".
 *
 * Every placeholder is derived from the entity name, so each entity gets a
 * distinct placeholder and the mapping can be reversed without loss. The
 * keys are entity references (not the decoded characters), because the table
 * exists to hide entities from the HTML parser.
 *
 * NOTE(review): text that already contains a literal placeholder such as
 * "^and^" will be turned into the matching entity by the reverse conversion.
 *
 * @return array Entity reference => placeholder, in declaration order.
 */
function getHTMLEntityStringCodeArray() {
    // Entity names in the order the table has always listed them; str_replace()
    // processes pairs sequentially, so the order is kept stable.
    $names = array(
        'Alpha', 'Beta', 'Chi', 'Dagger', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa',
        'Lambda', 'Mu', 'Nu', 'OElig', 'Omega', 'Omicron', 'Phi', 'Pi', 'Prime', 'Psi',
        'Rho', 'Scaron', 'Sigma', 'Tau', 'Theta', 'Upsilon', 'Xi', 'Yuml', 'Zeta',
        'alefsym', 'alpha', 'and', 'ang', 'asymp', 'bdquo', 'beta', 'bull', 'cap', 'chi',
        'circ', 'clubs', 'cong', 'crarr', 'cup', 'dArr', 'dagger', 'darr', 'delta', 'diams',
        'empty', 'emsp', 'ensp', 'epsilon', 'equiv', 'eta', 'euro', 'exist', 'fnof', 'forall',
        'frasl', 'gamma', 'ge', 'hArr', 'harr', 'hearts', 'hellip', 'image', 'infin', 'int',
        'iota', 'isin', 'kappa', 'lArr', 'lambda', 'lang', 'larr', 'lceil', 'ldquo', 'le',
        'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo', 'lsquo', 'mdash', 'minus', 'mu', 'nabla',
        'ndash', 'ne', 'ni', 'notin', 'nsub', 'nu', 'oelig', 'oline', 'omega', 'omicron',
        'oplus', 'or', 'otimes', 'part', 'permil', 'perp', 'phi', 'pi', 'piv', 'prime',
        'prod', 'prop', 'psi', 'rArr', 'radic', 'rang', 'rarr', 'rceil', 'rdquo', 'real',
        'rfloor', 'rho', 'rlm', 'rsaquo', 'rsquo', 'sbquo', 'scaron', 'sdot', 'sigma', 'sigmaf',
        'sim', 'spades', 'sub', 'sube', 'sum', 'sup', 'supe', 'tau', 'there4', 'theta',
        'thetasym', 'thinsp', 'tilde', 'trade', 'uArr', 'uarr', 'upsih', 'upsilon', 'weierp', 'xi',
        'yuml', 'zeta', 'zwj', 'zwnj',
    );

    $table = array();
    foreach ($names as $name) {
        $table['&' . $name . ';'] = '^' . $name . '^';
    }

    return $table;
}
$dom=新的DOMDocument('1.0','UTF-8');
$dom->loadHTML($this->htmlentities2stringcode(rawurldecode($content));
$dom->preserveWhiteSpace=true;
$innerHTML=str_replace(“,”,
str_替换(“,”,
str_replace(“+”,“%2B”,str_replace(“”,“,$this->getInnerHTML($dom‘)!”);
返回$this->stringcode2htmlentities($innerHTML));
}
// ----------------------------------------------------------
/**
 * Replace every key of the table returned by getHTMLEntityStringCodeArray()
 * with its value, i.e. with the caret-delimited pseudo string such as ^Alpha^.
 *
 * @param string $string Text to convert.
 * @return string $string with each table key replaced by its placeholder.
 */
function htmlentities2stringcode($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $from = array_keys($codeTable);
    $to = array_values($codeTable);

    return str_replace($from, $to, $string);
}
// ----------------------------------------------------------
/**
 * Reverse of htmlentities2stringcode(): replace every value of the table
 * returned by getHTMLEntityStringCodeArray() (a caret-delimited pseudo string
 * such as ^Alpha^) with the key it was derived from.
 *
 * @param string $string Text containing pseudo strings.
 * @return string $string with each placeholder replaced by its table key.
 */
function stringcode2htmlentities($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $from = array_values($codeTable);
    $to = array_keys($codeTable);

    return str_replace($from, $to, $string);
}
// -------------------------------------------------------------
函数getHTMLEntityStringCodeArray(){
返回数组(“&Alpha;”=>“^Alpha^”,
“&Beta;”=>“^Beta^”,
“&Chi;”=>“^Chi^”,
“&Dagger;”=>“^Dagger^”,
“&Delta;”=>“^Delta^”,
“&Epsilon;”=>“^Epsilon^”,
“&Eta;”=>“^Eta^”,
“&Gamma;”=>“^Gamma^”,
“&Iota;”=>“^lota^”,
“&Kappa;”=>“^Kappa^”,
“&Lambda;”=>“^Lambda^”,
“&Mu;”=>“^Mu^”,
“&Nu;”=>“^Nu^”,
“&OElig;”=>“^OElig^”,
“&Omega;”=>“^Omega^”,
“&Omicron;”=>“^Omicron^”,
“&Phi;”=>“^Phi^”,
“&Pi;”=>“^Pi^”,
“&Prime;”=>“^Prime^”,
“&Psi;”=>“^Psi^”,
“&Rho;”=>“^Rho^”,
“&Scaron;”=>“^Scaron^”,
“&Scaron;”=>“^Scaron^”,
“&Sigma;”=>“^Sigma^”,
“&Tau;”=>“^Tau^”,
“&Theta;”=>“^Theta^”,
“&Upsilon;”=>“^Upsilon^”,
“&Xi;”=>“^Xi^”,
“&Yuml;”=>“^Yuml^”,
“&Zeta;”=>“^Zeta^”,
“&alefsym;”=>“^alefsym^”,
“&alpha;”=>“^alpha^”,
“&和;”=>“^和^”,
“&ang;”=>“^ang^”,
“&asymp;”=>“^asymp^”,
“&bdquo;”=>“^bdquo^”,
“&beta;”=>“^beta^”,
“&bull;”=>“^bull^”,
“&cap;”=>“^cap^”,
“&chi;”=>“^chi^”,
“&circ;”=>“^circ^”,
“&clubs;”=>“^clubs^”,
“&cong;”=>“^cong^”,
“&crarr;”=>“^crarr^”,
“&cup;”=>“^cup^”,
“&dArr;”=>“^dArr^”,
“&dagger;”=>“^dagger^”,
“&darr;”=>“^darr^”,
“&delta;”=>“^delta^”,
“&diams;”=>“^diams^”,
“&empty;”=>“^empty^”,
“&emsp;”=>“^emsp^”,
“&ensp;”=>“^ensp^”,
“&epsilon;”=>“^epsilon^”,
“&equiv;”=>“^equiv^”,
“&eta;”=>“^eta^”,
“&euro;”=>“^euro^”,
“&exist;”=>“^exist^”,
“&fnof;”=>“^fnof^”,
“&forall;”=>“^forall^”,
$dom= new DOMDocument('1.0', 'UTF-8');
$dom->loadHTML($this->htmlentities2stringcode(rawurldecode($content)) );
$dom->preserveWhiteSpace = true;
$innerHTML = str_replace("<html></html><html><body>", "",
str_replace("</body></html>", "",
str_replace("+","%2B",str_replace("<p></p>", "", $this->getInnerHTML( $dom )))));
return $this->stringcode2htmlentities($innerHTML));
}
// ----------------------------------------------------------
/**
 * Replace every key of the table returned by getHTMLEntityStringCodeArray()
 * with its value, i.e. with the caret-delimited pseudo string such as ^Alpha^.
 *
 * @param string $string Text to convert.
 * @return string $string with each table key replaced by its placeholder.
 */
function htmlentities2stringcode($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $search = array_keys($codeTable);
    $placeholders = array_values($codeTable);

    return str_replace($search, $placeholders, $string);
}
// ----------------------------------------------------------
/**
 * Reverse of htmlentities2stringcode(): replace every value of the table
 * returned by getHTMLEntityStringCodeArray() (a caret-delimited pseudo string
 * such as ^Alpha^) with the key it was derived from.
 *
 * @param string $string Text containing pseudo strings.
 * @return string $string with each placeholder replaced by its table key.
 */
function stringcode2htmlentities ($string) {
    $codeTable = $this->getHTMLEntityStringCodeArray();
    $placeholders = array_values($codeTable);
    $originals = array_keys($codeTable);

    return str_replace($placeholders, $originals, $string);
}
// -------------------------------------------------------------
/**
 * Build the table that maps named HTML entity references to caret-delimited
 * placeholders, e.g. "&Alpha;" => "^Alpha^".
 *
 * Every placeholder is derived from the entity name, so each entity gets a
 * distinct placeholder and the mapping can be reversed without loss. The
 * keys are entity references (not the decoded characters), because the table
 * exists to hide entities from the HTML parser.
 *
 * NOTE(review): text that already contains a literal placeholder such as
 * "^and^" will be turned into the matching entity by the reverse conversion.
 *
 * @return array Entity reference => placeholder, in declaration order.
 */
function getHTMLEntityStringCodeArray() {
    // Entity names in the order the table has always listed them; str_replace()
    // processes pairs sequentially, so the order is kept stable.
    $names = array(
        'Alpha', 'Beta', 'Chi', 'Dagger', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa',
        'Lambda', 'Mu', 'Nu', 'OElig', 'Omega', 'Omicron', 'Phi', 'Pi', 'Prime', 'Psi',
        'Rho', 'Scaron', 'Sigma', 'Tau', 'Theta', 'Upsilon', 'Xi', 'Yuml', 'Zeta',
        'alefsym', 'alpha', 'and', 'ang', 'asymp', 'bdquo', 'beta', 'bull', 'cap', 'chi',
        'circ', 'clubs', 'cong', 'crarr', 'cup', 'dArr', 'dagger', 'darr', 'delta', 'diams',
        'empty', 'emsp', 'ensp', 'epsilon', 'equiv', 'eta', 'euro', 'exist', 'fnof', 'forall',
        'frasl', 'gamma', 'ge', 'hArr', 'harr', 'hearts', 'hellip', 'image', 'infin', 'int',
        'iota', 'isin', 'kappa', 'lArr', 'lambda', 'lang', 'larr', 'lceil', 'ldquo', 'le',
        'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo', 'lsquo', 'mdash', 'minus', 'mu', 'nabla',
        'ndash', 'ne', 'ni', 'notin', 'nsub', 'nu', 'oelig', 'oline', 'omega', 'omicron',
        'oplus', 'or', 'otimes', 'part', 'permil', 'perp', 'phi', 'pi', 'piv', 'prime',
        'prod', 'prop', 'psi', 'rArr', 'radic', 'rang', 'rarr', 'rceil', 'rdquo', 'real',
        'rfloor', 'rho', 'rlm', 'rsaquo', 'rsquo', 'sbquo', 'scaron', 'sdot', 'sigma', 'sigmaf',
        'sim', 'spades', 'sub', 'sube', 'sum', 'sup', 'supe', 'tau', 'there4', 'theta',
        'thetasym', 'thinsp', 'tilde', 'trade', 'uArr', 'uarr', 'upsih', 'upsilon', 'weierp', 'xi',
        'yuml', 'zeta', 'zwj', 'zwnj',
    );

    $table = array();
    foreach ($names as $name) {
        $table['&' . $name . ';'] = '^' . $name . '^';
    }

    return $table;
}