WordPress:Apache Tika 索引的是 HTTP 响应,而不是文档内容
我正在使用 Solr 8.3 和 Tika 为 WordPress(版本 4.9.7)的内容和附件编制索引。Solr 和 WordPress 服务器位于公司的同一内部网络中。由于组织的决定,我没有使用 WP-Solr 和其他插件(它们都足够好)。我编写了 data-config.xml 和 managed-schema 文件,并将它们上传到 ZooKeeper。这些文件已在 Solr 管理界面中更新。随后,我创建了一个名为 wp 的新集合,并为一部分文件编制了索引(在 Solr 管理界面中,我将范围设置为 0 到 200)。然而,当我查询内容时,元数据字段被正确地索引,但是 conteudo_text 和 text 字段返回的是 301 HTTP 响应(示例见下文): 我的 data-config.xml:WordPress:Apache Tika 索引的是 HTTP 响应,而不是文档内容,wordpress,solr,lucene,apache-tika,Wordpress,Solr,Lucene,Apache Tika,我正在使用 Solr 8.3 和 Tika 为 WordPress(版本 4.9.7)的内容和附件编制索引。Solr 和 WordPress 服务器位于公司的同一内部网络中。由于组织的决定,我没有使用 WP-Solr 和其他插件(它们都足够好)。我编写了 data-config.xml 和 managed-schema 文件,并将它们上传到 ZooKeeper。这些文件已在 Solr 管理界面中更新。随后,我创建了一个名为 wp 的新集合,并为一部分文件编制了索引(在 Solr 管理界面中,我将范围设置为 0 到 200)。然而,当我查询内容时,元数据字段被正确地索引,但……
<dataConfig>
  <!--
    DataImportHandler configuration with two chained entities:
      1. "wp"      reads one row per WordPress post from MySQL (JDBC source "wpdb");
      2. "arquivo" fetches the URL of each row (binary source "url_doc") and lets
                   Tika extract its plain text into the "conteudo_text" field.
  -->

  <!-- JDBC connection used by the "wp" entity. -->
  <dataSource
    name="wpdb"
    type="JdbcDataSource"
    driver="com.mysql.jdbc.Driver"
    url="jdbc:mysql://mysql-grid-homol.tjrs.gov.br:3306/wordpress"
    user="usr"
    password="pwd"
    batchSize="-1"
    readOnly="true"
  />

  <!-- Binary stream source used by the nested Tika entity to download each file. -->
  <dataSource
    name="url_doc"
    type="BinURLDataSource"
  />

  <document name="docs">
    <!--
      Parent entity: selects posts whose status is 'inherit' and whose MIME type
      starts with 'application'. CONTEUDO falls back to the title when
      post_content is empty, and URL is built as 'http:' followed by the guid column.

      Both date columns are rendered by the SQL itself as
      yyyy-MM-dd'T'HH:mm:ss'Z' strings, so the date fields need no
      DateFormatTransformer. The former dateTimeFormat attributes were removed:
      they were never applied (the entity declares no transformer) and their
      pattern did not match the strings produced by the query.

      NOTE(review): the 'Z' suffix labels post_date / post_modified as UTC.
      Confirm these columns really hold UTC values.
    -->
    <entity
      name="wp"
      dataSource="wpdb"
      pk="ID"
      query="
        SELECT
          post.Id ID,
          post_title TITULO,
          IF (post_content = '', post_title, post_content) CONTEUDO,
          CONCAT
          (
            DATE_FORMAT(post.Post_date, '%Y-%m-%d'),
            'T',
            DATE_FORMAT(post.Post_date, '%H:%i:%s'),
            'Z'
          ) DATA_PUBLICACAO,
          CONCAT
          (
            DATE_FORMAT(post.Post_modified, '%Y-%m-%d'),
            'T',
            DATE_FORMAT(post.Post_modified, '%H:%i:%s'),
            'Z'
          ) DATA_ALTERACAO,
          CONCAT
          (
            'http:',
            guid
          ) URL
        FROM
          wpw_posts post
          LEFT JOIN wpw_postmeta postmeta
            ON (postmeta.Post_id = post.Id AND postmeta.Meta_key = 'publico')
        WHERE
          post.Post_type IN ('page', 'noticia', 'evento', 'curso', 'sistema', 'classificado', 'discurso', 'attachment')
          AND post.post_status = 'inherit'
          AND post.post_mime_type like 'application%'
        ORDER BY post.Post_date DESC
      "
    >
      <field column="ID" name="id"/>
      <field column="TITULO" name="titulo"/>
      <field column="CONTEUDO" name="conteudo"/>
      <field column="DATA_PUBLICACAO" name="data_publicacao"/>
      <field column="DATA_ALTERACAO" name="data_alteracao"/>
      <field column="URL" name="url"/>

      <!--
        Child entity: downloads ${wp.URL} and indexes the text Tika extracts.
        onError="continue" keeps the import running when a single file fails.

        NOTE(review): if the server answers the http:// URL with a 301/302 to
        https://, the redirect body itself may be what gets indexed, because
        java.net.HttpURLConnection is not expected to follow redirects that
        change protocol. Confirm with the server, and if so build the final
        URL directly in the query above.
      -->
      <entity
        name="arquivo"
        dataSource="url_doc"
        processor="TikaEntityProcessor"
        url="${wp.URL}"
        format="text"
        extractEmbedded="true"
        onError="continue"
      >
        <field column="text" name="conteudo_text" />
      </entity>
    </entity>
  </document>
</dataConfig>
我的 managed-schema:
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr schema "v2": fields for posts imported from the database plus the text
  extracted from each attached file, and the field types they rely on.
-->
<schema version="1.6" name="v2">

  <!-- Unique document key. -->
  <uniqueKey>id</uniqueKey>

  <!-- Fields filled from the database row. -->
  <field name="id"              type="string" required="true" multiValued="false" indexed="true" stored="true" />
  <field name="titulo"          type="string" required="true" indexed="true" stored="true" />
  <field name="conteudo"        type="string" required="true" indexed="true" stored="true" />
  <field name="data_publicacao" type="date"   docValues="true" indexed="true" stored="true" />
  <field name="data_alteracao"  type="date"   docValues="true" indexed="true" stored="true" />
  <field name="url"             type="string" required="true" multiValued="false" indexed="true" stored="true" />

  <!-- Full-text fields; conteudo_text falls back to a single space when no value is supplied. -->
  <field name="conteudo_text" type="text"      required="true" multiValued="true" default=" " indexed="true" stored="true" />
  <field name="text"          type="sem_aspas" required="true" multiValued="true" indexed="true" stored="true" />

  <!-- Internal / catch-all fields. -->
  <field name="_version_" type="long"      indexed="false" stored="false" />
  <field name="_root_"    type="string"    docValues="false" indexed="true" stored="false" />
  <field name="_text_"    type="sem_aspas" multiValued="true" indexed="true" stored="true" />

  <!-- Extracted file text is also copied into the catch-all field. -->
  <copyField source="conteudo_text" dest="_text_" />

  <!-- primitive types -->
  <fieldType name="integer"  class="solr.IntPointField"    docValues="true" />
  <fieldType name="integers" class="solr.IntPointField"    docValues="true" multiValued="true" />
  <fieldType name="long"     class="solr.LongPointField"   docValues="true" />
  <fieldType name="longs"    class="solr.LongPointField"   docValues="true" multiValued="true" />
  <fieldType name="string"   class="solr.StrField"         docValues="true" sortMissingLast="true" />
  <fieldType name="strings"  class="solr.StrField"         docValues="true" sortMissingLast="true" multiValued="true" />
  <fieldType name="date"     class="solr.DatePointField"   docValues="true" />
  <fieldType name="dates"    class="solr.DatePointField"   docValues="true" multiValued="true" />
  <fieldType name="boolean"  class="solr.BoolField"        sortMissingLast="true" />
  <fieldType name="booleans" class="solr.BoolField"        sortMissingLast="true" multiValued="true" />
  <fieldType name="float"    class="solr.FloatPointField"  docValues="true" multiValued="false" />
  <fieldType name="floats"   class="solr.FloatPointField"  docValues="true" multiValued="true" />
  <fieldType name="double"   class="solr.DoublePointField" docValues="true" multiValued="false" />
  <fieldType name="doubles"  class="solr.DoublePointField" docValues="true" multiValued="true" />
  <fieldType name="binary"   class="solr.BinaryField" />

  <!--
    Stemmed text: ASCII folding, lower-casing, stop words (snowball format) and
    Brazilian stemming. charFilter elements are listed first because character
    filters run before the tokenizer; the token filter order is unchanged.
    NOTE(review): at query time the synonym filter runs after the stemmer, so
    synonyms.txt entries are matched against stemmed tokens. Confirm this is intended.
  -->
  <fieldType name="sem_aspas" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" format="snowball" ignoreCase="true" />
      <filter class="solr.BrazilianStemFilterFactory" />
    </analyzer>
    <analyzer type="query">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" format="snowball" ignoreCase="true" />
      <filter class="solr.BrazilianStemFilterFactory" />
      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
    </analyzer>
  </fieldType>

  <!--
    Unstemmed text: ASCII folding, stop words and lower-casing, with synonym
    expansion added at query time only.
  -->
  <fieldType name="text" class="solr.TextField" multiValued="true" positionIncrementGap="100">
    <analyzer type="index">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
    <analyzer type="query">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" />
      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
  </fieldType>

</schema>
身份证件
我为解决该问题所做的尝试:
1) 将 BinURLDataSource 更换为 URLDataSource 或 FieldStreamDataSource;
2) 在 BinURLDataSource 定义中加入有权访问这些文件的用户名和密码。
我是 Solr/Lucene 和 Tika 技术的新手(这只是我的第二个项目),欢迎提供任何帮助。
注意。您正在为响应编制索引——这是一个 301/302 重定向,您索引的是该响应中的文本,而没有跟随重定向。阅读配置文件后,我不确定您是如何得到文本而不是响应的,但很明显,您在对站点进行爬网后,磁盘上有了这些文件?谢谢您的回答,MatsLindh。附件文件位于 WordPress 上的 /static 文件夹中,并且受密码保护(这就是为什么我更喜欢通过 URLDataSource 而不是 FileStreamDataSource 进行访问)。如果我把 Solr 索引的 URL 放到浏览器中打开,则会重定向到最终路径,从而正确打开文档。您正在为响应编制索引——这是一个 301/302 重定向,您索引的是该响应中的文本,而没有跟随重定向。阅读配置文件后,我不确定您是如何得到文本而不是响应的,但很明显,您在对站点进行爬网后,磁盘上有了这些文件?谢谢您的回答,MatsLindh。附件文件位于 WordPress 上的 /static 文件夹中,并且受密码保护(这就是为什么我更喜欢通过 URLDataSource 而不是 FileStreamDataSource 进行访问)。如果我把 Solr 索引的 URL 放到浏览器中打开,则会重定向到最终路径,从而正确打开文档。
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr schema "v2": fields for posts imported from the database plus the text
  extracted from each attached file, and the field types they rely on.
-->
<schema version="1.6" name="v2">

  <!-- Unique document key. -->
  <uniqueKey>id</uniqueKey>

  <!-- Fields filled from the database row. -->
  <field name="id"              type="string" required="true" multiValued="false" indexed="true" stored="true" />
  <field name="titulo"          type="string" required="true" indexed="true" stored="true" />
  <field name="conteudo"        type="string" required="true" indexed="true" stored="true" />
  <field name="data_publicacao" type="date"   docValues="true" indexed="true" stored="true" />
  <field name="data_alteracao"  type="date"   docValues="true" indexed="true" stored="true" />
  <field name="url"             type="string" required="true" multiValued="false" indexed="true" stored="true" />

  <!-- Full-text fields; conteudo_text falls back to a single space when no value is supplied. -->
  <field name="conteudo_text" type="text"      required="true" multiValued="true" default=" " indexed="true" stored="true" />
  <field name="text"          type="sem_aspas" required="true" multiValued="true" indexed="true" stored="true" />

  <!-- Internal / catch-all fields. -->
  <field name="_version_" type="long"      indexed="false" stored="false" />
  <field name="_root_"    type="string"    docValues="false" indexed="true" stored="false" />
  <field name="_text_"    type="sem_aspas" multiValued="true" indexed="true" stored="true" />

  <!-- Extracted file text is also copied into the catch-all field. -->
  <copyField source="conteudo_text" dest="_text_" />

  <!-- primitive types -->
  <fieldType name="integer"  class="solr.IntPointField"    docValues="true" />
  <fieldType name="integers" class="solr.IntPointField"    docValues="true" multiValued="true" />
  <fieldType name="long"     class="solr.LongPointField"   docValues="true" />
  <fieldType name="longs"    class="solr.LongPointField"   docValues="true" multiValued="true" />
  <fieldType name="string"   class="solr.StrField"         docValues="true" sortMissingLast="true" />
  <fieldType name="strings"  class="solr.StrField"         docValues="true" sortMissingLast="true" multiValued="true" />
  <fieldType name="date"     class="solr.DatePointField"   docValues="true" />
  <fieldType name="dates"    class="solr.DatePointField"   docValues="true" multiValued="true" />
  <fieldType name="boolean"  class="solr.BoolField"        sortMissingLast="true" />
  <fieldType name="booleans" class="solr.BoolField"        sortMissingLast="true" multiValued="true" />
  <fieldType name="float"    class="solr.FloatPointField"  docValues="true" multiValued="false" />
  <fieldType name="floats"   class="solr.FloatPointField"  docValues="true" multiValued="true" />
  <fieldType name="double"   class="solr.DoublePointField" docValues="true" multiValued="false" />
  <fieldType name="doubles"  class="solr.DoublePointField" docValues="true" multiValued="true" />
  <fieldType name="binary"   class="solr.BinaryField" />

  <!--
    Stemmed text: ASCII folding, lower-casing, stop words (snowball format) and
    Brazilian stemming. charFilter elements are listed first because character
    filters run before the tokenizer; the token filter order is unchanged.
    NOTE(review): at query time the synonym filter runs after the stemmer, so
    synonyms.txt entries are matched against stemmed tokens. Confirm this is intended.
  -->
  <fieldType name="sem_aspas" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" format="snowball" ignoreCase="true" />
      <filter class="solr.BrazilianStemFilterFactory" />
    </analyzer>
    <analyzer type="query">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" format="snowball" ignoreCase="true" />
      <filter class="solr.BrazilianStemFilterFactory" />
      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
    </analyzer>
  </fieldType>

  <!--
    Unstemmed text: ASCII folding, stop words and lower-casing, with synonym
    expansion added at query time only.
  -->
  <fieldType name="text" class="solr.TextField" multiValued="true" positionIncrementGap="100">
    <analyzer type="index">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
    <analyzer type="query">
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt" />
      <tokenizer class="solr.ClassicTokenizerFactory" />
      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" />
      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
  </fieldType>

</schema>