带SOLR的NCrawler_Solr_Web Crawler

带SOLR的NCrawler

solr web-crawler

带SOLR的NCrawler,solr,web-crawler,Solr,Web Crawler,我设法用NCrawler抓取了一个网站。是否可以将该数据导入SOLR，以便我可以使用SOLR中的索引数据进行搜索如果可能的话，我如何将爬网的数据推送到SOLR？任何帮助都会非常感激提前感谢。是的，可以将爬网数据索引到Solr。我以前也这样做过。您需要创建一个实现IPipelineStep的自定义管道步骤，并将其添加到NCrawler实现中。我用作连接到Solr的客户端这里有一些代码可以帮助您开始 SolrNet.Startup.Init<IndexItem>("http://

我设法用NCrawler抓取了一个网站。是否可以将该数据导入SOLR，以便我可以使用SOLR中的索引数据进行搜索

如果可能的话，我如何将爬网的数据推送到SOLR？任何帮助都会非常感激

提前感谢。

是的，可以将爬网数据索引到Solr。我以前也这样做过。您需要创建一个实现IPipelineStep的自定义管道步骤，并将其添加到NCrawler实现中。我使用SolrNet作为连接到Solr的客户端。

这里有一些代码可以帮助您开始

 // Initialize SolrNet for IndexItem documents against the Solr instance at this URL.
 SolrNet.Startup.Init<IndexItem>("http://localhost:8983/solr");

 // Pipeline: the HTML document processor runs first, then the custom step
 // that pushes each crawled page to Solr.
 using (Crawler c = new Crawler("http://ncrawler.codeplex.com/",
     new HtmlDocumentProcessor(), new AddCrawledItemToSolrIndex()))
 {
     c.ThreadCount = 3;
     c.MaxCrawlDepth = 2;

     // Exclude URLs that contain common static-asset extensions.
     // The statement must end with ';' (the original ended with ',' and did not compile).
     c.ExcludeFilter = new[] { new RegexFilter(
         new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
             RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)) };

     c.Crawl();
 }

SolrNet.Startup.Init（“http://localhost:8983/solr");
使用（爬虫程序c=新爬虫程序（“http://ncrawler.codeplex.com/", 
新的HtmlDocumentProcessor（），新的AddCrawledItemToSolIndex（））
{
c、 线程数=3；
c、 最大深度=2；
c、 ExcludeFilter=new[]{new RegexFilter(
新的正则表达式（@“（\.jpg\.css\.js\.gif\.jpeg\.png\.ico）”，
RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase）），
c、 爬行（）；
}

自定义IPipelineStep

using System;
using System.Collections.ObjectModel;
using System.Globalization;
using System.Security.Cryptography;
using System.Text;
using Microsoft.Practices.ServiceLocation;
using MyCrawler.Index;
using NCrawler;
using NCrawler.Interfaces;
using SolrNet;

namespace MyCrawler.Crawler
{
    /// <summary>
    /// NCrawler pipeline step that converts each crawled page into an
    /// <see cref="IndexItem"/> and adds it to Solr through SolrNet.
    /// </summary>
    public class AddCrawledItemToSolrIndex : IPipelineStep
    {
        /// <summary>Unix epoch (1970-01-01T00:00:00) expressed in UTC.</summary>
        private static readonly DateTime UnixEpoch =
            new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

        /// <summary>
        /// Builds an <see cref="IndexItem"/> from the crawled page and adds it to Solr.
        /// Pages without any extracted text are skipped.
        /// </summary>
        /// <param name="crawler">The running crawler (not used by this step).</param>
        /// <param name="propertyBag">Data collected for the page being processed.</param>
        /// <exception cref="ArgumentNullException"><paramref name="propertyBag"/> is null.</exception>
        public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag == null)
            {
                throw new ArgumentNullException("propertyBag");
            }

            // Nothing to index when no text was extracted from the page.
            if (string.IsNullOrWhiteSpace(propertyBag.Text))
            {
                return;
            }

            var uri = propertyBag.Step.Uri;
            var url = uri.ToString();

            var indexItem = new IndexItem
            {
                // The page URL doubles as the document id.
                Id = url,
                Url = url,
                Host = uri.Host,
                Content = propertyBag.Text,
                Title = propertyBag.Title,
                LastModified = Convert.ToInt64(DateTimeToUnixTimestamp(propertyBag.LastModified)),
                // Invariant culture keeps the value Gregorian regardless of the
                // current culture's calendar.
                Date = propertyBag.LastModified.ToString("yyyyMMdd", CultureInfo.InvariantCulture),
                Keywords = ExtractKeywords(propertyBag.Headers),
                Type = SplitString(propertyBag.ContentType, ';'),
                Digest = CreateMD5Hash(propertyBag.Text),
            };

            var solr = ServiceLocator.Current.GetInstance<ISolrOperations<IndexItem>>();
            solr.Add(indexItem, new AddParameters { CommitWithin = 10000 });
        }

        /// <summary>
        /// Splits <paramref name="input"/> on <paramref name="splitOn"/>, trims each
        /// part and drops parts that are empty after trimming.
        /// </summary>
        /// <param name="input">Delimited text; may be null or empty.</param>
        /// <param name="splitOn">Delimiter character.</param>
        /// <returns>The trimmed, non-empty parts; empty when there is no input.</returns>
        private static Collection<string> SplitString(string input, char splitOn)
        {
            var valueCollection = new Collection<string>();

            // A missing value (e.g. no content type) yields no entries instead of
            // a NullReferenceException.
            if (string.IsNullOrWhiteSpace(input))
            {
                return valueCollection;
            }

            foreach (var value in input.Split(splitOn))
            {
                var trimmed = value.Trim();
                if (trimmed.Length > 0)
                {
                    valueCollection.Add(trimmed);
                }
            }

            return valueCollection;
        }

        /// <summary>
        /// Converts <paramref name="dateTime"/> to seconds elapsed since the Unix epoch.
        /// </summary>
        /// <param name="dateTime">
        /// The instant to convert. It is normalized with
        /// <see cref="DateTime.ToUniversalTime"/>, so local values are converted and
        /// UTC values are used as-is.
        /// </param>
        /// <returns>Seconds since 1970-01-01T00:00:00Z (negative for earlier instants).</returns>
        private static double DateTimeToUnixTimestamp(DateTime dateTime)
        {
            // Compare UTC with UTC. Subtracting a local-time epoch was wrong for UTC
            // inputs and off by the DST difference for local ones.
            return (dateTime.ToUniversalTime() - UnixEpoch).TotalSeconds;
        }

        /// <summary>
        /// Computes the MD5 hash of <paramref name="input"/> as a lower-case hex string.
        /// </summary>
        /// <param name="input">Text to hash.</param>
        /// <returns>32 lower-case hexadecimal characters.</returns>
        private static string CreateMD5Hash(string input)
        {
            using (var md5 = MD5.Create())
            {
                // UTF-8 keeps non-ASCII characters distinct; ASCII encoding maps each of
                // them to '?', so different pages could share a digest. Digests of
                // pure-ASCII text are unchanged.
                var hashBytes = md5.ComputeHash(Encoding.UTF8.GetBytes(input));

                var sb = new StringBuilder(hashBytes.Length * 2);
                foreach (var hashByte in hashBytes)
                {
                    sb.Append(hashByte.ToString("x2", CultureInfo.InvariantCulture));
                }

                return sb.ToString();
            }
        }

        /// <summary>
        /// Reads the comma-separated "keywords" entry from the response headers.
        /// </summary>
        /// <param name="headers">Response headers; may be null.</param>
        /// <returns>The individual keywords, or an empty collection when none are present.</returns>
        private static Collection<string> ExtractKeywords(System.Net.WebHeaderCollection headers)
        {
            // NOTE(review): this reads an HTTP header named "keywords"; keywords declared
            // in an HTML meta tag would not appear here - confirm this is the intended source.
            var keywords = headers == null ? null : headers["keywords"];

            // SplitString already returns an empty collection for null/blank input.
            return SplitString(keywords, ',');
        }
    }
}

使用系统；
使用System.Collections.ObjectModel；
使用Microsoft.Practices.ServiceLocation；
使用MyCrawler.Index；
使用NCrawler；
使用NCrawler.Interfaces；
使用SolrNet；
名称空间MyCrawler.Crawler
{
公共类AddCrawledItemToSolIndex:IPipelineStep
{
公共作废流程（NCrawler.Crawler Crawler，PropertyBag PropertyBag）
{
if（string.IsNullOrWhiteSpace（propertyBag.Text））
返回；
var indexItem=新的indexItem
{
Id=propertyBag.Step.Uri.ToString（），
Url=propertyBag.Step.Uri.ToString（），
Host=propertyBag.Step.Uri.Host，
Content=propertyBag.Text，
Title=propertyBag.Title，
LastModified=Convert.ToInt64（DateTimeToUnixTimestamp（propertyBag.LastModified）），
日期=propertyBag.LastModified.ToString（“yyyyMMdd”），
关键词=提取关键词（propertyBag.Header），
类型=拆分字符串（propertyBag.ContentType，；），
Digest=CreateMD5Hash（propertyBag.Text），
};
var solr=ServiceLocator.Current.GetInstance（）；
Add（indexItem，newaddparameters{CommitWithin=10000}）；
}
私有集合拆分字符串（字符串输入，字符拆分）
{
var值=输入。拆分（拆分）；
var valueCollection=新集合（）；
if（values.Length==0）返回valueCollection；
foreach（值中的var值）
{
valueCollection.Add（value.Trim（））；
}
返回值收集；
} 
专用双DateTimeToUnix时间戳（DateTime DateTime）
{
return（dateTime-newdatetime（1970,1,1）.ToLocalTime（））.TotalSeconds；
}
私有字符串CreateMD5Hash（字符串输入）
{
//使用输入字符串计算MD5哈希
var md5=md5.Create（）；
var inputBytes=Encoding.ASCII.GetBytes（输入）；
var hashBytes=md5.ComputeHash（inputBytes）；
//将字节数组转换为十六进制字符串
var sb=新的StringBuilder（）；
for（int i=0；i


这将使用以下IndexItem.cs类映射到Solr索引字段
using System.Collections.ObjectModel;
using SolrNet.Attributes;

namespace MyCrawler.Index
{
    /// <summary>
    /// Document type sent to Solr. Each property is bound to a Solr field
    /// through its <c>SolrField</c> attribute.
    /// </summary>
    public class IndexItem
    {
        /// <summary>Value for the Solr field "id".</summary>
        [SolrField("id")]
        public string Id { get; set; }

        /// <summary>Value for the Solr field "url".</summary>
        [SolrField("url")]
        public string Url { get; set; }

        /// <summary>Value for the Solr field "host".</summary>
        [SolrField("host")]
        public string Host { get; set; }

        /// <summary>Value for the Solr field "content".</summary>
        [SolrField("content")]
        public string Content { get; set; }

        /// <summary>Value for the Solr field "title".</summary>
        [SolrField("title")]
        public string Title { get; set; }

        /// <summary>Value for the Solr field "description".</summary>
        [SolrField("description")]
        public string Description { get; set; }

        /// <summary>Value for the Solr field "digest".</summary>
        [SolrField("digest")]
        public string Digest { get; set; }

        /// <summary>Values for the Solr field "keywords".</summary>
        [SolrField("keywords")]
        public Collection<string> Keywords { get; set; }

        /// <summary>Value for the Solr field "date", stored as a string.</summary>
        [SolrField("date")]
        public string Date { get; set; }

        /// <summary>Value for the Solr field "contentLength".</summary>
        [SolrField("contentLength")]
        public long ContentLength { get; set; }

        /// <summary>Value for the Solr field "lastModified", stored as a 64-bit integer.</summary>
        [SolrField("lastModified")]
        public long LastModified { get; set; }

        /// <summary>Values for the Solr field "type".</summary>
        [SolrField("type")]
        public Collection<string> Type { get; set; }
    }
}

使用System.Collections.ObjectModel；
使用SolrNet.Attributes；
名称空间MyCrawler.Index
{
公共类索引
{
[索尔菲尔德（“id”）]
公共字符串Id{get；set；}
[索尔菲尔德（“url”）]
公共字符串Url{get；set；}
[索尔菲尔德（“主持人”）]
公共字符串主机{get；set；}
[索尔菲尔德（“内容”）]
公共字符串内容{get；set；}
[索尔菲尔德（“头衔”）]
公共字符串标题{get；set；}
[索尔菲尔德（“描述”）]
公共字符串说明{get；set；}
[索尔菲尔德（“摘要”）]
公共字符串摘要{get；set；}
[索尔菲尔德（“关键词”）]
公共集合关键字{get；set；}
[索尔菲尔德（“日期”）]
公共字符串日期{get；set；}
[索尔菲尔德（“内容长度”）]
公共长内容长度{get；set；}
[SolrField（“lastModified”）]
公共长LastModified{get；set；}
[索尔菲尔德（“类型”）]
公共集合类型{get；set；}
}
}

Solr字段定义（schema.xml）取自Nutch代码库




显然，您会想修改它以满足您的需要，并且它可能需要一些性能改进。但是应该是一个很好的参考点。
是的，可以将爬网数据索引到Solr。我以前也这样做过。您需要创建一个实现IPipelineStep的自定义管道步骤，并将其添加到NCrawler实现中。我使用SolrNet作为连接到Solr的客户端。
这里有一些代码可以帮助您开始
 // Initialize SolrNet for IndexItem documents against the Solr instance at this URL.
 SolrNet.Startup.Init<IndexItem>("http://localhost:8983/solr");

 // Pipeline: the HTML document processor runs first, then the custom step
 // that pushes each crawled page to Solr.
 using (Crawler c = new Crawler("http://ncrawler.codeplex.com/",
     new HtmlDocumentProcessor(), new AddCrawledItemToSolrIndex()))
 {
     c.ThreadCount = 3;
     c.MaxCrawlDepth = 2;

     // Exclude URLs that contain common static-asset extensions.
     // The statement must end with ';' (the original ended with ',' and did not compile).
     c.ExcludeFilter = new[] { new RegexFilter(
         new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
             RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)) };

     c.Crawl();
 }

SolrNet.Startup.Init（“http://localhost:8983/solr");
使用（爬虫程序c=新爬虫程序（“http://ncrawle