C# 提取 PDF 文件中文本块(TextChunk)内每个单词的坐标

C# 将每个单独单词的坐标提取到pdf文件中的文本块中,c#,pdf,itext,C#,Pdf,Itext,接下来,我试图获取文本块中的所有单词及其每个坐标(实际页面,顶部,底部,左侧,右侧) 由于textcunk可以是一个短语、一个单词或任何东西,因此我尝试手动执行此操作,依靠最后一个单词的矩形并每次剪切它。我注意到这个手动方法可能有很多问题(我需要手动计算特殊字符等等),所以我问自己ITextSharp是否提供了更简单的方法来执行这个操作 MyChunk和LocationTextExtractionsTramy继承的类如下所示: public class Chunk { public Gu

接下来,我试图获取文本块(TextChunk)中的所有单词及其各自的坐标(所在页面、顶部、底部、左侧、右侧)。

由于 TextChunk 可以是一个短语、一个单词或其他任何内容,因此我尝试手动完成这项工作:以上一个单词的矩形为基准,每次从中切出下一个单词。我注意到这种手动方法可能存在很多问题(例如需要手动处理特殊字符等等),所以想知道 iTextSharp 是否提供了更简单的方法来完成这个操作。

我的 Chunk 类以及继承自 LocationTextExtractionStrategy 的类如下所示:

/// <summary>
/// A piece of extracted PDF text together with its bounding rectangle, the
/// render info it was produced from, its font and an estimated font size.
/// </summary>
public class Chunk
{
    /// <summary>Unique identifier assigned when the chunk is constructed.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle associated with this chunk.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render info this chunk was created from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font reported by <see cref="Render"/>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text carried by this chunk.</summary>
    public string Text { get; set; }

    /// <summary>Font size estimated from the rendered width of a space (0 when it cannot be estimated).</summary>
    public int FontSize { get; set; }

    // Size at which the font's space width is measured when estimating FontSize.
    private const float ReferenceSize = 12f;

    /// <summary>
    /// Creates a chunk whose text is taken from <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle associated with the chunk.</param>
    /// <param name="renderInfo">Render info the chunk is built from; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = renderInfo.GetText();
        Initialize();
    }

    /// <summary>
    /// Creates a chunk with an explicitly supplied text (e.g. a single word
    /// cut out of the text of <paramref name="renderInfo"/>).
    /// </summary>
    /// <param name="rect">Rectangle associated with the chunk.</param>
    /// <param name="renderInfo">Render info the chunk is built from; must not be null.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        Initialize();
    }

    // Fills in the members derived from Render: a fresh Id, the font and the estimated size.
    private void Initialize()
    {
        this.Id = Guid.NewGuid();
        this.BF = Render.GetFont();
        this.FontSize = ObtainFontSize();
    }

    // Estimates the font size as (rendered single-space width * 12) / (font's space width at 12),
    // rounded to an int. Returns 0 when the font is missing or reports a non-positive space
    // width: the division would otherwise yield Infinity/NaN and Convert.ToInt32 would throw
    // an OverflowException.
    private int ObtainFontSize()
    {
        if (this.BF == null)
        {
            return 0;
        }

        float referenceSpaceWidth = this.BF.GetWidthPoint(" ", ReferenceSize);
        if (referenceSpaceWidth <= 0f)
        {
            return 0;
        }

        return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * ReferenceSize / referenceSpaceWidth);
    }
}

/// <summary>
/// Location-based extraction strategy that, in addition to the base behaviour,
/// records every non-blank rendered text piece together with its rectangle.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    // Chunks recorded so far, appended in the order RenderText is invoked.
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the render info to the base strategy and, when it carries
    /// non-whitespace text, stores it as a <see cref="Chunk"/> whose rectangle
    /// spans from the start of the descent line to the end of the ascent line.
    /// </summary>
    /// <param name="renderInfo">Render info supplied by the parser; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null check must come before any use of renderInfo; the original
        // tested it only after calling renderInfo.GetText().
        if (renderInfo == null)
        {
            return;
        }

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
        {
            return;
        }

        //Get chunk Vectors
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        //Create Rectangle based on previous Vectors
        var rect = new Rectangle(
                           bottomLeft[Vector.I1],
                           bottomLeft[Vector.I2],
                           topRight[Vector.I1],
                           topRight[Vector.I2]);

        //Add each chunk with its coordinates
        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}
公共类块
{
公共Guid Id{get;set;}
公共矩形Rect{get;set;}
公共文本RenderInfo呈现{get;set;}
公共BaseFont BF{get;set;}
公共字符串文本{get;set;}
公共int FontSize{get;set;}
公共块(矩形rect,TextRenderInfo renderInfo)
{
this.Rect=Rect;
this.Render=renderInfo;
this.Text=Render.GetText();
初始化();
}
公共块(矩形rect、TextRenderInfo renderInfo、字符串text)
{
this.Rect=Rect;
this.Render=renderInfo;
this.Text=文本;
初始化();
}
私有void初始化()
{
this.Id=Guid.NewGuid();
this.BF=Render.GetFont();
this.FontSize=获取FontSize();
}
私有int获取方大小()
{
返回Convert.ToInt32(this.Render.GetSingleSpaceWidth()*12/this.BF.GetWidthPoint(“,12));
}
}
公共类LocationTextExtractionPersonalizada:LocationTextExtractionStrategy
{
//保存每个坐标
public List ChunksInPage=new List();
//自动调用PDF上的每个块
公共覆盖无效RenderText(TextRenderInfo renderInfo)
{
base.RenderText(renderInfo);
if(string.IsNullOrWhiteSpace(renderInfo.GetText())
||renderInfo==null)
返回;
//获取块向量
var bottomLeft=renderInfo.GetDescentLine().GetStartPoint();
var topRight=renderInfo.GetAscentLine().GetEndPoint();
//基于前面的向量创建矩形
var rect=新矩形(
左下[Vector.I1],
左下[Vector.I2],
右上[Vector.I1],
右上[Vector.I2]);
if(rect==null)
返回;
//添加每个块及其坐标
添加(新块(rect,renderInfo));
}
}
因此,一旦我得到了文件等等,我会这样做:

// Walks every page of pdfReader (1-based), extracts its text with a fresh
// LocationTextExtractionPersonalizada and splits the collected chunks into words.
// The per-page results are not stored anywhere by this method.
private void ProcessContent()
{
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        LocationTextExtractionPersonalizada pageStrategy = new LocationTextExtractionPersonalizada();

        // Running the extractor is what populates pageStrategy.ChunksInPage.
        string pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, pageStrategy);

        //Here is where I want to get each word with its coordinates
        List<Chunk> pageWords = ChunkRawToWord(pageStrategy.ChunksInPage);

        pageNumber++;
    }
}

/// <summary>
/// Splits each chunk's text into words (each with any trailing separator
/// characters matched by the pattern) and returns one <see cref="Chunk"/> per word
/// with its own rectangle. Horizontal bounds are estimated from font metrics:
/// the word starts at the chunk's left edge plus the width of the text that
/// precedes it, and extends by the width of the word itself. Vertical bounds
/// are copied from the source chunk.
/// </summary>
/// <param name="chunks">Chunks to split; null entries and entries without text are skipped.</param>
/// <returns>
/// The word chunks in reading order of the input, an empty list when there is
/// nothing to split, or null when <paramref name="chunks"/> is null.
/// </returns>
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
    if (chunks == null)
    {
        return null;
    }

    var words = new List<Chunk>();
    //Poor RegEx pattern to get the word and its wathever
    string pattern = @"[@&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";
    // Built once instead of re-parsing the pattern for every chunk.
    var wordRegex = new Regex(pattern, RegexOptions.IgnoreCase);

    foreach (var chunk in chunks)
    {
        if (chunk == null || string.IsNullOrEmpty(chunk.Text))
        {
            continue;
        }

        foreach (Match match in wordRegex.Matches(chunk.Text))
        {
            // The pattern also yields empty and whitespace-only matches.
            if (string.IsNullOrWhiteSpace(match.Value))
            {
                continue;
            }

            // Each word gets its own Rectangle copy; sharing one instance would
            // make every word of the chunk end up with the last word's bounds.
            var word = new Chunk(new Rectangle(chunk.Rect), chunk.Render, match.Value);

            // Offset by everything before the match (including skipped or
            // unmatched characters) rather than by a previously stored word.
            string precedingText = chunk.Text.Substring(0, match.Index);
            float left = chunk.Rect.Left + word.BF.GetWidthPoint(precedingText, word.FontSize);

            word.Rect.Left = left;
            word.Rect.Right = left + word.BF.GetWidthPoint(word.Text, word.FontSize);
            words.Add(word);
        }
    }

    return words;
}
private void ProcessContent()
{

    for (int page = 1; page <= pdfReader.NumberOfPages; page++) …(此处的代码副本在原文中被截断)

您可以使用 TextRenderInfo.GetCharacterRenderInfos() 方法获取文本块中每个字符的 TextRenderInfo 集合。然后可以将各个字符重新组合为单词,并使用该单词中第一个和最后一个 TextRenderInfo 的坐标计算包含该单词的矩形。

在自定义文本提取策略中:

 // Characters that end a word. 'var' is not allowed on a class-level field, so the type is spelled out.
 private static readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };
 /// <summary>
 /// Breaks a render info into per-character render infos and hands them to
 /// ProcessWord in groups; a group is closed right after a character listed in
 /// _separators (the separator stays in the group), and whatever remains is
 /// passed on at the end.
 /// </summary>
 /// <param name="currentInfo">Render info to split.</param>
 protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
    {
        var pending = new List<TextRenderInfo>();

        foreach (var glyphInfo in currentInfo.GetCharacterRenderInfos())
        {
            pending.Add(glyphInfo);

            if (!_separators.Contains(glyphInfo.GetText()))
            {
                continue;
            }

            // Separator reached: emit the buffered characters and start over.
            ProcessWord(currentInfo, pending);
            pending.Clear();
        }

        // Emit whatever is left after the last separator.
        ProcessWord(currentInfo, pending);
    }
 /// <summary>
 /// Turns a group of per-character render infos into one CustomTextChunk and
 /// appends it to _chunks. The location runs from the start of the first
 /// character's descent line to the end of the last character's ascent line.
 /// Does nothing when the group has no usable first/last element.
 /// </summary>
 /// <param name="charChunk">Render info the characters came from; supplies the single-space width.</param>
 /// <param name="wordChunks">Per-character render infos making up the word.</param>
 private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
    {
        var head = wordChunks.FirstOrDefault();
        var tail = wordChunks.LastOrDefault();

        if (head != null && tail != null)
        {
            var origin = head.GetDescentLine().GetStartPoint();
            var corner = tail.GetAscentLine().GetEndPoint();
            var text = string.Concat(wordChunks.Select(piece => piece.GetText()));
            var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
                origin,
                corner,
                charChunk.GetSingleSpaceWidth());

            _chunks.Add(new CustomTextChunk(text, location));
        }
    }
var_分隔符=new[]{-”,“(“,”,“/”,“,”:”,“;”,“,”,“,”};
受保护的虚拟void ParseRenderInfo(TextRenderInfo currentInfo)
{
var resultInfo=新列表();
var chars=currentInfo.getCharacterRenderInfo();
foreach(变量charRenderInfo,单位为chars)
{
结果信息添加(charRenderInfo);
var currentChar=charRenderInfo.GetText();
if(_separators.Contains(currentChar))
{
进程字(currentInfo、resultInfo);
resultInfo.Clear();
}
}
进程字(currentInfo、resultInfo);
}
私有void ProcessWord(TextRenderInfo charChunk,List wordchunk)
{
var firstRender=wordChunks.FirstOrDefault();
var lastdrender=wordChunks.LastOrDefault();
如果(firstRender==null | | lastRender==null)
{
返回;
}
var startCords=firstRender.GetDescentLine().GetStartPoint();
var endCoords=lastRender.GetAscentLine().GetEndPoint();
var wordText=string.Join(“,wordChunks.Select(x=>x.GetText());
var wordLocation=new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(startWord、endCoords、charChunk.GetSingleSpaceWidth());
_添加(新的CustomTextChunk(wordText,wordLocation));
}

您可以使用 TextRenderInfo.GetCharacterRenderInfos() 方法获取文本块中每个字符的 TextRenderInfo 集合。然后可以将各个字符重新组合为单词,并使用该单词中第一个和最后一个 TextRenderInfo 的坐标计算包含该单词的矩形。

在自定义文本提取策略中:

 // Characters that end a word. 'var' is not allowed on a class-level field, so the type is spelled out.
 private static readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };
 /// <summary>
 /// Breaks a render info into per-character render infos and hands them to
 /// ProcessWord in groups; a group is closed right after a character listed in
 /// _separators (the separator stays in the group), and whatever remains is
 /// passed on at the end.
 /// </summary>
 /// <param name="currentInfo">Render info to split.</param>
 protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
    {
        var pending = new List<TextRenderInfo>();

        foreach (var glyphInfo in currentInfo.GetCharacterRenderInfos())
        {
            pending.Add(glyphInfo);

            if (!_separators.Contains(glyphInfo.GetText()))
            {
                continue;
            }

            // Separator reached: emit the buffered characters and start over.
            ProcessWord(currentInfo, pending);
            pending.Clear();
        }

        // Emit whatever is left after the last separator.
        ProcessWord(currentInfo, pending);
    }
 /// <summary>
 /// Turns a group of per-character render infos into one CustomTextChunk and
 /// appends it to _chunks. The location runs from the start of the first
 /// character's descent line to the end of the last character's ascent line.
 /// Does nothing when the group has no usable first/last element.
 /// </summary>
 /// <param name="charChunk">Render info the characters came from; supplies the single-space width.</param>
 /// <param name="wordChunks">Per-character render infos making up the word.</param>
 private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
    {
        var head = wordChunks.FirstOrDefault();
        var tail = wordChunks.LastOrDefault();

        if (head != null && tail != null)
        {
            var origin = head.GetDescentLine().GetStartPoint();
            var corner = tail.GetAscentLine().GetEndPoint();
            var text = string.Concat(wordChunks.Select(piece => piece.GetText()));
            var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
                origin,
                corner,
                charChunk.GetSingleSpaceWidth());

            _chunks.Add(new CustomTextChunk(text, location));
        }
    }
var_分隔符=new[]{-”,“(“,”,“/”,“,”:”,“;”,“,”,“,”};
受保护的虚拟void ParseRenderInfo(TextRenderInfo currentInfo)
{
var resultInfo=新列表();
var chars=currentInfo.getCharacterRenderInfo();
foreach(变量charRenderInfo,单位为chars)
{
结果信息添加(charRenderInfo);
var currentChar=charRenderInfo.GetText();
if(_separators.Contains(currentChar))
{
进程字(currentInfo、resultInfo);
resultInfo.Clear();
}
}
进程字(currentInfo、resultInfo);
}
私有void ProcessWord(TextRenderInfo charChunk,List wordchunk)
{
var firstRender=wordChunks.FirstOrDefault();
var lastdrender=wordChunks.LastOrDefault();
如果(firstRender==null | | lastRender==null)
{