Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/77.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 从pdf文本到文件列中文件名的整齐数据框_R_Pdf_Text Mining_Corpus_Tidytext - Fatal编程技术网

R 从pdf文本到文件列中文件名的整齐数据框

R 从pdf文本到文件列中文件名的整齐数据框,r,pdf,text-mining,corpus,tidytext,R,Pdf,Text Mining,Corpus,Tidytext,我想分析近300个pdf文档中的文本。现在,我使用pdftools和tm,tidytext包来阅读文本,将其转换为语料库,然后转换为文档术语矩阵,最后我希望在一个整洁的数据框中对其进行结构构建 我有几个问题: 如何删除页面数据(在每个pdf页面的顶部和/或底部) 我更希望文件名作为文档列中的值,而不是索引数字 以下代码仅包含2个pdf文件,以确保再现性。当我运行所有文件时,我在corpus对象中得到294个文档,但当我整理它时,我似乎丢失了一些文件,因为converted%>%distinct

我想分析近300个pdf文档中的文本。现在,我使用
pdftools
tm
tidytext
包来阅读文本,将其转换为语料库,然后转换为文档术语矩阵,最后我希望在一个整洁的数据框中对其进行结构构建

我有几个问题:

  • 如何删除页面数据(在每个pdf页面的顶部和/或底部)
  • 我更希望文件名作为
    文档
    列中的值,而不是索引数字
  • 以下代码仅包含2个pdf文件,以确保再现性。当我运行所有文件时,我在
    corpus
    对象中得到294个文档,但当我整理它时,我似乎丢失了一些文件,因为
    converted%>%distinct(document)
    返回了275个文档。我想知道这是为什么
我有以下可复制的脚本:

library(tidyverse)
library(tidytext)
library(pdftools)
library(tm)
library(broom)

# Create a temporary empty directory
# (don't worry at the end of this script I'll remove this directory and its files)

dir.create("~/Desktop/sample-pdfs")

# Fill directory with 2 pdf files from my github repo.
# PDFs are binary files, so request binary mode explicitly; the default text
# mode can corrupt the download on Windows.

download.file(
  "https://github.com/thomasdebeus/colourful-facts/raw/master/projects/sample-data/'s-Gravenhage_coalitieakkoord.pdf",
  destfile = "~/Desktop/sample-pdfs/'s-Gravenhage_coalitieakkoord.pdf",
  mode = "wb"
)
download.file(
  "https://github.com/thomasdebeus/colourful-facts/raw/master/projects/sample-data/Aa%20en%20Hunze_coalitieakkoord.pdf",
  destfile = "~/Desktop/sample-pdfs/Aa en Hunze_coalitieakkoord.pdf",
  mode = "wb"
)

# Create vector of file paths.
# `pattern` is a regular expression (not a shell glob), so match a literal
# ".pdf" suffix; `full.names = TRUE` prepends the directory to each file name.

dir <- "~/Desktop/sample-pdfs"
pdfs <- list.files(dir, pattern = "\\.pdf$", full.names = TRUE)

# Read the text from pdf's with pdftools package

pdfs_text <- map(pdfs, pdf_text)

# Convert to document-term-matrix

converted <- Corpus(VectorSource(pdfs_text)) %>%
  DocumentTermMatrix()

# Now I want to convert this to a tidy format
# (terms that contain a digit are dropped)

converted %>%
  tidy() %>%
  filter(!grepl("[0-9]+", term))
从运行以下行的桌面删除下载的文件及其目录:

# Delete the sample directory together with everything inside it
unlink("~/Desktop/sample-pdfs", recursive = TRUE)

非常感谢您的帮助 我建议为您想要执行的操作编写一个包装函数,这样您就可以将每个文件名添加为一列

# Read one PDF and return its tidy term counts tagged with the source file.
#
# file: path of the PDF, handed to pdf_text().
# Returns the tidied document-term matrix without the terms that contain a
# digit, plus a FileName column holding `file`.
read_PDF <- function(file) {
  raw_text <- pdf_text(file)
  term_matrix <- DocumentTermMatrix(Corpus(VectorSource(raw_text)))
  term_counts <- tidy(term_matrix)

  # keep only the terms without any digit
  digit_free <- filter(term_counts, !grepl("[0-9]+", term))

  # add FileName as a column
  mutate(digit_free, FileName = file)
}

# Apply the wrapper to every path in `pdfs` and stack the results
final <- data.table::rbindlist(map(pdfs, read_PDF))
read\u PDF%
过滤器(!grepl(“[0-9]+”,术语))%>%
#将文件名添加为列
变异(文件名=文件)
}
最终%data.table::rbindlist()

我建议为要执行的操作编写一个包装函数,这样您就可以将每个文件名添加为一列

# Read one PDF and return its tidy term counts tagged with the source file.
#
# file: path of the PDF, handed to pdf_text().
# Returns the tidied document-term matrix without the terms that contain a
# digit, plus a FileName column holding `file`.
read_PDF <- function(file) {
  raw_text <- pdf_text(file)
  term_matrix <- DocumentTermMatrix(Corpus(VectorSource(raw_text)))
  term_counts <- tidy(term_matrix)

  # keep only the terms without any digit
  digit_free <- filter(term_counts, !grepl("[0-9]+", term))

  # add FileName as a column
  mutate(digit_free, FileName = file)
}

# Apply the wrapper to every path in `pdfs` and stack the results
final <- data.table::rbindlist(map(pdfs, read_PDF))
read\u PDF%
过滤器(!grepl(“[0-9]+”,术语))%>%
#将文件名添加为列
变异(文件名=文件)
}
最终%data.table::rbindlist()
很好的例子

  • 我添加了几行来添加名称
  • 我不确定是否会丢失文件,我没有这种行为
  • 顺便提一下,您的文件名不太规范,建议再检查一遍:第一个文件名以撇号开头,另外建议去掉文件名中的空格
  • 我用英语文档做了测试,你可以在语料库中添加不同的语言
代码如下:

library(tidyverse)
library(tidytext)
library(pdftools) 
library(tm)
library(broom)

# Locate the PDFs. `dir` must end with a slash because the paths are built
# with paste0().

dir <- "PDFs/"
# `pattern` is a regular expression (not a shell glob): match a literal
# ".pdf" suffix. The directory is listed once and reused for both vectors.
names <- list.files(dir, pattern = "\\.pdf$")
pdfs <- paste0(dir, names)

# create a table of names: `value` is the file name without its extension,
# `ids` is the position of the file as a character string
namesDocs <- tibble(
  value = str_remove(names, pattern = "\\.pdf$"),
  ids = as.character(seq_along(names))
)

namesDocs
# Read the text from pdf's with pdftools package

pdfs_text <- map(pdfs, pdftools::pdf_text)

# Convert to document-term-matrix
# add cleaning process

converted <-
  Corpus(VectorSource(pdfs_text)) %>%
  DocumentTermMatrix(
    control = list(
      removeNumbers = TRUE,
      stopwords = TRUE,
      removePunctuation = TRUE
    )
  )

converted
# Now I want to convert this to a tidy format
# add names of documents
# NOTE(review): the join assumes the `document` column holds the position of
# each file as a character string, matching `ids` - confirm on your data.

mytable <-
  converted %>%
  tidy() %>%
  arrange(desc(count)) %>%
  left_join(y = namesDocs, by = c("document" = "ids"))

head(mytable)

View(mytable)
库(tidyverse)
图书馆(tidytext)
图书馆(pdftools)
图书馆(tm)
图书馆(扫帚)
#创建一个临时空目录
dir很好的例子

  • 我添加了几行来添加名称
  • 我不确定是否会丢失文件,我没有这种行为
  • 顺便提一下,您的文件名不太规范,建议再检查一遍:第一个文件名以撇号开头,另外建议去掉文件名中的空格
  • 我用英语文档做了测试,你可以在语料库中添加不同的语言
代码如下:

library(tidyverse)
library(tidytext)
library(pdftools) 
library(tm)
library(broom)

# Locate the PDFs. `dir` must end with a slash because the paths are built
# with paste0().

dir <- "PDFs/"
# `pattern` is a regular expression (not a shell glob): match a literal
# ".pdf" suffix. The directory is listed once and reused for both vectors.
names <- list.files(dir, pattern = "\\.pdf$")
pdfs <- paste0(dir, names)

# create a table of names: `value` is the file name without its extension,
# `ids` is the position of the file as a character string
namesDocs <- tibble(
  value = str_remove(names, pattern = "\\.pdf$"),
  ids = as.character(seq_along(names))
)

namesDocs
# Read the text from pdf's with pdftools package

pdfs_text <- map(pdfs, pdftools::pdf_text)

# Convert to document-term-matrix
# add cleaning process

converted <-
  Corpus(VectorSource(pdfs_text)) %>%
  DocumentTermMatrix(
    control = list(
      removeNumbers = TRUE,
      stopwords = TRUE,
      removePunctuation = TRUE
    )
  )

converted
# Now I want to convert this to a tidy format
# add names of documents
# NOTE(review): the join assumes the `document` column holds the position of
# each file as a character string, matching `ids` - confirm on your data.

mytable <-
  converted %>%
  tidy() %>%
  arrange(desc(count)) %>%
  left_join(y = namesDocs, by = c("document" = "ids"))

head(mytable)

View(mytable)
库(tidyverse)
图书馆(tidytext)
图书馆(pdftools)
图书馆(tm)
图书馆(扫帚)
#创建一个临时空目录

dir您可以使用tm将文档直接读入语料库。readPDF阅读器使用pdftools作为引擎。无需首先创建一组文本,通过语料库获取输出。我创建了两个示例。第一个与你所做的一致,但首先要通过语料库。第二个完全基于tidyverse+tidytext。不需要在tm、tidytext等之间切换

示例之间令牌数量的差异是由于tidytext/tokenizer中的自动清理造成的

如果您有很多文档要做,您可能希望使用
quanteda
作为您的主力,因为它可以在多个核心上即时工作,并可能加快标记器部分的速度。不要忘记使用
stopwords
软件包来获得荷兰stopwords的良好列表。如果您需要荷兰语单词的词性标注,请检查
udpipe
软件包

library(tidyverse)
library(tidytext)
library(tm)

directory <- "D:/sample-pdfs"

# Build the corpus straight from the directory with tm's readPDF reader and
# turn it into a document-term matrix in one step
converted <- DocumentTermMatrix(
  VCorpus(
    DirSource(directory),
    readerControl = list(reader = readPDF)
  )
)

# Tidy the matrix, then keep only the terms without any digit
filter(tidy(converted), !grepl("[0-9]+", term))

# A tibble: 5,707 x 3
   document                          term           count
   <chr>                             <chr>          <dbl>
 1 's-Gravenhage_coalitieakkoord.pdf "\ade"             4
 2 's-Gravenhage_coalitieakkoord.pdf "\adeze"           1
 3 's-Gravenhage_coalitieakkoord.pdf "\aeen"            2
 4 's-Gravenhage_coalitieakkoord.pdf "\aer"             2
 5 's-Gravenhage_coalitieakkoord.pdf "\aextra"          2
 6 's-Gravenhage_coalitieakkoord.pdf "\agroei"          1
 7 's-Gravenhage_coalitieakkoord.pdf "\ahet"            1
 8 's-Gravenhage_coalitieakkoord.pdf "\amet"            1
 9 's-Gravenhage_coalitieakkoord.pdf "\aonderwijs,"     1
10 's-Gravenhage_coalitieakkoord.pdf "\aop"            11
# ... with 5,697 more rows
库(tidyverse)
图书馆(tidytext)
图书馆(tm)
目录%
整洁()%>%
过滤器(!grepl(“[0-9]+”,术语))
#A tibble:5707 x 3
文件期限计数
1 s-Gravenhage_coalitieakkoord.pdf“\ade”4
2 s-Gravenhage_coaliteakoord.pdf“\adeze”1
3 s-Gravenhage_coalitieakkoord.pdf“\aeen”2
4 s-Gravenhage_coaliteakoord.pdf“\aer”2
5 s-Gravenhage_coaliteakoord.pdf“\aextra”2
6 s-Gravenhage_coalitieakkoord.pdf“\agroei”1
7's-Gravenhage_coaliteakoord.pdf“\ahet”1
8 s-Gravenhage_coalitieakkoord.pdf“\amet”1
9's-Gravenhage_coaliteakoord.pdf“\aonderwijs,”1
10 s-Gravenhage_coalitieakkoord.pdf“\aop”11
# ... 还有5697行
只使用tidytext而不是tm

directory <- "D:/sample-pdfs"

# `pattern` is a regular expression (not a shell glob): match a literal
# ".pdf" suffix. List the directory once and derive the full paths from it.
pdf_names <- list.files(directory, pattern = "\\.pdf$")
pdfs <- file.path(directory, pdf_names)
pdfs_text <- map(pdfs, pdftools::pdf_text)

# One row per file; `text` is a list column holding the extracted text
my_data <- tibble(document = pdf_names, text = pdfs_text)

my_data %>%
  unnest(cols = text) %>% # expand the list column
  # strip_numeric drops purely numeric tokens; mixed ones such as "1e" remain
  unnest_tokens(word, text, strip_numeric = TRUE) %>%
  group_by(document, word) %>%
  summarise(count = n())
# A tibble: 4,646 x 3
# Groups:   document [?]
   document                          word                    count
   <chr>                             <chr>                   <int>
 1 's-Gravenhage_coalitieakkoord.pdf 1e                          2
 2 's-Gravenhage_coalitieakkoord.pdf 2e                          2
 3 's-Gravenhage_coalitieakkoord.pdf 3e                          1
 4 's-Gravenhage_coalitieakkoord.pdf 4e                          1
 5 's-Gravenhage_coalitieakkoord.pdf aan                       164
 6 's-Gravenhage_coalitieakkoord.pdf aanbesteding                2
 7 's-Gravenhage_coalitieakkoord.pdf aanbestedingen              1
 8 's-Gravenhage_coalitieakkoord.pdf aanbestedingsprocedures     1
 9 's-Gravenhage_coalitieakkoord.pdf aanbevelingen               1
10 's-Gravenhage_coalitieakkoord.pdf aanbieden                   4
# ... with 4,636 more rows

directory您可以使用tm将文档直接读入语料库。readPDF阅读器使用pdftools作为引擎。无需首先创建一组文本,通过语料库获取输出。我创建了两个示例。第一个与你所做的一致,但首先要通过语料库。第二个完全基于tidyverse+tidytext。不需要在tm、tidytext等之间切换

示例之间令牌数量的差异是由于tidytext/tokenizer中的自动清理造成的

如果您有很多文档要做,您可能希望使用
quanteda
作为您的主力,因为它可以在多个核心上即时工作,并可能加快标记器部分的速度。不要忘记使用
stopwords
软件包来获得荷兰stopwords的良好列表。如果您需要荷兰语单词的词性标注,请检查
udpipe
软件包

library(tidyverse)
library(tidytext)
library(tm)

directory <- "D:/sample-pdfs"

# Build the corpus straight from the directory with tm's readPDF reader and
# turn it into a document-term matrix in one step
converted <- DocumentTermMatrix(
  VCorpus(
    DirSource(directory),
    readerControl = list(reader = readPDF)
  )
)

# Tidy the matrix, then keep only the terms without any digit
filter(tidy(converted), !grepl("[0-9]+", term))

# A tibble: 5,707 x 3
   document                          term           count
   <chr>                             <chr>          <dbl>
 1 's-Gravenhage_coalitieakkoord.pdf "\ade"             4
 2 's-Gravenhage_coalitieakkoord.pdf "\adeze"           1
 3 's-Gravenhage_coalitieakkoord.pdf "\aeen"            2
 4 's-Gravenhage_coalitieakkoord.pdf "\aer"             2
 5 's-Gravenhage_coalitieakkoord.pdf "\aextra"          2
 6 's-Gravenhage_coalitieakkoord.pdf "\agroei"          1
 7 's-Gravenhage_coalitieakkoord.pdf "\ahet"            1
 8 's-Gravenhage_coalitieakkoord.pdf "\amet"            1
 9 's-Gravenhage_coalitieakkoord.pdf "\aonderwijs,"     1
10 's-Gravenhage_coalitieakkoord.pdf "\aop"            11
# ... with 5,697 more rows
库(tidyverse)
图书馆(tidytext)
图书馆(tm)
目录%
整洁()%>%
过滤器(!grepl(“[0-9]+”,术语))
#A tibble:5707 x 3
文件期限计数
1 s-Gravenhage_coalitieakkoord.pdf“\ade”4
2 s-Gravenhage_coaliteakoord.pdf“\adeze”1
3's-Gravenhage_coalitieakkoord.pdf“\aeen”
    # Directory that holds the PDF files
pdf_path <- "PDFs/"

# Full paths of the files in pdf_path whose names end in "pdf"
pdfs <- list.files(path = pdf_path, pattern = 'pdf$',  full.names = TRUE) 

# Import the PDFs into R; document variables are taken from the file names,
# split on "_" and named First_author and Year
# NOTE(review): presumably the files are named <First_author>_<Year>.pdf -
# confirm against the actual file names
spill_texts <- readtext(pdfs, 
                        docvarsfrom = "filenames", 
                        sep = "_", 
                        docvarnames = c("First_author", "Year"))

# Transform the imported texts into a corpus object and print it
spill_corpus  <- corpus(spill_texts)
spill_corpus
# Summarise the corpus and print the summary
tokenInfo <- summary(spill_corpus)
tokenInfo