Java ApachePOI-如何从Word文档中删除所有链接_Java_Apache Poi_Hwpf_Xwpf

Java ApachePOI-如何从Word文档中删除所有链接

java

Java ApachePOI-如何从Word文档中删除所有链接,java,apache-poi,hwpf,xwpf,Java,Apache Poi,Hwpf,Xwpf,我想删除Word文档的所有超链接并保留文本。我有这两种方法来读取带有doc和docx扩展名的word文档 private void readDocXExtensionDocument(){ File inputFile = new File(inputFolderDir, "test.docx"); try { XWPFDocument document = new XWPFDocument(OPCPackage.open(new FileInputStrea

我想删除Word文档的所有超链接并保留文本。我有这两种方法来读取带有doc和docx扩展名的word文档

/**
 * Reads {@code test.docx} from {@code inputFolderDir} and prints the extracted
 * text to standard output.
 *
 * <p>Hyperlink fetching is switched on before the text is extracted, so the
 * printed text includes whatever the extractor emits for hyperlinks.
 *
 * <p>Failures to open or parse the file are reported via
 * {@code printStackTrace()}; nothing is rethrown to the caller.
 */
private void readDocXExtensionDocument() {
    File inputFile = new File(inputFolderDir, "test.docx");
    // try-with-resources releases the stream, the document and the extractor
    // even when opening or extraction fails part-way through.
    try (FileInputStream in = new FileInputStream(inputFile);
         XWPFDocument document = new XWPFDocument(OPCPackage.open(in));
         XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
        extractor.setFetchHyperlinks(true);
        String context = extractor.getText();
        System.out.println(context);
    } catch (InvalidFormatException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Reads {@code test.doc} from {@code inputFolderDir}, prints the number of
 * paragraphs, then prints each paragraph with its trailing line terminator
 * stripped.
 *
 * <p>I/O failures are reported via {@code printStackTrace()}; nothing is
 * rethrown to the caller.
 */
private void readDocExtensionDocument() {
    File inputFile = new File(inputFolderDir, "test.doc");
    // try-with-resources releases the stream, file system, document and
    // extractor even when reading fails part-way through.
    try (FileInputStream in = new FileInputStream(inputFile);
         POIFSFileSystem fs = new POIFSFileSystem(in);
         HWPFDocument document = new HWPFDocument(fs);
         WordExtractor wordExtractor = new WordExtractor(document)) {
        String[] paragraphs = wordExtractor.getParagraphText();
        System.out.println("Word document has " + paragraphs.length + " paragraphs");
        for (int i = 0; i < paragraphs.length; i++) {
            // Drop an optional Ctrl-M / carriage return followed by a newline.
            paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
            System.out.println(paragraphs[i]);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

private void readDocXExtensionDocument（）{
File inputFile=新文件（inputFolderDir，“test.docx”）；
试一试{
XWPFDocument document=newxwpfdocument（OPCPackage.open（newfileinputstream（inputFile）））；
XWPFWordExtractor extractor=新的XWPFWordExtractor（文件）；
setFetchHyperlinks（true）；
字符串上下文=提取器.getText（）；
System.out.println（上下文）；
}捕获（无效格式）{
e、 printStackTrace（）；
}catch（filenotfounde异常）{
e、 printStackTrace（）；
}捕获（IOE异常）{
e、 printStackTrace（）；
}
}
私有void readDocExtensionDocument（）{
File inputFile=新文件（inputFolderDir，“test.doc”）；
POIFSFS；
试一试{
fs=新的POIFSSFILE（新文件InputStream（inputFile））；
HWPF文件=新的HWPF文件（fs）；
WordExtractor WordExtractor=新的WordExtractor（文档）；
String[]段落=wordExtractor.getParagraphText（）；
System.out.println（“Word文档有“+段落.长度+”段落”）；
对于（inti=0；i

我的解决方案（至少对于 .docx 这一类文件）是使用正则表达式。请看下面的代码：
/**
 * Reads {@code test.docx} from {@code inputFolderDir}, removes every
 * angle-bracketed span from the extracted text, and prints the result to
 * standard output.
 *
 * <p>For each {@code <...>} match, the bracketed span is removed first and then
 * every remaining occurrence of the text that was inside the brackets. Both
 * removals are literal: the captured text is never interpreted as a regular
 * expression, so links containing characters such as {@code ?}, {@code +} or
 * {@code (} are handled correctly.
 *
 * <p>Note that any angle-bracketed text is removed, whether or not it is a
 * link. Failures to open or parse the file are reported via
 * {@code printStackTrace()}; nothing is rethrown to the caller.
 */
private void readDocXExtensionDocument() {
    Pattern p = Pattern.compile("\\<(.+?)\\>");
    File inputFile = new File(inputFolderDir, "test.docx");
    // try-with-resources releases the stream, the document and the extractor
    // even when opening or extraction fails part-way through.
    try (FileInputStream in = new FileInputStream(inputFile);
         XWPFDocument document = new XWPFDocument(OPCPackage.open(in));
         XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
        extractor.setFetchHyperlinks(true);
        String context = extractor.getText();
        // The matcher keeps scanning the originally extracted string; the
        // reassignments of `context` below do not affect it.
        Matcher m = p.matcher(context);
        while (m.find()) {
            String link = m.group(0);       // the bracketed part, brackets included
            String textString = m.group(1); // the text inside the brackets
            // Order matters: remove the bracketed form first, then the bare text.
            // String.replace is literal, unlike replaceAll which parses a regex.
            context = context.replace(link, "");
            context = context.replace(textString, "");
        }
        System.out.println(context);
    } catch (InvalidFormatException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

private void readDocXExtensionDocument（）{
Pattern p=Pattern.compile（“\\”）；
File inputFile=新文件（inputFolderDir，“test.docx”）；
试一试{
XWPFDocument document=newxwpfdocument（OPCPackage.open（newfileinputstream（inputFile）））；
XWPFWordExtractor extractor=新的XWPFWordExtractor（文件）；
setFetchHyperlinks（true）；
字符串上下文=提取器.getText（）；
Matcher m=p.Matcher（上下文）；
while（m.find（））{
String link=m.group（0）；//带括号的部分
String textString=m.group（1）；//不带括号的链接文本
context=context.replaceAll（link，“”；//排序重要。链接然后文本字符串
context=context.replaceAll（textString，“”）；
}
System.out.println（上下文）；
}捕获（无效格式）{
e、 printStackTrace（）；
}catch（filenotfounde异常）{
e、 printStackTrace（）；
}捕获（IOE异常）{
e、 printStackTrace（）；
}
}

这种方法唯一需要注意的是：如果尖括号中的内容并不是链接，它同样会被删除。如果您更清楚文档中可能出现的链接类型，可以尝试使用比我提供的更具体的正则表达式。