Java ApachePOI-如何从Word文档中删除所有链接
我想删除Word文档的所有超链接并保留文本。我有这两种方法来读取带有doc和docx扩展名的word文档Java ApachePOI-如何从Word文档中删除所有链接,java,apache-poi,hwpf,xwpf,Java,Apache Poi,Hwpf,Xwpf,我想删除Word文档的所有超链接并保留文本。我有这两种方法来读取带有doc和docx扩展名的word文档 private void readDocXExtensionDocument(){ File inputFile = new File(inputFolderDir, "test.docx"); try { XWPFDocument document = new XWPFDocument(OPCPackage.open(new FileInputStrea
/**
 * Reads {@code test.docx} from {@code inputFolderDir} and prints the extracted
 * text to standard output.
 *
 * <p>Hyperlink fetching is switched on for the extractor before the text is read.
 * Any {@link InvalidFormatException} or {@link IOException} (including a missing
 * file) is reported via its stack trace and otherwise ignored.
 */
private void readDocXExtensionDocument() {
    File inputFile = new File(inputFolderDir, "test.docx");
    // try-with-resources closes the document and the file stream even when
    // extraction fails; the original left both open.
    try (FileInputStream in = new FileInputStream(inputFile);
         XWPFDocument document = new XWPFDocument(OPCPackage.open(in))) {
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        extractor.setFetchHyperlinks(true);
        System.out.println(extractor.getText());
    } catch (InvalidFormatException | IOException e) {
        // FileNotFoundException is an IOException, so it is handled here too.
        e.printStackTrace();
    }
}
/**
 * Reads {@code test.doc} from {@code inputFolderDir} and prints the paragraph
 * count followed by each paragraph on its own line.
 *
 * <p>An {@link IOException} (including a missing file) is reported via its stack
 * trace and otherwise ignored.
 */
private void readDocExtensionDocument() {
    File inputFile = new File(inputFolderDir, "test.doc");
    // try-with-resources closes the document and the file stream even when
    // extraction fails; the original left both open.
    try (FileInputStream in = new FileInputStream(inputFile);
         HWPFDocument document = new HWPFDocument(new POIFSFileSystem(in))) {
        WordExtractor wordExtractor = new WordExtractor(document);
        String[] paragraphs = wordExtractor.getParagraphText();
        System.out.println("Word document has " + paragraphs.length + " paragraphs");
        for (String paragraph : paragraphs) {
            // Drop the line terminator inside the paragraph text (\cM is
            // control-M, i.e. carriage return); println supplies the newline.
            System.out.println(paragraph.replaceAll("\\cM?\r?\n", ""));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private void readDocXExtensionDocument(){
File inputFile = new File(inputFolderDir, "test.docx");
try {
XWPFDocument document = new XWPFDocument(OPCPackage.open(new FileInputStream(inputFile)));
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
extractor.setFetchHyperlinks(true);
String context = extractor.getText();
System.out.println(context);
} catch (InvalidFormatException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
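private void readDocExtensionDocument(){
File inputFile = new File(inputFolderDir, "test.doc");
POIFSFileSystem fs;
try {
fs = new POIFSFileSystem(new FileInputStream(inputFile));
HWPFDocument document = new HWPFDocument(fs);
WordExtractor wordExtractor = new WordExtractor(document);
String[] paragraphs = wordExtractor.getParagraphText();
System.out.println("Word document has " + paragraphs.length + " paragraphs");
for(int i=0; i<paragraphs.length; i++){
paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
System.out.println(paragraphs[i]);
}
} catch (IOException e) {
e.printStackTrace();
}
}
我的解决方案(至少对于 .docx 这一类文档)是使用正则表达式。请看下面的代码: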
/** Matches one angle-bracketed span; group 1 is the text between the brackets. */
private static final Pattern BRACKETED_LINK = Pattern.compile("\\<(.+?)\\>");

/**
 * Reads {@code test.docx} from {@code inputFolderDir}, strips angle-bracketed
 * spans from the extracted text and prints the result to standard output.
 *
 * <p>Hyperlink fetching is switched on for the extractor; the angle-bracketed
 * spans in its output are presumably the hyperlink targets (confirm against the
 * extractor's documentation). Any {@link InvalidFormatException} or
 * {@link IOException} (including a missing file) is reported via its stack
 * trace and otherwise ignored.
 */
private void readDocXExtensionDocument() {
    File inputFile = new File(inputFolderDir, "test.docx");
    // try-with-resources closes the document and the file stream even when
    // extraction fails; the original left both open.
    try (FileInputStream in = new FileInputStream(inputFile);
         XWPFDocument document = new XWPFDocument(OPCPackage.open(in))) {
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        extractor.setFetchHyperlinks(true);
        System.out.println(stripBracketedLinks(extractor.getText()));
    } catch (InvalidFormatException | IOException e) {
        // FileNotFoundException is an IOException, so it is handled here too.
        e.printStackTrace();
    }
}

/**
 * Removes every angle-bracketed span found in {@code text}, and then every
 * remaining occurrence of the text that was inside the brackets.
 *
 * @param text the extracted document text
 * @return {@code text} with the bracketed spans and their contents removed
 */
private static String stripBracketedLinks(String text) {
    String cleaned = text;
    // The matcher scans the unmodified input, so edits to `cleaned` cannot
    // disturb the search.
    Matcher m = BRACKETED_LINK.matcher(text);
    while (m.find()) {
        // Literal replace, not replaceAll: URLs contain regex metacharacters
        // ('.', '?', '+', '(') that would mis-match or throw if used as a pattern.
        // Order matters: remove the bracketed form first, then the bare text.
        cleaned = cleaned.replace(m.group(0), "").replace(m.group(1), "");
    }
    return cleaned;
}
private void readDocXExtensionDocument(){
Pattern p = Pattern.compile("\\<(.+?)\\>");
File inputFile = new File(inputFolderDir, "test.docx");
try {
XWPFDocument document = new XWPFDocument(OPCPackage.open(new FileInputStream(inputFile)));
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
extractor.setFetchHyperlinks(true);
String context = extractor.getText();
Matcher m = p.matcher(context);
while (m.find()) {
String link = m.group(0); // the bracketed part
String textString = m.group(1); // the text of the link without the brackets
context = context.replaceAll(link, ""); // ordering important. Link then textString
context = context.replaceAll(textString, "");
}
System.out.println(context);
} catch (InvalidFormatException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
这种方法唯一需要注意的是:如果尖括号中包含的内容并不是链接,它也会被一并删除。如果您更清楚文档中可能出现的链接类型,可以尝试使用比我提供的更具体的正则表达式。