Java Word XML到RTF的转换
我需要通过编程将Word XML文件转换为RTF文件。由于一些第三方库的存在,它已经成为一种需求。有任何API/库可以做到这一点吗? 实际上语言不是问题,因为我只需要完成工作。但是Java、.NET语言或Python是首选。Java 我过去曾使用Apache POI进行解析。它似乎工作得很好。然后在这里写RTF .Net 关于在.Net中写入Word文档。我相信你可以用同一个库来读取 Python 对于Python 相关问题Java Word XML到RTF的转换,java,.net,python,xml,rtf,Java,.net,Python,Xml,Rtf,我需要通过编程将Word XML文件转换为RTF文件。由于一些第三方库的存在,它已经成为一种需求。有任何API/库可以做到这一点吗? 实际上语言不是问题,因为我只需要完成工作。但是Java、.NET语言或Python是首选。Java 我过去曾使用Apache POI进行解析。它似乎工作得很好。然后在这里写RTF .Net 关于在.Net中写入Word文档。我相信你可以用同一个库来读取 Python 对于Python 相关问题 另外,.看看。您必须自己设置它,因为我相信演示只允许您上载打开的off
另外,也可以看看。您必须自己搭建它,因为我相信演示版只允许您上传OpenOffice文档。您可以使用AutoIt自动在Word中打开XML文件并执行“另存为RTF”。我使用Word的用户定义函数将RTF文件保存为纯文本进行转换,效果很好。语法非常简单。
Python/linux方式:
import uno
from os.path import abspath, isfile, splitext
from com.sun.star.beans import PropertyValue
from com.sun.star.task import ErrorCodeIOException
from com.sun.star.connection import NoConnectException
# Document families: the coarse kind of document the office suite has loaded.
FAMILY_TEXT = "Text"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

# Port used when the caller does not pass one to DocumentConverter.
DEFAULT_OPENOFFICE_PORT = 8100

# Output file extension -> {document family -> value passed as the
# "FilterName" store property}. A family missing from an inner mapping
# means that conversion is not supported.
FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: "writer_pdf_Export",
        FAMILY_SPREADSHEET: "calc_pdf_Export",
        FAMILY_PRESENTATION: "impress_pdf_Export",
        FAMILY_DRAWING: "draw_pdf_Export",
    },
    "html": {
        FAMILY_TEXT: "HTML (StarWriter)",
        FAMILY_SPREADSHEET: "HTML (StarCalc)",
        FAMILY_PRESENTATION: "impress_html_Export",
    },
    # Text-only targets.
    "odt": {
        FAMILY_TEXT: "writer8",
    },
    "doc": {
        FAMILY_TEXT: "MS Word 97",
    },
    "rtf": {
        FAMILY_TEXT: "Rich Text Format",
    },
    "txt": {
        FAMILY_TEXT: "Text",
    },
    "docx": {
        FAMILY_TEXT: "MS Word 2007 XML",
    },
    # Spreadsheet-only targets.
    "ods": {
        FAMILY_SPREADSHEET: "calc8",
    },
    "xls": {
        FAMILY_SPREADSHEET: "MS Excel 97",
    },
    # Presentation-only targets.
    "odp": {
        FAMILY_PRESENTATION: "impress8",
    },
    "ppt": {
        FAMILY_PRESENTATION: "MS PowerPoint 97",
    },
    "swf": {
        FAMILY_PRESENTATION: "impress_flash_Export",
    },
}
class DocumentConverter:
    """Convert documents between formats via an OpenOffice.org process.

    The converter connects over a local UNO socket, loads the input file
    hidden, and stores it to the output path using the export filter that
    FILTER_MAP lists for the output extension and the document's family.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to the office instance listening on localhost at ``port``.

        Raises:
            Exception: if the connection is refused (NoConnectException).
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            self.context = resolver.resolve(
                "uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = self.context.ServiceManager.createInstanceWithContext(
            "com.sun.star.frame.Desktop", self.context)

    def convert(self, inputFile, outputFile):
        """Load ``inputFile`` and store it as ``outputFile``.

        The output format is chosen from the extension of ``outputFile``.
        The loaded document is always closed, including when refreshing,
        filter lookup or storing fails.

        Raises:
            Exception: if the document cannot be loaded, the output
                extension is unknown, or the conversion is unsupported
                for the document's family.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)
        document = self.desktop.loadComponentFromURL(
            inputUrl, "_blank", 0, self._toProperties(Hidden=True))
        if document is None:
            # The loader signals failure with a null component; report it
            # here instead of failing later with an opaque AttributeError.
            raise Exception("failed to load document: %s" % inputFile)
        # TODO: Check how document.setPropertyValue("DocumentTitle", ...) can
        # be set and set doc update mode to FULL_UPDATE
        try:
            if self._detectFamily(document) == FAMILY_TEXT:
                # Indexes are updated both before and after the refresh,
                # exactly as the original code did.
                self._updateIndexes(document)
                try:
                    document.refresh()
                except AttributeError:
                    # Best effort: presumably not every text document
                    # exposes refresh() -- TODO confirm.
                    pass
                self._updateIndexes(document)
            outputExt = self._getFileExt(outputFile)
            filterName = self._filterName(document, outputExt)
            document.storeToURL(outputUrl, self._toProperties(FilterName=filterName))
        finally:
            document.close(True)

    def _updateIndexes(self, document):
        """Call update() on every entry of the document's index collection."""
        indexes = document.getDocumentIndexes()
        for i in range(indexes.getCount()):
            indexes.getByIndex(i).update()

    def _filterName(self, document, outputExt):
        """Return the export filter name for ``document`` and ``outputExt``.

        Raises:
            Exception: if ``outputExt`` is not in FILTER_MAP, or the
                document's family has no filter for that extension.
        """
        family = self._detectFamily(document)
        try:
            filterByFamily = FILTER_MAP[outputExt]
        except KeyError:
            raise Exception("unknown output format: '%s'" % outputExt)
        try:
            return filterByFamily[family]
        except KeyError:
            raise Exception("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Return the FAMILY_* constant for the first service ``document`` supports.

        Raises:
            Exception: if the document supports none of the known services.
        """
        # NOTE: a GenericTextDocument is either a TextDocument, a WebDocument,
        # or a GlobalDocument, but this further distinction doesn't seem to
        # matter for conversions.
        servicesInOrder = (
            ("com.sun.star.text.GenericTextDocument", FAMILY_TEXT),
            ("com.sun.star.sheet.SpreadsheetDocument", FAMILY_SPREADSHEET),
            ("com.sun.star.presentation.PresentationDocument", FAMILY_PRESENTATION),
            ("com.sun.star.drawing.DrawingDocument", FAMILY_DRAWING),
        )
        for service, family in servicesInOrder:
            if document.supportsService(service):
                return family
        raise Exception("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of ``path`` without the dot.

        Returns an empty string when ``path`` has no extension.
        """
        # splitext always returns a string for the extension, so the
        # original "is not None" guard could never be false.
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of ``path``."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, **args):
        """Return a tuple of PropertyValue objects, one per keyword argument."""
        props = []
        for name, value in args.items():
            prop = PropertyValue()
            prop.Name = name
            prop.Value = value
            props.append(prop)
        return tuple(props)
if __name__ == "__main__":
    # Command-line entry point: convert argv[1] into argv[2].
    # Exit status: 255 on bad usage, 1 on a missing input file or a failed
    # conversion. The print(...) and "except ... as ..." forms below are
    # valid on both Python 2.6+ and Python 3.
    from sys import argv, exit

    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)
    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except Exception as exception:
        print("ERROR!" + str(exception))
        exit(1)
您需要OpenOffice UNO Bridge(在服务器上,您可以以无头模式运行OpenOffice)。
因此,您可以将每个OO可读格式转换为每个OO可写格式:
看
运行示例代码
/usr/lib64/openoffice.org/program/soffice.bin -accept=socket,host=localhost,port=8100\;urp -headless
Python示例:
import uno
from os.path import abspath, isfile, splitext
from com.sun.star.beans import PropertyValue
from com.sun.star.task import ErrorCodeIOException
from com.sun.star.connection import NoConnectException
# Document families: the coarse kind of document the office suite has loaded.
FAMILY_TEXT = "Text"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

# Port used when the caller does not pass one to DocumentConverter.
DEFAULT_OPENOFFICE_PORT = 8100

# Output file extension -> {document family -> value passed as the
# "FilterName" store property}. A family missing from an inner mapping
# means that conversion is not supported.
FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: "writer_pdf_Export",
        FAMILY_SPREADSHEET: "calc_pdf_Export",
        FAMILY_PRESENTATION: "impress_pdf_Export",
        FAMILY_DRAWING: "draw_pdf_Export",
    },
    "html": {
        FAMILY_TEXT: "HTML (StarWriter)",
        FAMILY_SPREADSHEET: "HTML (StarCalc)",
        FAMILY_PRESENTATION: "impress_html_Export",
    },
    # Text-only targets.
    "odt": {
        FAMILY_TEXT: "writer8",
    },
    "doc": {
        FAMILY_TEXT: "MS Word 97",
    },
    "rtf": {
        FAMILY_TEXT: "Rich Text Format",
    },
    "txt": {
        FAMILY_TEXT: "Text",
    },
    "docx": {
        FAMILY_TEXT: "MS Word 2007 XML",
    },
    # Spreadsheet-only targets.
    "ods": {
        FAMILY_SPREADSHEET: "calc8",
    },
    "xls": {
        FAMILY_SPREADSHEET: "MS Excel 97",
    },
    # Presentation-only targets.
    "odp": {
        FAMILY_PRESENTATION: "impress8",
    },
    "ppt": {
        FAMILY_PRESENTATION: "MS PowerPoint 97",
    },
    "swf": {
        FAMILY_PRESENTATION: "impress_flash_Export",
    },
}
class DocumentConverter:
    """Convert documents between formats via an OpenOffice.org process.

    The converter connects over a local UNO socket, loads the input file
    hidden, and stores it to the output path using the export filter that
    FILTER_MAP lists for the output extension and the document's family.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to the office instance listening on localhost at ``port``.

        Raises:
            Exception: if the connection is refused (NoConnectException).
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            self.context = resolver.resolve(
                "uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise Exception("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = self.context.ServiceManager.createInstanceWithContext(
            "com.sun.star.frame.Desktop", self.context)

    def convert(self, inputFile, outputFile):
        """Load ``inputFile`` and store it as ``outputFile``.

        The output format is chosen from the extension of ``outputFile``.
        The loaded document is always closed, including when refreshing,
        filter lookup or storing fails.

        Raises:
            Exception: if the document cannot be loaded, the output
                extension is unknown, or the conversion is unsupported
                for the document's family.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)
        document = self.desktop.loadComponentFromURL(
            inputUrl, "_blank", 0, self._toProperties(Hidden=True))
        if document is None:
            # The loader signals failure with a null component; report it
            # here instead of failing later with an opaque AttributeError.
            raise Exception("failed to load document: %s" % inputFile)
        # TODO: Check how document.setPropertyValue("DocumentTitle", ...) can
        # be set and set doc update mode to FULL_UPDATE
        try:
            if self._detectFamily(document) == FAMILY_TEXT:
                # Indexes are updated both before and after the refresh,
                # exactly as the original code did.
                self._updateIndexes(document)
                try:
                    document.refresh()
                except AttributeError:
                    # Best effort: presumably not every text document
                    # exposes refresh() -- TODO confirm.
                    pass
                self._updateIndexes(document)
            outputExt = self._getFileExt(outputFile)
            filterName = self._filterName(document, outputExt)
            document.storeToURL(outputUrl, self._toProperties(FilterName=filterName))
        finally:
            document.close(True)

    def _updateIndexes(self, document):
        """Call update() on every entry of the document's index collection."""
        indexes = document.getDocumentIndexes()
        for i in range(indexes.getCount()):
            indexes.getByIndex(i).update()

    def _filterName(self, document, outputExt):
        """Return the export filter name for ``document`` and ``outputExt``.

        Raises:
            Exception: if ``outputExt`` is not in FILTER_MAP, or the
                document's family has no filter for that extension.
        """
        family = self._detectFamily(document)
        try:
            filterByFamily = FILTER_MAP[outputExt]
        except KeyError:
            raise Exception("unknown output format: '%s'" % outputExt)
        try:
            return filterByFamily[family]
        except KeyError:
            raise Exception("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Return the FAMILY_* constant for the first service ``document`` supports.

        Raises:
            Exception: if the document supports none of the known services.
        """
        # NOTE: a GenericTextDocument is either a TextDocument, a WebDocument,
        # or a GlobalDocument, but this further distinction doesn't seem to
        # matter for conversions.
        servicesInOrder = (
            ("com.sun.star.text.GenericTextDocument", FAMILY_TEXT),
            ("com.sun.star.sheet.SpreadsheetDocument", FAMILY_SPREADSHEET),
            ("com.sun.star.presentation.PresentationDocument", FAMILY_PRESENTATION),
            ("com.sun.star.drawing.DrawingDocument", FAMILY_DRAWING),
        )
        for service, family in servicesInOrder:
            if document.supportsService(service):
                return family
        raise Exception("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of ``path`` without the dot.

        Returns an empty string when ``path`` has no extension.
        """
        # splitext always returns a string for the extension, so the
        # original "is not None" guard could never be false.
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of ``path``."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, **args):
        """Return a tuple of PropertyValue objects, one per keyword argument."""
        props = []
        for name, value in args.items():
            prop = PropertyValue()
            prop.Name = name
            prop.Value = value
            props.append(prop)
        return tuple(props)
if __name__ == "__main__":
    # Command-line entry point: convert argv[1] into argv[2].
    # Exit status: 255 on bad usage, 1 on a missing input file or a failed
    # conversion. The print(...) and "except ... as ..." forms below are
    # valid on both Python 2.6+ and Python 3.
    from sys import argv, exit

    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)
    try:
        converter = DocumentConverter()
        converter.convert(argv[1], argv[2])
    except Exception as exception:
        print("ERROR!" + str(exception))
        exit(1)
导入uno
从os.path导入abspath、isfile、splitext
从com.sun.star.beans导入PropertyValue
从com.sun.star.task导入ErrorCodeIOException
从com.sun.star.connection导入NoConnectionException
FAMILY_TEXT=“TEXT”
族\ u电子表格=“电子表格”
家庭展示=“展示”
族_DRAWING=“绘图”
默认\u OPENOFFICE\u端口=8100
过滤器映射={
“pdf”:{
家庭文本:“writer\u pdf\u导出”,
家庭电子表格:“calc\u pdf\u导出”,
家庭演示:“impress\u pdf\u导出”,
族\图形:“绘制\ pdf \导出”
},
“html”:{
家庭文本:“HTML(StarWriter)”,
家庭电子表格:“HTML(StarCalc)”,
家庭演示:“印象”\u html\u导出”
},
“odt”:{FAMILY_TEXT:“writer8”},
“doc”:{家庭文字:“MS Word 97”},
“rtf”:{FAMILY_TEXT:“富文本格式”},
“txt”:{FAMILY_TEXT:“TEXT”},
“docx”:{FAMILY_TEXT:“MS Word 2007 XML”},
“ods”:{家庭电子表格:“calc8”},
“xls”:{家庭电子表格:“MS Excel 97”},
“odp”:{家庭介绍:“印象8”},
“ppt”:{家庭演示文稿:“MS PowerPoint 97”},
“swf”:{家庭介绍:“印象深刻的闪光输出”}
}
类文档转换器:
def u u init u uuu(self,port=DEFAULT_OPENOFFICE_port):
localContext=uno.getComponentContext()
resolver=localContext.ServiceManager.createInstanceWithContext(“com.sun.star.bridge.UnoUrlResolver”,localContext)
尝试:
self.context=resolver.resolve(“uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext”%port)
除无连接例外:
引发异常,“无法连接到端口%s“%port”上的OpenOffice.org
self.desktop=self.context.ServiceManager.createInstanceWithContext(“com.sun.star.frame.desktop”,self.context)
def转换(自身、输入文件、输出文件):
inputUrl=self.\u文件URL(inputFile)
outputUrl=self.\u toFileUrl(outputFile)
document=self.desktop.loadComponentFromURL(inputUrl,“\u blank”,0,self.\u TopProperties(Hidden=True))
#setPropertyValue(“DocumentTitle”、“saf”)TODO:检查如何设置此选项,并将文档更新模式设置为完全更新
如果自检测家庭(文档)=家庭文本:
index=document.getDocumentIndexes()
对于范围(0,index.getCount())中的i:
index=index.getByIndex(i)
index.update()
尝试:
document.refresh()
除属性错误外:
通过
index=document.getDocumentIndexes()
对于范围(0,index.getCount())中的i:
index=index.getByIndex(i)
index.update()
outputExt=self.\u getFileExt(outputFile)
filterName=self.\u filterName(文档,outputExt)
尝试:
document.storeToURL(outputUrl,self.\u-toProperty(FilterName=FilterName))
最后:
文档关闭(True)
定义过滤器名称(自身、文档、输出文本):
家庭=自身。\u检测家庭(文件)
尝试:
filterByFamily=过滤器映射[outputExt]
除KeyError外:
引发异常,“未知输出格式:'%s'%OutputText”
尝试:
返回过滤器家庭[家庭]
除KeyError外:
引发异常,“不支持的转换:从'%s'到'%s'”(系列,outputExt)
def_detectFamily(自身、文档):
if document.supportsService(“com.sun.star.text.GenericTextDocument”):
#注意:GenericTextDocument可以是TextDocument、WebDocument或GlobalDocument
#但这种进一步的区别似乎对转换并不重要
返回族文本
if document.supportsService(“com.sun.star.sheet.SpreadsheetDocument”):
返回家庭电子表格
如果document.supportsService(“com.sun.star.presentation.PresentationDocument”):
返回家庭演示文稿
if document.supportsService(“com.sun.star.drawing.DrawingDocument”):
返回族图
引发异常,“未知文档系列:%s”%document
def_getFileExt(self,path):
ext=拆分ext(路径)[1]
如果ext不是None:
return ext[1:][.lower()
def(自身,路径):
返回uno.systemPathToFileUrl(abspath(path))
定义属性(自身,**参数):
道具=[]
对于输入参数:
prop=属性值()
prop.Name=key
属性值=参数[键]