Java 通过TIKA将word文档转换为带有嵌入图像的HTML
我刚接触Tika。我尝试使用Tika将Microsoft Word文档转换为HTML,并通过TikaOnDotNet包装器在.NET Framework上使用Tika。(标签:java、c#、apache-tika)我的转换代码如下所示:
// Question snippet: converts a Word document to HTML with Tika (via TikaOnDotNet/IKVM).
// No EmbeddedDocumentExtractor is registered here, so embedded images are not written
// anywhere and the generated <img src="embedded:..."> references stay unresolved.
byte[] file = Files.toByteArray(new File(@"myPath\document.doc"));
AutoDetectParser tikaParser = new AutoDetectParser();
ByteArrayOutputStream output = new ByteArrayOutputStream();

// Serialize the SAX events produced by the parser as indented UTF-8 HTML.
SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
handler.setResult(new StreamResult(output));
ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
tikaParser.parse(new ByteArrayInputStream(file), handler1, new Metadata());

// Write the buffered HTML to disk.
File ofile = new File(@"C:\toHtml\text.html");
ofile.createNewFile();
DataOutputStream stream = new DataOutputStream(new FileOutputStream(ofile));
try
{
    output.writeTo(stream);
}
finally
{
    // Always release the file handle; the original never closed the stream.
    stream.close();
}
除了嵌入的图像,其他一切都正常工作。生成的HTML包含图像标记,如:
<img src="embedded:image2.wmf" alt="image2.wmf"/>
但图像源文件并不存在。请告诉我该如何解决。【回答】功劳归于@Gagravarr。请注意,这是一个简化的代码实现,原始代码可以在问题的评论中找到。此实现基于TikaOnDotNet包装器。
/// <summary>
/// Converts a Word document to HTML with Apache Tika (through the TikaOnDotNet/IKVM
/// wrapper), saving embedded images to disk and rewriting the generated
/// <c>&lt;img src="embedded:..."&gt;</c> references so they point at the saved files.
/// </summary>
public class DocToHtml
{
    // Prefix Tika puts in front of embedded resource names in <img src="...">.
    private const string EmbeddedPrefix = "embedded:";
    // Relative URL prefix (forward slash: this is a URL, not a Windows path).
    private const string ImageUrlPrefix = "images/";
    private const string OutputDirectory = @"C:\toHtml";
    private const string OutputHtmlPath = @"C:\toHtml\text.html";
    private const string ImagesDirectory = @"C:\toHtml\images";

    private TikaConfig config = TikaConfig.getDefaultConfig();

    /// <summary>
    /// Parses <c>filename.doc</c>, writes the HTML to <c>C:\toHtml\text.html</c> and the
    /// embedded images to <c>C:\toHtml\images</c>.
    /// </summary>
    public void Convert()
    {
        byte[] file = Files.toByteArray(new File(@"filename.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        var inputStream = new ByteArrayInputStream(file);
        var metaData = new Metadata();

        // Use the detected charset for the output when there is one; otherwise fall back
        // to UTF-8 (the previous UTF-32 fallback produces HTML browsers will not decode).
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        var detected = encodingDetector.detect(inputStream, metaData);
        string encodingName = detected != null ? detected.toString() : "UTF-8";

        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encodingName);
        handler.setResult(new StreamResult(output));
        ContentHandler imageRewriting = new ImageRewritingContentHandler(handler);

        // The extractor registered on the context decides where embedded resources are saved.
        ParseContext context = new ParseContext();
        context.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentExtractor());
        tikaParser.parse(inputStream, imageRewriting, new Metadata(), context);

        // Make sure the target directory exists before writing the HTML.
        System.IO.Directory.CreateDirectory(OutputDirectory);
        System.IO.File.WriteAllBytes(OutputHtmlPath, output.toByteArray());
    }

    /// <summary>
    /// SAX decorator that rewrites <c>embedded:</c> image sources to the relative
    /// <c>images/</c> location and gives every image a fixed width.
    /// </summary>
    private class ImageRewritingContentHandler : ContentHandlerDecorator
    {
        public ImageRewritingContentHandler(ContentHandler handler) : base(handler)
        {
        }

        /// <summary>
        /// Forwards the element unchanged unless it is an <c>img</c>, in which case the
        /// attributes are copied, the <c>src</c> is rewritten and a width is added.
        /// </summary>
        public override void startElement(string uri, string localName, string name, Attributes origAttrs)
        {
            if (!"img".Equals(localName))
            {
                base.startElement(uri, localName, name, origAttrs);
                return;
            }

            // Always work on a copy so the caller's attribute object is never mutated.
            AttributesImpl attrs = new AttributesImpl(origAttrs);
            for (int i = 0; i < attrs.getLength(); i++)
            {
                if (!"src".Equals(attrs.getLocalName(i)))
                {
                    continue;
                }

                string src = attrs.getValue(i);
                if (src != null && src.StartsWith(EmbeddedPrefix, System.StringComparison.Ordinal))
                {
                    // Strip only the leading prefix; the rest of the name is kept verbatim.
                    attrs.setValue(i, ImageUrlPrefix + src.Substring(EmbeddedPrefix.Length));
                }
            }

            // SAX expects an empty (not null) namespace URI and an attribute type such as
            // "CDATA"; do not emit a duplicate attribute when a width is already present.
            if (attrs.getIndex("width") < 0)
            {
                attrs.addAttribute("", "width", "width", "CDATA", "100px");
            }

            base.startElement(uri, localName, name, attrs);
        }
    }

    /// <summary>
    /// Saves every embedded resource detected as an image into the images directory.
    /// </summary>
    private class FileEmbeddedDocumentExtractor : EmbeddedDocumentExtractor
    {
        // Counter used to build a file name for resources that have none.
        private int count = 0;

        public bool shouldParseEmbedded(Metadata m)
        {
            return true;
        }

        /// <summary>
        /// Writes the embedded resource to disk when its detected media type is
        /// <c>image/*</c>; other resources are ignored.
        /// </summary>
        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, bool outputHtml)
        {
            Detector detector = new DefaultDetector();
            string name = metadata.get("resourceName");
            MediaType contentType = detector.detect(inputStream, metadata);
            if (contentType == null || contentType.getType() != "image")
            {
                return;
            }

            // The resource name comes from the document itself: keep only the last path
            // component so it cannot escape the images directory, and fall back to a
            // generated name when it is missing (new File(parent, null) would throw).
            string fileName = name != null ? new File(name).getName() : null;
            if (string.IsNullOrEmpty(fileName))
            {
                fileName = "file" + count++;
            }

            File outputFile = new File(ImagesDirectory, fileName);
            File parent = outputFile.getParentFile();
            if (parent != null)
            {
                parent.mkdirs();
            }

            using (FileOutputStream os = new FileOutputStream(outputFile))
            {
                var tin = inputStream as TikaInputStream;
                DirectoryEntry container = tin != null ? tin.getOpenContainer() as DirectoryEntry : null;
                if (container != null)
                {
                    // Copy the already-open OLE2 container into a new filesystem before
                    // writing it; writing a freshly created one would produce an empty file.
                    POIFSFileSystem fs = new POIFSFileSystem();
                    org.apache.poi.poifs.filesystem.EntryUtils.copyNodes(container, fs.getRoot());
                    fs.writeFilesystem(os);
                }
                else
                {
                    // Plain stream (also when it is not a TikaInputStream): copy the bytes.
                    IOUtils.copy(inputStream, os);
                }
            }
        }
    }
}
公共类DocToHtml
{
私有TikaConfig config=TikaConfig.getDefaultConfig();
公共void Convert()
{
字节[]文件=Files.toByteArray(新文件(@“filename.doc”);
AutoDetectPasser tikaParser=新的AutoDetectPasser();
ByteArrayOutputStream输出=新建ByteArrayOutputStream();
SAXSTransformerFactory=(SAXSTransformerFactory)TransformerFactory.newInstance();
var inputStream=new ByteArrayInputStream(文件);
//ToHTMLContentHandler=新的ToHTMLContentHandler();
var metaData=新元数据();
EncodingDetector EncodingDetector=新的通用编码检测器();
var encode=encodingDetector.detect(inputStream,元数据)??新的UTF_32();
TransformerHandler=factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD,“html”);
handler.getTransformer().setOutputProperty(OutputKeys.INDENT,“是”);
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING,encode.toString());
setResult(新的StreamResult(输出));
ContentHandler imageRewriting=新的ImageRewritingContentHandler(handler);
//ExpandedTitleContentHandler handler1=新的ExpandedTitleContentHandler(handler);
ParseContext=新的ParseContext();
set(typeof(EmbeddedDocumentExtractor),新文件EmbeddedDocumentTractor());
parse(inputStream,imageRewriting,new Metadata(),context);
字节[]数组=输出。toByteArray();
System.IO.File.writealBytes(@“C:\toHtml\text.html”,数组);
}
私有类ImageRewritingContentHandler:ContentHandlerDecorator
{
公共图像重写ContentHandler(ContentHandler):基(handler)
{
}
public override void startElement(字符串uri、字符串localName、字符串名称、属性origAttrs)
{
if(“img”.Equals(localName))
{
属性impl attrs;
如果(origAttrs是AttributesImpl)
attrs=(AttributesImpl)origAttrs;
其他的
attrs=新属性impl(origAttrs);
对于(int i=0;i
评论:您是否忘记在ParseContext上设置一个合适的EmbeddedDocumentExtractor,用来指定要保存哪些资源以及保存到哪里?
评论:@Gagravarr 您能给我举个例子吗?我认为这就是主要原因。
评论:例如,TikaCLI里就有一个。评论:@Gagravarr 谢谢,我已经设法从doc中提取出图像,但在HTML中,如何更改每个已提取图像的src?评论:您还需要编写自己的自定义SAX处理程序。我能想到的最好的例子