如何在java中读取或解析MHTML(.mht)文件
我需要挖掘大多数已知文档文件的内容,如:如何在java中读取或解析MHTML(.mht)文件,java,parsing,compression,mhtml,Java,Parsing,Compression,Mhtml,我需要挖掘大多数已知文档文件的内容,如: pdf html doc/docx等 对于我计划使用的大多数文件格式: 但是到目前为止,Tika不支持MHTML(*.mht)文件。。( ) C#()中的示例很少,但我在Java中没有找到 我尝试用7Zip打开*.mht文件,但失败了…尽管WinZip能够将文件解压缩为图像和文本(CSS、HTML、脚本)作为文本和二进制文件 根据MSDN第页()和我前面提到的代码项目第页。。。mht文件使用GZip压缩 尝试在java中解压缩会导致以下异常: 使用j
Tika
不支持MHTML(*.mht)文件。。( )
C#()中的示例很少,但我在Java中没有找到
我尝试用7Zip打开*.mht文件,但失败了…尽管WinZip能够将文件解压缩为图像和文本(CSS、HTML、脚本)作为文本和二进制文件
根据MSDN页面()和我前面提到的Code Project
页面……mht文件使用GZip压缩
尝试在java中解压缩会导致以下异常:
使用java.util.zip.GZIPInputStream
java.io.IOException: Not in GZIP format
at java.util.zip.GZIPInputStream.readHeader(Unknown Source)
at java.util.zip.GZIPInputStream.<init>(Unknown Source)
at java.util.zip.GZIPInputStream.<init>(Unknown Source)
at GZipTest.main(GZipTest.java:16)
请建议如何解压
谢谢……你可以试试,它可以打包/解包,你可以像处理普通文件一样处理它。下载链接是:我曾用它来解析/读取/索引mht文件(但是作为普通文件,而不是压缩文件)。坦白地说,我本来并不指望在短期内找到解决方案,甚至打算放弃,但后来我偶然发现了这个页面: 虽然第一眼看上去不是很吸引人,但如果你仔细看,就会得到线索。读完这篇文章后,我启动了IE,随机地将一些页面保存为
*.mht
文件。让我一行一行去
但是让我事先解释一下,我的最终目标是分离出html
内容并对其进行解析。。。解决方案本身并不完整,因为它取决于保存时选择的字符集
或编码。但即使它会提取带有小故障的单个文件
我希望这对任何试图解析/解压*.mht/MHTML
文件的人都有用:)
=======说明========
**取自mht文件**
From: "Saved by Windows Internet Explorer 7"
它是用于保存文件的软件
Subject: Google
Date: Tue, 13 Jul 2010 21:23:03 +0530
MIME-Version: 1.0
主题、日期和mime版本…与邮件格式非常相似
Content-Type: multipart/related;
type="text/html";
这部分告诉我们它是一个多部分
文档。多部分文档在单个正文中组合了一组或多组不同的数据,实体的标题中必须出现multipart
内容类型字段。在这里,我们还可以看到类型为“text/html”
这是其中最重要的部分(boundary)。这是将各个不同部分(html、图像、css、脚本等)分开的唯一分隔符。一旦你掌握了这个,一切都会变得轻松……现在,我只需遍历文档,找出不同的部分,并根据它们的Content-Transfer-Encoding(base64、quoted-printable等)保存它们……
.
.
样本
------=_NextPart_000_0007_01CB22D1.93BBD1A0
Content-Type: text/html;
charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" =
.
.
.
主解析器类
/**
* This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.test.mht.core;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import sun.misc.BASE64Decoder;
/**
 * Splits an MHTML (*.mht) web archive into its constituent MIME parts and writes
 * every part as a separate file into the output folder.
 *
 * <p>An MHT file is a plain-text MIME {@code multipart/related} document, not a
 * compressed archive. Parts are separated by a delimiter line
 * ({@code "--" + boundary}); each part consists of header lines, one blank line
 * and a body encoded as base64, quoted-printable or plain text.
 *
 * <p>The file is read as ISO-8859-1 so that every byte maps to exactly one char;
 * decoded part bodies are written back as raw bytes, i.e. in whatever character
 * set the archive declared for them. Line ends of non-base64 parts are normalised
 * to LF. Base64 decoding uses {@code java.util.Base64}, so the internal
 * {@code sun.misc.BASE64Decoder} class is not needed by this class.
 *
 * @author Manish Shukla
 */
public class MHTParser implements IConstants
{
    /** Matches a "name.ext" token inside the last path segment of a Content-Location. */
    private static final Pattern FILE_NAME_PATTERN = Pattern.compile("(\\w|_|-)+\\.\\w+");

    /** Matches {@code boundary="value"} (group 1) or {@code boundary=value} (group 2). */
    private static final Pattern BOUNDARY_PATTERN = Pattern.compile(
            "\\b" + BOUNDARY + "\\s*=\\s*(?:\"([^\"]*)\"|([^;\\s]+))", Pattern.CASE_INSENSITIVE);

    /** Byte-transparent charset used for reading the archive and for undecoded bodies. */
    private static final java.nio.charset.Charset RAW = java.nio.charset.StandardCharsets.ISO_8859_1;

    private File mhtFile;
    private File outputFolder;

    /**
     * @param mhtFile      the *.mht archive to read
     * @param outputFolder folder receiving the extracted parts; created on demand
     */
    public MHTParser(File mhtFile, File outputFolder) {
        this.mhtFile = mhtFile;
        this.outputFolder = outputFolder;
    }

    /**
     * Reads the archive and writes every part to the output folder.
     *
     * @throws Exception if no boundary declaration is found, or if reading,
     *                   decoding or writing fails
     */
    public void decompress() throws Exception
    {
        try (BufferedReader reader = new BufferedReader(
                new java.io.InputStreamReader(new java.io.FileInputStream(mhtFile), RAW)))
        {
            final String boundary = getBoundary(reader);
            if(boundary == null)
                throw new Exception("Failed to find document 'boundary'... Aborting");
            final String delimiter = "--" + boundary;

            // Per-part state; reset at every delimiter so nothing leaks into the next part.
            String type = "";
            String encoding = "";
            String location = "";
            StringBuilder buffer = null;   // null while outside of any part (preamble/epilogue)
            boolean inHeaders = false;

            String line;
            while((line = reader.readLine()) != null)
            {
                final String temp = line.trim();
                if(temp.startsWith(delimiter))
                {
                    if(buffer != null)
                        writeBufferContentToFile(buffer, encoding, getFileName(location, type));
                    // "--boundary--" terminates the archive; whatever follows is epilogue.
                    buffer = temp.startsWith(delimiter + "--") ? null : new StringBuilder();
                    type = "";
                    encoding = "";
                    location = "";
                    inHeaders = true;
                    continue;
                }
                if(buffer == null)
                    continue;
                if(!inHeaders)
                {
                    buffer.append(line).append('\n');
                    continue;
                }
                // Header section: the first blank line starts the body. Only headers are
                // interpreted here, so body lines that merely look like headers are safe.
                if(temp.length() == 0)
                    inHeaders = false;
                else if(hasPrefix(temp, CONTENT_TYPE))
                    type = headerValue(temp);
                else if(hasPrefix(temp, CONTENT_TRANSFER_ENCODING))
                    encoding = headerValue(temp);
                else if(hasPrefix(temp, CONTENT_LOCATION))
                    location = temp.substring(temp.indexOf(":") + 1).trim();
            }
            // Truncated archive without closing delimiter: keep what was collected.
            if(buffer != null && buffer.length() > 0)
                writeBufferContentToFile(buffer, encoding, getFileName(location, type));
        }
    }

    /** Case-insensitive {@code startsWith}; MIME header names are not case sensitive. */
    private static boolean hasPrefix(String line, String prefix) {
        return line.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Returns the header value after the first ':' without any ";parameter" suffix. */
    private static String headerValue(String line) {
        String value = line.substring(line.indexOf(':') + 1);
        final int semicolon = value.indexOf(';');
        if(semicolon >= 0)
            value = value.substring(0, semicolon);
        return value.trim();
    }

    /**
     * Decodes the collected body according to its transfer encoding and writes the
     * resulting bytes to {@code filename}.
     *
     * @param buffer   raw body text of the part (lines joined with LF)
     * @param encoding value of Content-Transfer-Encoding; anything other than
     *                 base64 / quoted-printable is written unchanged
     * @param filename absolute path of the file to create
     */
    private void writeBufferContentToFile(StringBuilder buffer, String encoding, String filename)
        throws Exception
    {
        if(!outputFolder.isDirectory() && !outputFolder.mkdirs())
            throw new java.io.IOException("Unable to create output folder: " + outputFolder);

        final byte[] content;
        if(encoding.equalsIgnoreCase("base64"))
            content = java.util.Base64.getMimeDecoder().decode(buffer.toString());
        else if(encoding.equalsIgnoreCase("quoted-printable"))
            content = decodeQuotedPrintable(buffer.toString());
        else
            content = buffer.toString().getBytes(RAW);

        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filename)))
        {
            bos.write(content);
        }
    }

    /**
     * Decodes quoted-printable text: "=\n" soft line breaks are removed and "=XX"
     * escapes become the byte 0xXX. A leading encoded UTF-8 byte order mark
     * ('=EF=BB=BF') is dropped. A '=' that is not followed by two hex digits is
     * copied literally.
     *
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     */
    private static byte[] decodeQuotedPrintable(String text)
    {
        if(text.startsWith(UTF8_BOM))
            text = text.substring(UTF8_BOM.length());

        final int length = text.length();
        final java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(length);
        int pos = 0;
        while(pos < length)
        {
            final char c = text.charAt(pos);
            if(c != '=')
            {
                out.write(c);
                pos++;
            }
            else if(pos + 1 < length && text.charAt(pos + 1) == '\n')
            {
                pos += 2;   // soft line break
            }
            else
            {
                final int hi = pos + 2 < length ? Character.digit(text.charAt(pos + 1), 16) : -1;
                final int lo = pos + 2 < length ? Character.digit(text.charAt(pos + 2), 16) : -1;
                if(hi >= 0 && lo >= 0)
                {
                    out.write((hi << 4) | lo);
                    pos += 3;
                }
                else
                {
                    out.write('=');
                    pos++;
                }
            }
        }
        return out.toByteArray();
    }

    /**
     * Builds the output path for a part. The name is the last "name.ext" token of
     * the Content-Location; locations ending in '/' become 'main.&lt;ext&gt;' and
     * everything else 'unknown.&lt;ext&gt;', where the extension is the MIME
     * sub-type ('jpg' for jpeg, 'bin' when the type is missing or malformed).
     */
    private String getFileName(String location, String type)
    {
        String ext;
        if(type.toLowerCase(java.util.Locale.ROOT).endsWith("jpeg"))
            ext = "jpg";
        else
        {
            final String[] typeParts = type.split("/");
            // Keep only characters that are harmless inside a file name.
            ext = typeParts.length > 1 ? typeParts[1].replaceAll("[^A-Za-z0-9+._-]", "") : "";
            if(ext.length() == 0)
                ext = "bin";
        }

        if(location.endsWith("/"))
            return getUniqueName("main", ext);

        final Matcher m = FILE_NAME_PATTERN.matcher(location.substring(location.lastIndexOf("/") + 1));
        String fname = "";
        while(m.find())
            fname = m.group();
        if(fname.length() == 0)
            return getUniqueName("unknown", ext);

        final int dot = fname.indexOf(".");
        return getUniqueName(fname.substring(0, dot), fname.substring(dot + 1));
    }

    /**
     * Returns an absolute path inside the output folder that does not exist yet.
     * When 'name.ext' is already taken a numerical suffix is appended (name1.ext,
     * name2.ext, ...).
     */
    private String getUniqueName(String name, String ext)
    {
        File file = new File(outputFolder, name + "." + ext);
        for(int i = 1; file.exists(); i++)
            file = new File(outputFolder, name + i + "." + ext);
        return file.getAbsolutePath();
    }

    /**
     * Scans forward for the first non-empty {@code boundary=} declaration (quoted
     * or unquoted) and returns its value, or {@code null} when the end of the file
     * is reached without finding one.
     */
    private String getBoundary(BufferedReader reader) throws Exception
    {
        String line;
        while((line = reader.readLine()) != null)
        {
            final Matcher m = BOUNDARY_PATTERN.matcher(line);
            if(m.find())
            {
                final String value = m.group(1) != null ? m.group(1) : m.group(2);
                if(value.length() > 0)
                    return value;
            }
        }
        return null;
    }
}
/**
*本程序及其附带的资料是根据Eclipse公共许可证v1.0的条款提供的
*随本发行版发行,可在
* http://www.eclipse.org/legal/epl-v10.html
*/
包com.test.mht.core;
导入java.io.BufferedOutputStream;
导入java.io.BufferedReader;
导入java.io.BufferedWriter;
导入java.io.File;
导入java.io.FileOutputStream;
导入java.io.FileReader;
导入java.io.OutputStreamWriter;
导入java.util.regex.Matcher;
导入java.util.regex.Pattern;
导入sun.misc.base64解码器;
/**
*要在其组成部分中解析和分解*.mts文件的文件。
*@作者Manish Shukla
*/
公共类MHTParser实现IConstants
{
私有文件mhtFile;
私有文件输出文件夹;
公共MHTParser(文件mhtFile、文件输出文件夹){
this.mhtFile=mhtFile;
this.outputFolder=outputFolder;
}
/**
*@抛出异常
*/
public void decompress()引发异常
{
BufferedReader reader=null;
字符串类型=”;
字符串编码=”;
字符串位置=”;
字符串filename=“”;
字符串charset=“utf-8”;
StringBuilder缓冲区=null;
尝试
{
reader=newbufferedreader(newfilereader(mhtFile));
最终字符串边界=getBoundary(读卡器);
如果(边界==null)
抛出新异常(“未能找到文档“边界”…正在中止”);
字符串行=null;
int i=1;
而((line=reader.readLine())!=null)
{
字符串温度=line.trim();
如果(温度包含(边界))
{
if(缓冲区!=null){
writeBufferContentToFile(缓冲区、编码、文件名、字符集);
缓冲区=空;
}
缓冲区=新的StringBuilder();
}else if(临时启动(内容类型)){
type=getType(temp);
}else if(临时启动(字符集)){
charset=getCharSet(temp);
}else if(临时启动(内容\传输\编码)){
编码=getEncoding(临时);
}else if(临时启动(内容\位置)){
location=temp.substring(temp.indexOf(“:”)+1.trim();
i++;
filename=getFileName(位置、类型);
}否则{
if(缓冲区!=null){
buffer.append(第+行“\n”);
}
}
}
}最后
{
if(null!=读取器)
reader.close();
}
}
私有字符串getCharSet(字符串临时值)
{
字符串t=temp.split(“=”[1]。trim();
返回t.substring(1,t.length()-1);
}
/**
*按照字符集和编码保存文件
*/
私有void writeBufferContentToFile(StringBuilder缓冲区、字符串编码、字符串文件名、字符串字符
------=_NextPart_000_0007_01CB22D1.93BBD1A0
Content-Type: text/html;
charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" =
.
.
.
public interface IConstants
{
public String BOUNDARY = "boundary";
public String CHAR_SET = "charset";
public String CONTENT_TYPE = "Content-Type";
public String CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding";
public String CONTENT_LOCATION = "Content-Location";
public String UTF8_BOM = "=EF=BB=BF";
public String UTF16_BOM1 = "=FF=FE";
public String UTF16_BOM2 = "=FE=FF";
}
/**
* This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.test.mht.core;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import sun.misc.BASE64Decoder;
/**
 * Splits an MHTML (*.mht) web archive into its constituent MIME parts and writes
 * every part as a separate file into the output folder.
 *
 * <p>An MHT file is a plain-text MIME {@code multipart/related} document, not a
 * compressed archive. Parts are separated by a delimiter line
 * ({@code "--" + boundary}); each part consists of header lines, one blank line
 * and a body encoded as base64, quoted-printable or plain text.
 *
 * <p>The file is read as ISO-8859-1 so that every byte maps to exactly one char;
 * decoded part bodies are written back as raw bytes, i.e. in whatever character
 * set the archive declared for them. Line ends of non-base64 parts are normalised
 * to LF. Base64 decoding uses {@code java.util.Base64}, so the internal
 * {@code sun.misc.BASE64Decoder} class is not needed by this class.
 *
 * @author Manish Shukla
 */
public class MHTParser implements IConstants
{
    /** Matches a "name.ext" token inside the last path segment of a Content-Location. */
    private static final Pattern FILE_NAME_PATTERN = Pattern.compile("(\\w|_|-)+\\.\\w+");

    /** Matches {@code boundary="value"} (group 1) or {@code boundary=value} (group 2). */
    private static final Pattern BOUNDARY_PATTERN = Pattern.compile(
            "\\b" + BOUNDARY + "\\s*=\\s*(?:\"([^\"]*)\"|([^;\\s]+))", Pattern.CASE_INSENSITIVE);

    /** Byte-transparent charset used for reading the archive and for undecoded bodies. */
    private static final java.nio.charset.Charset RAW = java.nio.charset.StandardCharsets.ISO_8859_1;

    private File mhtFile;
    private File outputFolder;

    /**
     * @param mhtFile      the *.mht archive to read
     * @param outputFolder folder receiving the extracted parts; created on demand
     */
    public MHTParser(File mhtFile, File outputFolder) {
        this.mhtFile = mhtFile;
        this.outputFolder = outputFolder;
    }

    /**
     * Reads the archive and writes every part to the output folder.
     *
     * @throws Exception if no boundary declaration is found, or if reading,
     *                   decoding or writing fails
     */
    public void decompress() throws Exception
    {
        try (BufferedReader reader = new BufferedReader(
                new java.io.InputStreamReader(new java.io.FileInputStream(mhtFile), RAW)))
        {
            final String boundary = getBoundary(reader);
            if(boundary == null)
                throw new Exception("Failed to find document 'boundary'... Aborting");
            final String delimiter = "--" + boundary;

            // Per-part state; reset at every delimiter so nothing leaks into the next part.
            String type = "";
            String encoding = "";
            String location = "";
            StringBuilder buffer = null;   // null while outside of any part (preamble/epilogue)
            boolean inHeaders = false;

            String line;
            while((line = reader.readLine()) != null)
            {
                final String temp = line.trim();
                if(temp.startsWith(delimiter))
                {
                    if(buffer != null)
                        writeBufferContentToFile(buffer, encoding, getFileName(location, type));
                    // "--boundary--" terminates the archive; whatever follows is epilogue.
                    buffer = temp.startsWith(delimiter + "--") ? null : new StringBuilder();
                    type = "";
                    encoding = "";
                    location = "";
                    inHeaders = true;
                    continue;
                }
                if(buffer == null)
                    continue;
                if(!inHeaders)
                {
                    buffer.append(line).append('\n');
                    continue;
                }
                // Header section: the first blank line starts the body. Only headers are
                // interpreted here, so body lines that merely look like headers are safe.
                if(temp.length() == 0)
                    inHeaders = false;
                else if(hasPrefix(temp, CONTENT_TYPE))
                    type = headerValue(temp);
                else if(hasPrefix(temp, CONTENT_TRANSFER_ENCODING))
                    encoding = headerValue(temp);
                else if(hasPrefix(temp, CONTENT_LOCATION))
                    location = temp.substring(temp.indexOf(":") + 1).trim();
            }
            // Truncated archive without closing delimiter: keep what was collected.
            if(buffer != null && buffer.length() > 0)
                writeBufferContentToFile(buffer, encoding, getFileName(location, type));
        }
    }

    /** Case-insensitive {@code startsWith}; MIME header names are not case sensitive. */
    private static boolean hasPrefix(String line, String prefix) {
        return line.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Returns the header value after the first ':' without any ";parameter" suffix. */
    private static String headerValue(String line) {
        String value = line.substring(line.indexOf(':') + 1);
        final int semicolon = value.indexOf(';');
        if(semicolon >= 0)
            value = value.substring(0, semicolon);
        return value.trim();
    }

    /**
     * Decodes the collected body according to its transfer encoding and writes the
     * resulting bytes to {@code filename}.
     *
     * @param buffer   raw body text of the part (lines joined with LF)
     * @param encoding value of Content-Transfer-Encoding; anything other than
     *                 base64 / quoted-printable is written unchanged
     * @param filename absolute path of the file to create
     */
    private void writeBufferContentToFile(StringBuilder buffer, String encoding, String filename)
        throws Exception
    {
        if(!outputFolder.isDirectory() && !outputFolder.mkdirs())
            throw new java.io.IOException("Unable to create output folder: " + outputFolder);

        final byte[] content;
        if(encoding.equalsIgnoreCase("base64"))
            content = java.util.Base64.getMimeDecoder().decode(buffer.toString());
        else if(encoding.equalsIgnoreCase("quoted-printable"))
            content = decodeQuotedPrintable(buffer.toString());
        else
            content = buffer.toString().getBytes(RAW);

        try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filename)))
        {
            bos.write(content);
        }
    }

    /**
     * Decodes quoted-printable text: "=\n" soft line breaks are removed and "=XX"
     * escapes become the byte 0xXX. A leading encoded UTF-8 byte order mark
     * ('=EF=BB=BF') is dropped. A '=' that is not followed by two hex digits is
     * copied literally.
     *
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     */
    private static byte[] decodeQuotedPrintable(String text)
    {
        if(text.startsWith(UTF8_BOM))
            text = text.substring(UTF8_BOM.length());

        final int length = text.length();
        final java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(length);
        int pos = 0;
        while(pos < length)
        {
            final char c = text.charAt(pos);
            if(c != '=')
            {
                out.write(c);
                pos++;
            }
            else if(pos + 1 < length && text.charAt(pos + 1) == '\n')
            {
                pos += 2;   // soft line break
            }
            else
            {
                final int hi = pos + 2 < length ? Character.digit(text.charAt(pos + 1), 16) : -1;
                final int lo = pos + 2 < length ? Character.digit(text.charAt(pos + 2), 16) : -1;
                if(hi >= 0 && lo >= 0)
                {
                    out.write((hi << 4) | lo);
                    pos += 3;
                }
                else
                {
                    out.write('=');
                    pos++;
                }
            }
        }
        return out.toByteArray();
    }

    /**
     * Builds the output path for a part. The name is the last "name.ext" token of
     * the Content-Location; locations ending in '/' become 'main.&lt;ext&gt;' and
     * everything else 'unknown.&lt;ext&gt;', where the extension is the MIME
     * sub-type ('jpg' for jpeg, 'bin' when the type is missing or malformed).
     */
    private String getFileName(String location, String type)
    {
        String ext;
        if(type.toLowerCase(java.util.Locale.ROOT).endsWith("jpeg"))
            ext = "jpg";
        else
        {
            final String[] typeParts = type.split("/");
            // Keep only characters that are harmless inside a file name.
            ext = typeParts.length > 1 ? typeParts[1].replaceAll("[^A-Za-z0-9+._-]", "") : "";
            if(ext.length() == 0)
                ext = "bin";
        }

        if(location.endsWith("/"))
            return getUniqueName("main", ext);

        final Matcher m = FILE_NAME_PATTERN.matcher(location.substring(location.lastIndexOf("/") + 1));
        String fname = "";
        while(m.find())
            fname = m.group();
        if(fname.length() == 0)
            return getUniqueName("unknown", ext);

        final int dot = fname.indexOf(".");
        return getUniqueName(fname.substring(0, dot), fname.substring(dot + 1));
    }

    /**
     * Returns an absolute path inside the output folder that does not exist yet.
     * When 'name.ext' is already taken a numerical suffix is appended (name1.ext,
     * name2.ext, ...).
     */
    private String getUniqueName(String name, String ext)
    {
        File file = new File(outputFolder, name + "." + ext);
        for(int i = 1; file.exists(); i++)
            file = new File(outputFolder, name + i + "." + ext);
        return file.getAbsolutePath();
    }

    /**
     * Scans forward for the first non-empty {@code boundary=} declaration (quoted
     * or unquoted) and returns its value, or {@code null} when the end of the file
     * is reached without finding one.
     */
    private String getBoundary(BufferedReader reader) throws Exception
    {
        String line;
        while((line = reader.readLine()) != null)
        {
            final Matcher m = BOUNDARY_PATTERN.matcher(line);
            if(m.find())
            {
                final String value = m.group(1) != null ? m.group(1) : m.group(2);
                if(value.length() > 0)
                    return value;
            }
        }
        return null;
    }
}
<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j</artifactId>
<version>0.7.2</version>
</dependency>
/**
 * Starts {@code MessageTree} with a single, hard-coded argument; the incoming
 * {@code args} are ignored. The placeholder string is meant to be replaced with
 * the path of the MHT file to open.
 * NOTE(review): MessageTree is presumably the mime4j sample viewer - confirm.
 */
public static void main(String[] args)
{
    final String[] viewerArgs = {"YOU MHT FILE PATH"};
    MessageTree.main(viewerArgs);
}
/**
* Displays a parsed Message in a window. The window will be divided into
* two panels. The left panel displays the Message tree. Clicking on a
* node in the tree shows information on that node in the right panel.
*
* Some of this code have been copied from the Java tutorial's JTree section.
*/
import org.apache.james.mime4j.dom.Message
import org.apache.james.mime4j.dom.Multipart
import org.apache.james.mime4j.dom.field.ContentTypeField
import org.apache.james.mime4j.message.DefaultMessageBuilder
import org.apache.james.mime4j.stream.MimeConfig
/**
 * Use Mime4J MessageBuilder to parse an mhtml file (assumes multipart) into
 * separate files, one per body part.
 * Files will be written to outDir (or the MHT file's parent folder when outDir
 * is not given) as baseName + "_" + partIdx + "." + ext, where ext is the MIME
 * sub-type of the part ('bin' when the part has no Content-Type header).
 * All streams opened here are closed before the method returns.
 */
void parseMhtToFile(File mhtFile, File outDir = null) {
    if (!outDir) { outDir = mhtFile.parentFile }
    // File baseName will be used in generating new filenames
    def mhtBaseName = mhtFile.name.replaceFirst(~/\.[^\.]+$/, '')

    // -- Set up Mime parser, using Default Message Builder
    MimeConfig parserConfig = new MimeConfig()
    parserConfig.setMaxHeaderLen(-1)    // The default is a mere 10k
    parserConfig.setMaxLineLen(-1)      // The default is only 1000 characters.
    parserConfig.setMaxHeaderCount(-1)  // Disable the check for header count.
    DefaultMessageBuilder builder = new DefaultMessageBuilder()
    builder.setMimeEntityConfig(parserConfig)

    // -- Parse the MHT stream data into a Message object.
    // withInputStream closes the file stream even when parsing fails.
    println "Parsing ${mhtFile}..."
    Message message = mhtFile.withInputStream { InputStream mhtStream ->
        builder.parseMessage(mhtStream)
    }

    // -- Process the resulting body parts, writing to file
    assert message.getBody() instanceof Multipart
    Multipart multipart = (Multipart) message.getBody()
    def parts = multipart.getBodyParts()
    parts.eachWithIndex { p, i ->
        // A part is not required to carry a Content-Type header, so stay null-safe.
        ContentTypeField cType = p.header.getField('content-type')
        println "${p.class.simpleName}\t${i}\t${cType?.mimeType}"

        // Assume mime sub-type is a "good enough" file-name extension
        // e.g. text/html = html, image/png = png, application/json = json
        String partFileName = "${mhtBaseName}_${i}.${cType?.subType ?: 'bin'}"
        File partFile = new File(outDir, partFileName)

        // Write part body stream to file. withOutputStream truncates an existing
        // file, and withStream closes the part's input stream after the copy.
        println "Writing ${partFile}..."
        partFile.withOutputStream { out ->
            p.body.inputStream.withStream { partStream -> out << partStream }
        }
    }
}
// Example invocation: '<path>' is a placeholder for the folder holding the MHT file.
// No output folder is passed, so parseMhtToFile receives its default outDir (null).
File mhtFile = new File('<path>', 'Report-en-au.mht')
parseMhtToFile(mhtFile)
println 'Done.'
Parsing <path>\Report-en-au.mht...
BodyPart 0 text/html
Writing <path>\Report-en-au_0.html...
BodyPart 1 image/png
Writing <path>\Report-en-au_1.png...
Done.
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URL;
import java.util.Properties;
import javax.mail.BodyPart;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import org.apache.commons.io.IOUtils;
public class MhtParser {
private File mhtFile;
private File outputFolder;
public MhtParser(File mhtFile, File outputFolder) {
this.mhtFile = mhtFile;
this.outputFolder = outputFolder;
}
public void decompress() throws Exception {
MimeMessage message =
new MimeMessage(
Session.getDefaultInstance(new Properties(), null),
new FileInputStream(mhtFile));
if (message.getContent() instanceof MimeMultipart) {
outputFolder.mkdir();
MimeMultipart mimeMultipart = (MimeMultipart) message.getContent();
for (int i = 0; i < mimeMultipart.getCount(); i++) {
BodyPart bodyPart = mimeMultipart.getBodyPart(i);
String fileName = bodyPart.getFileName();
if (fileName == null) {
String[] locationHeader = bodyPart.getHeader("Content-Location");
if (locationHeader != null && locationHeader.length > 0) {
fileName =
new File(new URL(locationHeader[0]).getFile()).getName();
}
}
if (fileName != null) {
FileOutputStream out =
new FileOutputStream(new File(outputFolder, fileName));
IOUtils.copy(bodyPart.getInputStream(), out);
out.flush();
out.close();
}
}
}
}
}