如何在java中读取或解析MHTML（.mht）文件_Java_Parsing_Compression_Mhtml

如何在java中读取或解析MHTML（.mht）文件

java parsing compression

如何在java中读取或解析MHTML（.mht）文件,java,parsing,compression,mhtml,Java,Parsing,Compression,Mhtml,我需要挖掘大多数已知文档文件的内容，如： pdf html doc/docx等对于我计划使用的大多数文件格式：但是到目前为止，Tika不支持MHTML（*.mht）文件。。( ) C#（）中的示例很少，但我在Java中没有找到我尝试用7Zip打开*.mht文件，但失败了…尽管WinZip能够将文件解压缩为图像和文本（CSS、HTML、脚本）作为文本和二进制文件根据MSDN第页（）和我前面提到的代码项目第页。。。mht文件使用GZip压缩尝试在java中解压缩会导致以下异常：使用j

我需要挖掘大多数已知文档文件的内容，如：

pdf

html

doc/docx等
对于其中的大多数文件格式，我计划使用 Apache Tika：

但是到目前为止，
Tika
不支持MHTML（*.mht）文件。C#中有少量示例，但我在Java中没有找到。
我尝试用7Zip打开*.mht文件，但失败了……而WinZip却能够将该文件解压为图像和文本（CSS、HTML、脚本），即文本文件和二进制文件。
根据MSDN页面和我前面提到的
CodeProject
页面……mht文件使用GZip压缩。
尝试在java中解压缩会导致以下异常：使用
java.util.zip.GZIPInputStream

java.io.IOException: Not in GZIP format at java.util.zip.GZIPInputStream.readHeader(Unknown Source) at java.util.zip.GZIPInputStream.<init>(Unknown Source) at java.util.zip.GZIPInputStream.<init>(Unknown Source) at GZipTest.main(GZipTest.java:16)
请建议如何解压

谢谢……
你可以试试这个工具（原文中的链接已丢失）：它可以打包/解包，之后你就可以像处理普通文件一样处理解包后的内容。下载链接是：（原文中的链接已丢失）
我曾用一个工具（原文中的链接已丢失）来解析/读取/索引mht文件（但是把它们当作普通文件，而不是压缩文件）。
坦白地说，我原本没指望能在短期内找到解决方案，并且已经打算放弃了，但不知怎的，我偶然发现了这个页面：

虽然乍一看它并不怎么吸引人，但如果你仔细看，就会找到线索。读完这篇文章后，我启动了IE，并随机地将一些页面保存为
*.mht
文件。让我逐行来说明……
不过请允许我先说明一下，我的最终目标是分离出
html
内容并对其进行解析……该解决方案本身并不完整，因为它依赖于保存时所选择的
字符集
或
编码。但即使它会提取带有小故障的单个文件我希望这对任何试图解析/解压*.mht/MHTML 文件的人都有用：） =======说明======== **取自mht文件** From: "Saved by Windows Internet Explorer 7" 它是用于保存文件的软件 Subject: Google Date: Tue, 13 Jul 2010 21:23:03 +0530 MIME-Version: 1.0 主题、日期和mime版本…与邮件格式非常相似 Content-Type: multipart/related; type="text/html"; 这部分告诉我们它是一个多部分文档。多部分文档在单个正文中组合了一组或多组不同的数据，实体的标题中必须出现multipart 内容类型字段。在这里，我们还可以看到类型为“text/html” 这其中最重要的部分。这是将两个不同部分（html、图像、css、脚本等）分开的唯一分隔符一旦你掌握了这个，一切都会变得轻松。。。现在，我只需遍历文档，找出不同的部分，并根据它们的内容传输编码保存它们（base64，可打印引用等）。。。 . . 样本 ------=_NextPart_000_0007_01CB22D1.93BBD1A0 Content-Type: text/html; charset="utf-8" Content-Transfer-Encoding: quoted-printable Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" = . . . 主解析器类 /** * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html */ package com.test.mht.core; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.OutputStreamWriter; import java.util.regex.Matcher; import java.util.regex.Pattern; import sun.misc.BASE64Decoder; /** * File to parse and decompose *.mts file in its constituting parts. 
* @author Manish Shukla */ public class MHTParser implements IConstants { private File mhtFile; private File outputFolder; public MHTParser(File mhtFile, File outputFolder) { this.mhtFile = mhtFile; this.outputFolder = outputFolder; } /** * @throws Exception */ public void decompress() throws Exception { BufferedReader reader = null; String type = ""; String encoding = ""; String location = ""; String filename = ""; String charset = "utf-8"; StringBuilder buffer = null; try { reader = new BufferedReader(new FileReader(mhtFile)); final String boundary = getBoundary(reader); if(boundary == null) throw new Exception("Failed to find document 'boundary'... Aborting"); String line = null; int i = 1; while((line = reader.readLine()) != null) { String temp = line.trim(); if(temp.contains(boundary)) { if(buffer != null) { writeBufferContentToFile(buffer,encoding,filename,charset); buffer = null; } buffer = new StringBuilder(); }else if(temp.startsWith(CONTENT_TYPE)) { type = getType(temp); }else if(temp.startsWith(CHAR_SET)) { charset = getCharSet(temp); }else if(temp.startsWith(CONTENT_TRANSFER_ENCODING)) { encoding = getEncoding(temp); }else if(temp.startsWith(CONTENT_LOCATION)) { location = temp.substring(temp.indexOf(":")+1).trim(); i++; filename = getFileName(location,type); }else { if(buffer != null) { buffer.append(line + "\n"); } } } }finally { if(null != reader) reader.close(); } } private String getCharSet(String temp) { String t = temp.split("=")[1].trim(); return t.substring(1, t.length()-1); } /** * Save the file as per character set and encoding */ private void writeBufferContentToFile(StringBuilder buffer,String encoding, String filename, String charset) throws Exception { if(!outputFolder.exists()) outputFolder.mkdirs(); byte[] content = null; boolean text = true; if(encoding.equalsIgnoreCase("base64")){ content = getBase64EncodedString(buffer); text = false; }else if(encoding.equalsIgnoreCase("quoted-printable")) { content = 
getQuotedPrintableString(buffer); } else content = buffer.toString().getBytes(); if(!text) { BufferedOutputStream bos = null; try { bos = new BufferedOutputStream(new FileOutputStream(filename)); bos.write(content); bos.flush(); }finally { bos.close(); } }else { BufferedWriter bw = null; try { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), charset)); bw.write(new String(content)); bw.flush(); }finally { bw.close(); } } } /** * When the save the *.mts file with 'utf-8' encoding then it appends '=EF=BB=BF'</br> * @see http://en.wikipedia.org/wiki/Byte_order_mark */ private byte[] getQuotedPrintableString(StringBuilder buffer) { //Set<String> uniqueHex = new HashSet<String>(); //final Pattern p = Pattern.compile("(=\\p{XDigit}{2})*"); String temp = buffer.toString().replaceAll(UTF8_BOM, "").replaceAll("=\n", ""); //Matcher m = p.matcher(temp); //while(m.find()) { // uniqueHex.add(m.group()); //} //System.out.println(uniqueHex); //for (String hex : uniqueHex) { //temp = temp.replaceAll(hex, getASCIIValue(hex.substring(1))); //} return temp.getBytes(); } /*private String getASCIIValue(String hex) { return ""+(char)Integer.parseInt(hex, 16); }*/ /** * Although system dependent..it works well */ private byte[] getBase64EncodedString(StringBuilder buffer) throws Exception { return new BASE64Decoder().decodeBuffer(buffer.toString()); } /** * Tries to get a qualified file name. If the name is not apparent it tries to guess it from the URL. 
* Otherwise it returns 'unknown.<type>' */ private String getFileName(String location, String type) { final Pattern p = Pattern.compile("(\\w|_|-)+\\.\\w+"); String ext = ""; String name = ""; if(type.toLowerCase().endsWith("jpeg")) ext = "jpg"; else ext = type.split("/")[1]; if(location.endsWith("/")) { name = "main"; }else { name = location.substring(location.lastIndexOf("/") + 1); Matcher m = p.matcher(name); String fname = ""; while(m.find()) { fname = m.group(); } if(fname.trim().length() == 0) name = "unknown"; else return getUniqueName(fname.substring(0,fname.indexOf(".")), fname.substring(fname.indexOf(".") + 1, fname.length())); } return getUniqueName(name,ext); } /** * Returns a qualified unique output file path for the parsed path.</br> * In case the file already exist it appends a numarical value a continues */ private String getUniqueName(String name,String ext) { int i = 1; File file = new File(outputFolder,name + "." + ext); if(file.exists()) { while(true) { file = new File(outputFolder, name + i + "." 
+ ext); if(!file.exists()) return file.getAbsolutePath(); i++; } } return file.getAbsolutePath(); } private String getType(String line) { return splitUsingColonSpace(line); } private String getEncoding(String line){ return splitUsingColonSpace(line); } private String splitUsingColonSpace(String line) { return line.split(":\\s*")[1].replaceAll(";", ""); } /** * Gives you the boundary string */ private String getBoundary(BufferedReader reader) throws Exception { String line = null; while((line = reader.readLine()) != null) { line = line.trim(); if(line.startsWith(BOUNDARY)) { return line.substring(line.indexOf("\"") + 1, line.lastIndexOf("\"")); } } return null; } } /** *本程序及其附带的资料是根据Eclipse公共许可证v1.0的条款提供的 *随本发行版发行，可在 * http://www.eclipse.org/legal/epl-v10.html */ 包com.test.mht.core；导入java.io.BufferedOutputStream；导入java.io.BufferedReader；导入java.io.BufferedWriter；导入java.io.File；导入java.io.FileOutputStream；导入java.io.FileReader；导入java.io.OutputStreamWriter；导入java.util.regex.Matcher；导入java.util.regex.Pattern；导入sun.misc.base64解码器； /** *要在其组成部分中解析和分解*.mts文件的文件。 *@作者Manish Shukla */ 公共类MHTParser实现IConstants { 私有文件mhtFile；私有文件输出文件夹；公共MHTParser（文件mhtFile、文件输出文件夹）{ this.mhtFile=mhtFile； this.outputFolder=outputFolder； } /** *@抛出异常 */ public void decompress（）引发异常 { BufferedReader reader=null；字符串类型=”；字符串编码=”；字符串位置=”；字符串filename=“”；字符串charset=“utf-8”； StringBuilder缓冲区=null；尝试 { reader=newbufferedreader（newfilereader（mhtFile））；最终字符串边界=getBoundary（读卡器）；如果（边界==null）抛出新异常（“未能找到文档“边界”…正在中止”）；字符串行=null； int i=1；而（（line=reader.readLine（））！=null） { 字符串温度=line.trim（）；如果（温度包含（边界）） { if（缓冲区！=null）{ writeBufferContentToFile（缓冲区、编码、文件名、字符集）；缓冲区=空； } 缓冲区=新的StringBuilder（）； }else if（临时启动（内容类型））{ type=getType（temp）； }else if（临时启动（字符集））{ charset=getCharSet（temp）； }else if（临时启动（内容\传输\编码））{ 编码=getEncoding（临时）； }else if（临时启动（内容\位置））{ location=temp.substring（temp.indexOf（“：”）+1.trim（）； i++； filename=getFileName（位置、类型）； }否则{ if（缓冲区！=null）{ buffer.append（第+行“\n”）； } } } }最后 { if（null！=读取器） 
reader.close（）； } } 私有字符串getCharSet（字符串临时值） { 字符串t=temp.split（“=”[1]。trim（）；返回t.substring（1，t.length（）-1）； } /** *按照字符集和编码保存文件 */ 私有void writeBufferContentToFile（StringBuilder缓冲区、字符串编码、字符串文件名、字符串字符 ------=_NextPart_000_0007_01CB22D1.93BBD1A0 Content-Type: text/html; charset="utf-8" Content-Transfer-Encoding: quoted-printable Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" = . . . public interface IConstants { public String BOUNDARY = "boundary"; public String CHAR_SET = "charset"; public String CONTENT_TYPE = "Content-Type"; public String CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding"; public String CONTENT_LOCATION = "Content-Location"; public String UTF8_BOM = "=EF=BB=BF"; public String UTF16_BOM1 = "=FF=FE"; public String UTF16_BOM2 = "=FE=FF"; } /** * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html */ package com.test.mht.core; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.OutputStreamWriter; import java.util.regex.Matcher; import java.util.regex.Pattern; import sun.misc.BASE64Decoder; /** * File to parse and decompose *.mts file in its constituting parts. 
* @author Manish Shukla */ public class MHTParser implements IConstants { private File mhtFile; private File outputFolder; public MHTParser(File mhtFile, File outputFolder) { this.mhtFile = mhtFile; this.outputFolder = outputFolder; } /** * @throws Exception */ public void decompress() throws Exception { BufferedReader reader = null; String type = ""; String encoding = ""; String location = ""; String filename = ""; String charset = "utf-8"; StringBuilder buffer = null; try { reader = new BufferedReader(new FileReader(mhtFile)); final String boundary = getBoundary(reader); if(boundary == null) throw new Exception("Failed to find document 'boundary'... Aborting"); String line = null; int i = 1; while((line = reader.readLine()) != null) { String temp = line.trim(); if(temp.contains(boundary)) { if(buffer != null) { writeBufferContentToFile(buffer,encoding,filename,charset); buffer = null; } buffer = new StringBuilder(); }else if(temp.startsWith(CONTENT_TYPE)) { type = getType(temp); }else if(temp.startsWith(CHAR_SET)) { charset = getCharSet(temp); }else if(temp.startsWith(CONTENT_TRANSFER_ENCODING)) { encoding = getEncoding(temp); }else if(temp.startsWith(CONTENT_LOCATION)) { location = temp.substring(temp.indexOf(":")+1).trim(); i++; filename = getFileName(location,type); }else { if(buffer != null) { buffer.append(line + "\n"); } } } }finally { if(null != reader) reader.close(); } } private String getCharSet(String temp) { String t = temp.split("=")[1].trim(); return t.substring(1, t.length()-1); } /** * Save the file as per character set and encoding */ private void writeBufferContentToFile(StringBuilder buffer,String encoding, String filename, String charset) throws Exception { if(!outputFolder.exists()) outputFolder.mkdirs(); byte[] content = null; boolean text = true; if(encoding.equalsIgnoreCase("base64")){ content = getBase64EncodedString(buffer); text = false; }else if(encoding.equalsIgnoreCase("quoted-printable")) { content = 
getQuotedPrintableString(buffer); } else content = buffer.toString().getBytes(); if(!text) { BufferedOutputStream bos = null; try { bos = new BufferedOutputStream(new FileOutputStream(filename)); bos.write(content); bos.flush(); }finally { bos.close(); } }else { BufferedWriter bw = null; try { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), charset)); bw.write(new String(content)); bw.flush(); }finally { bw.close(); } } } /** * When the save the *.mts file with 'utf-8' encoding then it appends '=EF=BB=BF'</br> * @see http://en.wikipedia.org/wiki/Byte_order_mark */ private byte[] getQuotedPrintableString(StringBuilder buffer) { //Set<String> uniqueHex = new HashSet<String>(); //final Pattern p = Pattern.compile("(=\\p{XDigit}{2})*"); String temp = buffer.toString().replaceAll(UTF8_BOM, "").replaceAll("=\n", ""); //Matcher m = p.matcher(temp); //while(m.find()) { // uniqueHex.add(m.group()); //} //System.out.println(uniqueHex); //for (String hex : uniqueHex) { //temp = temp.replaceAll(hex, getASCIIValue(hex.substring(1))); //} return temp.getBytes(); } /*private String getASCIIValue(String hex) { return ""+(char)Integer.parseInt(hex, 16); }*/ /** * Although system dependent..it works well */ private byte[] getBase64EncodedString(StringBuilder buffer) throws Exception { return new BASE64Decoder().decodeBuffer(buffer.toString()); } /** * Tries to get a qualified file name. If the name is not apparent it tries to guess it from the URL. 
* Otherwise it returns 'unknown.<type>' */ private String getFileName(String location, String type) { final Pattern p = Pattern.compile("(\\w|_|-)+\\.\\w+"); String ext = ""; String name = ""; if(type.toLowerCase().endsWith("jpeg")) ext = "jpg"; else ext = type.split("/")[1]; if(location.endsWith("/")) { name = "main"; }else { name = location.substring(location.lastIndexOf("/") + 1); Matcher m = p.matcher(name); String fname = ""; while(m.find()) { fname = m.group(); } if(fname.trim().length() == 0) name = "unknown"; else return getUniqueName(fname.substring(0,fname.indexOf(".")), fname.substring(fname.indexOf(".") + 1, fname.length())); } return getUniqueName(name,ext); } /** * Returns a qualified unique output file path for the parsed path.</br> * In case the file already exist it appends a numarical value a continues */ private String getUniqueName(String name,String ext) { int i = 1; File file = new File(outputFolder,name + "." + ext); if(file.exists()) { while(true) { file = new File(outputFolder, name + i + "." + ext); if(!file.exists()) return file.getAbsolutePath(); i++; } } return file.getAbsolutePath(); } private String getType(String line) { return splitUsingColonSpace(line); } private String getEncoding(String line){ return splitUsingColonSpace(line); } private String splitUsingColonSpace(String line) { return line.split(":\\s*")[1].replaceAll(";", ""); } /** * Gives you the boundary string */ private String getBoundary(BufferedReader reader) throws Exception { String line = null; while((line = reader.readLine()) != null) { line = line.trim(); if(line.startsWith(BOUNDARY)) { return line.substring(line.indexOf("\"") + 1, line.lastIndexOf("\"")); } } return null; } } <dependency> <groupId>org.apache.james</groupId> <artifactId>apache-mime4j</artifactId> <version>0.7.2</version> </dependency> public static void main(String[] args) { MessageTree.main(new String[]{"YOU MHT FILE PATH"}); } /** * Displays a parsed Message in a window. 
The window will be divided into * two panels. The left panel displays the Message tree. Clicking on a * node in the tree shows information on that node in the right panel. * * Some of this code have been copied from the Java tutorial's JTree section. */ import org.apache.james.mime4j.dom.Message import org.apache.james.mime4j.dom.Multipart import org.apache.james.mime4j.dom.field.ContentTypeField import org.apache.james.mime4j.message.DefaultMessageBuilder import org.apache.james.mime4j.stream.MimeConfig /** * Use Mime4J MessageBuilder to parse an mhtml file (assumes multipart) into * separate html files. * Files will be written to outDir (or parent) as baseName + partIdx + ext. */ void parseMhtToFile(File mhtFile, File outDir = null) { if (!outDir) {outDir = mhtFile.parentFile } // File baseName will be used in generating new filenames def mhtBaseName = mhtFile.name.replaceFirst(~/\.[^\.]+$/, '') // -- Set up Mime parser, using Default Message Builder MimeConfig parserConfig = new MimeConfig(); parserConfig.setMaxHeaderLen(-1); // The default is a mere 10k parserConfig.setMaxLineLen(-1); // The default is only 1000 characters. parserConfig.setMaxHeaderCount(-1); // Disable the check for header count. DefaultMessageBuilder builder = new DefaultMessageBuilder(); builder.setMimeEntityConfig(parserConfig); // -- Parse the MHT stream data into a Message object println "Parsing ${mhtFile}..."; InputStream mhtStream = mhtFile.newInputStream() Message message = builder.parseMessage(mhtStream); // -- Process the resulting body parts, writing to file assert message.getBody() instanceof Multipart Multipart multipart = (Multipart) message.getBody(); def parts = multipart.getBodyParts(); parts.eachWithIndex { p, i -> ContentTypeField cType = p.header.getField('content-type') println "${p.class.simpleName}\t${i}\t${cType.mimeType}" // Assume mime sub-type is a "good enough" file-name extension // e.g. 
text/html = html, image/png = png, application/json = json String partFileName = "${mhtBaseName}_${i}.${cType.subType}" File partFile = new File(outDir, partFileName) // Write part body stream to file println "Writing ${partFile}..."; if (partFile.exists()) partFile.delete(); InputStream partStream = p.body.inputStream; partFile.append(partStream); } } File mhtFile = new File('<path>', 'Report-en-au.mht') parseMhtToFile(mhtFile) println 'Done.' Parsing <path>\Report-en-au.mht... BodyPart 0 text/html Writing <path>\Report-en-au_0.html... BodyPart 1 image/png Writing <path>\Report-en-au_1.png... Done. import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.net.URL; import java.util.Properties; import javax.mail.BodyPart; import javax.mail.Session; import javax.mail.internet.MimeMessage; import javax.mail.internet.MimeMultipart; import org.apache.commons.io.IOUtils; public class MhtParser { private File mhtFile; private File outputFolder; public MhtParser(File mhtFile, File outputFolder) { this.mhtFile = mhtFile; this.outputFolder = outputFolder; } public void decompress() throws Exception { MimeMessage message = new MimeMessage( Session.getDefaultInstance(new Properties(), null), new FileInputStream(mhtFile)); if (message.getContent() instanceof MimeMultipart) { outputFolder.mkdir(); MimeMultipart mimeMultipart = (MimeMultipart) message.getContent(); for (int i = 0; i < mimeMultipart.getCount(); i++) { BodyPart bodyPart = mimeMultipart.getBodyPart(i); String fileName = bodyPart.getFileName(); if (fileName == null) { String[] locationHeader = bodyPart.getHeader("Content-Location"); if (locationHeader != null && locationHeader.length > 0) { fileName = new File(new URL(locationHeader[0]).getFile()).getName(); } } if (fileName != null) { FileOutputStream out = new FileOutputStream(new File(outputFolder, fileName)); IOUtils.copy(bodyPart.getInputStream(), out); out.flush(); out.close(); } } } } }