Java 从字符串中删除HTML标记
有没有从Java字符串中删除HTML的好方法?一个简单的正则表达式Java 从字符串中删除HTML标记,java,html,regex,parsing,Java,Html,Regex,Parsing,有没有从Java字符串中删除HTML的好方法?一个简单的正则表达式 replaceAll(“\\”,“”) 将起作用,但像&无法正确转换,两个尖括号之间的非HTML将被删除(即正则表达式中的*?将消失)。如果用户输入嘿,是否要显示嘿或嘿?如果是第一个,转义小于s,并且html编码为符号(和可选的引号),那么就可以了。为实现第二个选项而对代码进行的修改如下: replaceAll("\\<[^>]*>","") replaceAll(“\\]*>”,“”) 但是,如果
replaceAll(“\\”,“”)
将起作用,但像&代码>无法正确转换,两个尖括号之间的非HTML将被删除(即正则表达式中的*?
将消失)。如果用户输入嘿代码>,是否要显示嘿代码>或嘿代码>?如果是第一个,转义小于s,并且html编码为符号(和可选的引号),那么就可以了。为实现第二个选项而对代码进行的修改如下:
replaceAll("\\<[^>]*>","")
replaceAll(“\\]*>”,“”)
但是,如果用户输入了一些格式不正确的内容,例如HTML转义确实很难正确执行,那么您将遇到问题-我绝对建议使用库代码来完成这项工作,因为它比您想象的要微妙得多。查看Apache以获得一个很好的Java库来处理这个问题。您可能希望在剥离HTML之前用换行符替换
和
标记,以防止它像Tim建议的那样变得难以辨认
我能想到的删除HTML标记但将非HTML保留在尖括号中的唯一方法是检查是否存在错误。沿着这条线的东西
replaceAll("\\<[\s]*tag[^>]*>","")
replaceAll(“\\]*>”,“”)
然后HTML解码特殊字符,如&代码>。不应将结果视为已清理。听起来您希望从HTML转换为纯文本。
如果是这样,请访问www.htmlparser.org。下面是一个从URL中找到的html文件中删除所有标记的示例。
它使用org.htmlparser.beans.StringBean
另一种方法是使用javax.swing.text.html.HTMLEditorKit提取文本
import java.io.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
public class Html2Text extends HTMLEditorKit.ParserCallback {
StringBuffer s;
public Html2Text() {
}
public void parse(Reader in) throws IOException {
s = new StringBuffer();
ParserDelegator delegator = new ParserDelegator();
// the third parameter is TRUE to ignore charset directive
delegator.parse(in, this, Boolean.TRUE);
}
public void handleText(char[] text, int pos) {
s.append(text);
}
public String getText() {
return s.toString();
}
public static void main(String[] args) {
try {
// the HTML to convert
FileReader in = new FileReader("java-new.html");
Html2Text parser = new Html2Text();
parser.parse(in);
in.close();
System.out.println(parser.getText());
} catch (Exception e) {
e.printStackTrace();
}
}
}
参考:这里有一个稍微充实的更新,尝试处理一些分隔符和列表的格式。我用阿玛亚的作品作为指导
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Stack;
import java.util.logging.Logger;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
public class HTML2Text extends HTMLEditorKit.ParserCallback {
private static final Logger log = Logger
.getLogger(Logger.GLOBAL_LOGGER_NAME);
private StringBuffer stringBuffer;
private Stack<IndexType> indentStack;
public static class IndexType {
public String type;
public int counter; // used for ordered lists
public IndexType(String type) {
this.type = type;
counter = 0;
}
}
public HTML2Text() {
stringBuffer = new StringBuffer();
indentStack = new Stack<IndexType>();
}
public static String convert(String html) {
HTML2Text parser = new HTML2Text();
Reader in = new StringReader(html);
try {
// the HTML to convert
parser.parse(in);
} catch (Exception e) {
log.severe(e.getMessage());
} finally {
try {
in.close();
} catch (IOException ioe) {
// this should never happen
}
}
return parser.getText();
}
public void parse(Reader in) throws IOException {
ParserDelegator delegator = new ParserDelegator();
// the third parameter is TRUE to ignore charset directive
delegator.parse(in, this, Boolean.TRUE);
}
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
log.info("StartTag:" + t.toString());
if (t.toString().equals("p")) {
if (stringBuffer.length() > 0
&& !stringBuffer.substring(stringBuffer.length() - 1)
.equals("\n")) {
newLine();
}
newLine();
} else if (t.toString().equals("ol")) {
indentStack.push(new IndexType("ol"));
newLine();
} else if (t.toString().equals("ul")) {
indentStack.push(new IndexType("ul"));
newLine();
} else if (t.toString().equals("li")) {
IndexType parent = indentStack.peek();
if (parent.type.equals("ol")) {
String numberString = "" + (++parent.counter) + ".";
stringBuffer.append(numberString);
for (int i = 0; i < (4 - numberString.length()); i++) {
stringBuffer.append(" ");
}
} else {
stringBuffer.append("* ");
}
indentStack.push(new IndexType("li"));
} else if (t.toString().equals("dl")) {
newLine();
} else if (t.toString().equals("dt")) {
newLine();
} else if (t.toString().equals("dd")) {
indentStack.push(new IndexType("dd"));
newLine();
}
}
private void newLine() {
stringBuffer.append("\n");
for (int i = 0; i < indentStack.size(); i++) {
stringBuffer.append(" ");
}
}
public void handleEndTag(HTML.Tag t, int pos) {
log.info("EndTag:" + t.toString());
if (t.toString().equals("p")) {
newLine();
} else if (t.toString().equals("ol")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("ul")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("li")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("dd")) {
indentStack.pop();
;
}
}
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
log.info("SimpleTag:" + t.toString());
if (t.toString().equals("br")) {
newLine();
}
}
public void handleText(char[] text, int pos) {
log.info("Text:" + new String(text));
stringBuffer.append(text);
}
public String getText() {
return stringBuffer.toString();
}
public static void main(String args[]) {
String html = "<html><body><p>paragraph at start</p>hello<br />What is happening?<p>this is a<br />mutiline paragraph</p><ol> <li>This</li> <li>is</li> <li>an</li> <li>ordered</li> <li>list <p>with</p> <ul> <li>another</li> <li>list <dl> <dt>This</dt> <dt>is</dt> <dd>sdasd</dd> <dd>sdasda</dd> <dd>asda <p>aasdas</p> </dd> <dd>sdada</dd> <dt>fsdfsdfsd</dt> </dl> <dl> <dt>vbcvcvbcvb</dt> <dt>cvbcvbc</dt> <dd>vbcbcvbcvb</dd> <dt>cvbcv</dt> <dt></dt> </dl> <dl> <dt></dt> </dl></li> <li>cool</li> </ul> <p>stuff</p> </li> <li>cool</li></ol><p></p></body></html>";
System.out.println(convert(html));
}
}
import java.io.IOException;
导入java.io.Reader;
导入java.io.StringReader;
导入java.util.Stack;
导入java.util.logging.Logger;
导入javax.swing.text.MutableAttributeSet;
导入javax.swing.text.html.html;
导入javax.swing.text.html.HTMLEditorKit;
导入javax.swing.text.html.parser.ParserDelegator;
公共类HTML2Text扩展了HTMLEditorKit.ParserCallback{
专用静态最终记录器日志=记录器
.getLogger(Logger.GLOBAL_Logger_NAME);
私有字符串缓冲区字符串缓冲区;
私有堆栈缩进堆栈;
公共静态类索引类型{
公共字符串类型;
public int counter;//用于有序列表
公共索引类型(字符串类型){
this.type=type;
计数器=0;
}
}
公共HTML2Text(){
stringBuffer=新的stringBuffer();
indentStack=新堆栈();
}
公共静态字符串转换(字符串html){
HTML2Text解析器=新的HTML2Text();
Reader in=新的StringReader(html);
试一试{
//要转换的HTML
parser.parse(in);
}捕获(例外e){
log.severy(如getMessage());
}最后{
试一试{
in.close();
}捕获(ioe异常ioe){
//这永远不应该发生
}
}
返回parser.getText();
}
公共void解析(读取器in)引发IOException{
ParserDelegator delegator=新的ParserDelegator();
//第三个参数为TRUE以忽略字符集指令
parse(in,this,Boolean.TRUE);
}
public void handleStartTag(HTML.Tag t、MutableAttributeSet a、int pos){
log.info(“StartTag:+t.toString());
如果(t.toString().equals(“p”)){
如果(stringBuffer.length()>0
&&!stringBuffer.substring(stringBuffer.length()-1)
.equals(“\n”)){
换行符();
}
换行符();
}else如果(t.toString().equals(“ol”)){
push(新的索引类型(“ol”);
换行符();
}else如果(t.toString().equals(“ul”)){
push(新索引类型(“ul”);
换行符();
}else如果(t.toString()等于(“li”)){
IndexType parent=indentStack.peek();
if(parent.type.equals(“ol”)){
字符串编号String=“”+(++parent.counter)+“;
追加(numberString);
对于(int i=0;i<(4-numberString.length());i++){
stringBuffer.append(“”);
}
}否则{
stringBuffer.append(“*”);
}
push(新的索引类型(“li”);
}else如果(t.toString().equals(“dl”)){
换行符();
}否则,如果(t.toString().equals(“dt”)){
换行符();
}else如果(t.toString().equals(“dd”)){
push(新的索引类型(“dd”);
换行符();
}
}
私有无效换行符(){
stringBuffer.append(“\n”);
对于(int i=0;iimport java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Stack;
import java.util.logging.Logger;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
public class HTML2Text extends HTMLEditorKit.ParserCallback {
private static final Logger log = Logger
.getLogger(Logger.GLOBAL_LOGGER_NAME);
private StringBuffer stringBuffer;
private Stack<IndexType> indentStack;
public static class IndexType {
public String type;
public int counter; // used for ordered lists
public IndexType(String type) {
this.type = type;
counter = 0;
}
}
public HTML2Text() {
stringBuffer = new StringBuffer();
indentStack = new Stack<IndexType>();
}
public static String convert(String html) {
HTML2Text parser = new HTML2Text();
Reader in = new StringReader(html);
try {
// the HTML to convert
parser.parse(in);
} catch (Exception e) {
log.severe(e.getMessage());
} finally {
try {
in.close();
} catch (IOException ioe) {
// this should never happen
}
}
return parser.getText();
}
public void parse(Reader in) throws IOException {
ParserDelegator delegator = new ParserDelegator();
// the third parameter is TRUE to ignore charset directive
delegator.parse(in, this, Boolean.TRUE);
}
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
log.info("StartTag:" + t.toString());
if (t.toString().equals("p")) {
if (stringBuffer.length() > 0
&& !stringBuffer.substring(stringBuffer.length() - 1)
.equals("\n")) {
newLine();
}
newLine();
} else if (t.toString().equals("ol")) {
indentStack.push(new IndexType("ol"));
newLine();
} else if (t.toString().equals("ul")) {
indentStack.push(new IndexType("ul"));
newLine();
} else if (t.toString().equals("li")) {
IndexType parent = indentStack.peek();
if (parent.type.equals("ol")) {
String numberString = "" + (++parent.counter) + ".";
stringBuffer.append(numberString);
for (int i = 0; i < (4 - numberString.length()); i++) {
stringBuffer.append(" ");
}
} else {
stringBuffer.append("* ");
}
indentStack.push(new IndexType("li"));
} else if (t.toString().equals("dl")) {
newLine();
} else if (t.toString().equals("dt")) {
newLine();
} else if (t.toString().equals("dd")) {
indentStack.push(new IndexType("dd"));
newLine();
}
}
private void newLine() {
stringBuffer.append("\n");
for (int i = 0; i < indentStack.size(); i++) {
stringBuffer.append(" ");
}
}
public void handleEndTag(HTML.Tag t, int pos) {
log.info("EndTag:" + t.toString());
if (t.toString().equals("p")) {
newLine();
} else if (t.toString().equals("ol")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("ul")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("li")) {
indentStack.pop();
;
newLine();
} else if (t.toString().equals("dd")) {
indentStack.pop();
;
}
}
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
log.info("SimpleTag:" + t.toString());
if (t.toString().equals("br")) {
newLine();
}
}
public void handleText(char[] text, int pos) {
log.info("Text:" + new String(text));
stringBuffer.append(text);
}
public String getText() {
return stringBuffer.toString();
}
public static void main(String args[]) {
String html = "<html><body><p>paragraph at start</p>hello<br />What is happening?<p>this is a<br />mutiline paragraph</p><ol> <li>This</li> <li>is</li> <li>an</li> <li>ordered</li> <li>list <p>with</p> <ul> <li>another</li> <li>list <dl> <dt>This</dt> <dt>is</dt> <dd>sdasd</dd> <dd>sdasda</dd> <dd>asda <p>aasdas</p> </dd> <dd>sdada</dd> <dt>fsdfsdfsd</dt> </dl> <dl> <dt>vbcvcvbcvb</dt> <dt>cvbcvbc</dt> <dd>vbcbcvbcvb</dd> <dt>cvbcv</dt> <dt></dt> </dl> <dl> <dt></dt> </dl></li> <li>cool</li> </ul> <p>stuff</p> </li> <li>cool</li></ol><p></p></body></html>";
System.out.println(convert(html));
}
}
public static String html2text(String html) {
return Jsoup.parse(html).text();
}
MyWriter.toConsole(HtmlToText.htmlToPlainText(htmlResponse));
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;
import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Take HTML and give back the text part while dropping the HTML tags.
*
* There is some risk that using TagSoup means we'll permute non-HTML text.
* However, it seems to work the best so far in test cases.
*
* @author dan
* @see <a href="http://home.ccil.org/~cowan/XML/tagsoup/">TagSoup</a>
*/
public class Html2Text2 implements ContentHandler {
private StringBuffer sb;
public Html2Text2() {
}
public void parse(String str) throws IOException, SAXException {
XMLReader reader = new Parser();
reader.setContentHandler(this);
sb = new StringBuffer();
reader.parse(new InputSource(new StringReader(str)));
}
public String getText() {
return sb.toString();
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
for (int idx = 0; idx < length; idx++) {
sb.append(ch[idx+start]);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
sb.append(ch);
}
// The methods below do not contribute to the text
@Override
public void endDocument() throws SAXException {
}
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
}
@Override
public void endPrefixMapping(String prefix) throws SAXException {
}
@Override
public void processingInstruction(String target, String data)
throws SAXException {
}
@Override
public void setDocumentLocator(Locator locator) {
}
@Override
public void skippedEntity(String name) throws SAXException {
}
@Override
public void startDocument() throws SAXException {
}
@Override
public void startElement(String uri, String localName, String qName,
Attributes atts) throws SAXException {
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
}
}
private static final Pattern REMOVE_TAGS = Pattern.compile("<.+?>");
public static String removeTags(String string) {
if (string == null || string.length() == 0) {
return string;
}
Matcher m = REMOVE_TAGS.matcher(string);
return m.replaceAll("");
}
noHTMLString.replaceAll("\\&.*?\\;", "");
html = html.replaceAll(" ","");
html = html.replaceAll("&"."");
android.text.HtmlCompat.fromHtml(instruction, HtmlCompat.FROM_HTML_MODE_LEGACY).toString()
Source htmlSource = new Source(htmlText);
Segment htmlSeg = new Segment(htmlSource, 0, htmlSource.length());
Renderer htmlRend = new Renderer(htmlSeg);
System.out.println(htmlRend.toString());
String[] temp = yourString.split("&");
String tmp = "";
if (temp.length > 1) {
for (int i = 0; i < temp.length; i++) {
tmp += temp[i] + "&";
}
yourString = tmp.substring(0, tmp.length() - 1);
}
public static String removeHTML(String input) {
int i = 0;
String[] str = input.split("");
String s = "";
boolean inTag = false;
for (i = input.indexOf("<"); i < input.indexOf(">"); i++) {
inTag = true;
}
if (!inTag) {
for (i = 0; i < str.length; i++) {
s = s + str[i];
}
}
return s;
}
String BR_ESCAPED = "<br/>";
Element el=Jsoup.parse(html).select("body");
el.select("br").append(BR_ESCAPED);
el.select("p").append(BR_ESCAPED+BR_ESCAPED);
el.select("h1").append(BR_ESCAPED+BR_ESCAPED);
el.select("h2").append(BR_ESCAPED+BR_ESCAPED);
el.select("h3").append(BR_ESCAPED+BR_ESCAPED);
el.select("h4").append(BR_ESCAPED+BR_ESCAPED);
el.select("h5").append(BR_ESCAPED+BR_ESCAPED);
String nodeValue=el.text();
nodeValue=nodeValue.replaceAll(BR_ESCAPED, "<br/>");
nodeValue=nodeValue.replaceAll("(\\s*<br[^>]*>){3,}", "<br/><br/>");
nodeValue=nodeValue.replaceAll("(\\s*\n){3,}", "<br/><br/>");
private CharSequence removeHtmlFrom(String html) {
return new HtmlCleaner().clean(html).getText();
}
// breaks multi-level of escaping, preventing &lt;script&gt; to be rendered as <script>
String replace = input.replace("&", "");
// decode any encoded html, preventing <script> to be rendered as <script>
String html = StringEscapeUtils.unescapeHtml(replace);
// remove all html tags, but maintain line breaks
String clean = Jsoup.clean(html, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
// decode html again to convert character entities back into text
return StringEscapeUtils.unescapeHtml(clean);
{"regular string", "regular string"},
{"<a href=\"link\">A link</a>", "A link"},
{"<script src=\"http://evil.url.com\"/>", ""},
{"<script>", ""},
{"&lt;script&gt;", "lt;scriptgt;"}, // best effort
{"\" ' > < \n \\ é å à ü and & preserved", "\" ' > < \n \\ é å à ü and & preserved"}
String result = Html.fromHtml(html).toString();
String html = "<p>Line one</p><p>Line two</p>Line three<br/>etc.";
String NEW_LINE_MARK = "NEWLINESTART1234567890NEWLINEEND";
for (String tag: new String[]{"</p>","<br/>","</h1>","</h2>","</h3>","</h4>","</h5>","</h6>","</li>"}) {
html = html.replace(tag, NEW_LINE_MARK+tag);
}
String text = Jsoup.parse(html).text();
text = text.replace(NEW_LINE_MARK + " ", "\n\n");
text = text.replace(NEW_LINE_MARK, "\n\n");
<a href=”…”> <b>, <big>, <blockquote>, <br>, <cite>, <dfn>
<div align=”…”>, <em>, <font size=”…” color=”…” face=”…”>
<h1>, <h2>, <h3>, <h4>, <h5>, <h6>
<i>, <p>, <small>
<strike>, <strong>, <sub>, <sup>, <tt>, <u>
String Str_Html=" <p>This is about me text that the user can put into their profile</p> ";
Your_TextView_Obj.setText(Html.fromHtml(Str_Html).toString());
text.replaceAll('<.*?>' , " ") -> This will replace all the html tags with a space.
text.replaceAll('&.*?;' , "")-> this will replace all the tags which starts with "&" and ends with ";" like , &, > etc.
classeString.replaceAll("\\<(/?[^\\>]+)\\>", "\\ ").replaceAll("\\s+", " ").trim()
public String htmlToStringFilter(String textToFilter){
return Html.fromHtml(textToFilter).toString();
}
Pattern REMOVE_TAGS = Pattern.compile("<.+?>");
Source source= new Source(htmlAsString);
Matcher m = REMOVE_TAGS.matcher(sourceStep.getTextExtractor().toString());
String clearedHtml= m.replaceAll("");
using ServiceStack.Text;
// ...
"The <b>quick</b> brown <p> fox </p> jumps over the lazy dog".StripHtml();
// delete all comments
response = response.replaceAll("<!--[^>]*-->", "");
// delete all script elements
response = response.replaceAll("<(script|SCRIPT)[^+]*?>[^>]*?<(/script|SCRIPT)>", "");
Document doc = Jsoup.parse(htmlstrl);
Whitelist wl = Whitelist.none();
String plain = Jsoup.clean(doc.text(), wl);