Java ME (J2ME):从HTML文本中提取不含任何HTML标记和特殊字符的纯文本

Java me j2me中无任何HTML标记和特殊字符的HTML文本提取,java-me,lwuit,Java Me,Lwuit,解析后,我有一个RSS文件中的字符串 字符串htmlString= <p><img border="1" align="left" width="200" vspace="2" hspace="2" height="133" alt="Prime Minister Manmohan Singh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which wil

解析RSS文件后,我得到了如下字符串:

String htmlString =

<p><img border="1" align="left" width="200" vspace="2" hspace="2" height="133" alt="Prime Minister Manmohan Singh will leave for Iran on August   28, 2012 to attend the Non-Aligned Movement summit, which will   be preceded by crucial bilateral talks with Iran&rsquo;s supreme   leader Ayotollah Ali Khamenei and Iranian President Mahmoud   Ahmadinejad." src="/tmdbuserfiles/manmohan ahmadi(3).jpg" />Prime Minister summit, which will be preceded by crucial bilateral talks with Iran&rsquo;s supreme leader place at a time when the U.S. is pushing India to reduce engagement with Iran and implement sanctions imposed by some countries over its controversial nuclear programme.<br />
    <br />
    &nbsp;</p>

另外,以UTF-8编码打开HttpConnection的输入流也会有所帮助,如下所示:

// Wrap the input stream in a Reader that decodes its bytes as UTF-8.
// NOTE(review): `in` is presumably the InputStream obtained from the
// HttpConnection mentioned above - confirm against the calling code.
String encoding = "UTF-8";
Reader reader = new InputStreamReader(in, encoding);
使用下面这组字符串工具方法(StringUtils),可以得到干净且格式良好的文本:

 /**
 * Removes HTML tags from the given string and decodes the entities known to
 * {@link #decodeEntities(String)}.
 *
 * <p>Paragraph ends ({@code </p>}) and line breaks ({@code <br>},
 * {@code <br/>}, {@code <br />}) are turned into {@code "\r\n"}; every other
 * {@code <...>} span is dropped. The result is trimmed before the entities
 * are decoded, so a decoded {@code &nbsp;} at either end is preserved.
 *
 * @param text  Input parameter containing HTML tags (eg. {@code <b>cat</b>})
 * @return      String without HTML tags (eg. cat); if anything goes wrong
 *              (for instance {@code text} is {@code null}) the input is
 *              returned unchanged
 */
public static String removeHtml(String text) {
    try {
        if (text.indexOf("<") == -1) {
            // No markup at all: only entities may need decoding.
            return decodeEntities(text);
        }

        // Convert block/line separators to CRLF first, so they survive the
        // tag stripping below. "<br />" is the XHTML spelling commonly found
        // in RSS feeds.
        String htmlText = StringUtils.replace(text, "</p>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br/>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br />", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br>", "\r\n");

        // StringBuffer (not StringBuilder) keeps this usable on CLDC/J2ME.
        StringBuffer plainText = new StringBuffer(htmlText.length());
        int pos = 0;
        // Search the string *after* the replacements above, so the index
        // always refers to the text actually being scanned.
        int tagStart = htmlText.indexOf('<', pos);
        while (tagStart >= 0) {
            int tagEnd = htmlText.indexOf('>', tagStart);
            if (tagEnd == -1) {
                // Unterminated '<': keep the remainder as literal text
                // instead of looping forever.
                break;
            }
            plainText.append(htmlText.substring(pos, tagStart));
            pos = tagEnd + 1;
            tagStart = htmlText.indexOf('<', pos);
        }
        // Text following the last tag belongs to the output as well.
        plainText.append(htmlText.substring(pos));

        return decodeEntities(plainText.toString().trim());
    } catch (Exception e) {
        System.err.println("Error while removing HTML: " + e.toString());
        return text;
    }
}

/**
 * Decodes a small, fixed set of HTML character entities into plain
 * characters.
 *
 * <p>Typographic quotes ({@code &lsquo;}, {@code &rsquo;}, {@code &ldquo;},
 * {@code &rdquo;}) are mapped to their ASCII look-alikes so the result
 * contains no special quote characters. The non-standard names
 * {@code &lquot;} and {@code &rquot;} are still accepted for backward
 * compatibility.
 *
 * @param html  text that may contain entities; {@code null} is passed
 *              through by {@code StringUtils.replace}
 * @return      the text with the supported entities decoded
 */
public static String decodeEntities(String html) {
    String result = StringUtils.replace(html, "&lt;", "<");
    result = StringUtils.replace(result, "&gt;", ">");
    result = StringUtils.replace(result, "&nbsp;", " ");
    // Unicode escapes keep the literals independent of the source encoding.
    result = StringUtils.replace(result, "&auml;", "\u00e4");
    result = StringUtils.replace(result, "&ouml;", "\u00f6");
    // &quot; is the double quote (U+0022).
    result = StringUtils.replace(result, "&quot;", "\"");
    result = StringUtils.replace(result, "&ldquo;", "\"");
    result = StringUtils.replace(result, "&rdquo;", "\"");
    result = StringUtils.replace(result, "&lsquo;", "'");
    result = StringUtils.replace(result, "&rsquo;", "'");
    result = StringUtils.replace(result, "&lquot;", "'");
    result = StringUtils.replace(result, "&rquot;", "'");
    result = StringUtils.replace(result, "&#xd;", "\r");
    // &amp; must be decoded last: otherwise an escaped entity such as
    // "&amp;quot;" would be decoded twice and come out as a quote.
    result = StringUtils.replace(result, "&amp;", "&");
    return result;
}

/**
 * Replaces all non-overlapping instances of a String in a String, scanning
 * from left to right. Replacement text is never re-scanned.
 *
 * @param  s  String to alter; {@code null} is returned as is.
 * @param  f  String to look for; if {@code null} or empty, {@code s} is
 *            returned unchanged.
 * @param  r  String to replace it with, or null to just remove it.
 * @return    the altered string, or {@code s} itself when nothing matched
 */
public static String replace(String s, String f, String r) {
    // An empty search string matches at every index and would never let the
    // scan advance, so it is treated like "nothing to look for".
    if (s == null || f == null || f.length() == 0) {
        return s;
    }
    if (r == null) {
        r = "";
    }
    int match = s.indexOf(f);
    if (match == -1) {
        return s;
    }
    // Build the result in one pass instead of re-creating the whole string
    // for every match. StringBuffer keeps this usable on CLDC/J2ME.
    StringBuffer out = new StringBuffer(s.length());
    int copiedTo = 0;
    while (match != -1) {
        out.append(s.substring(copiedTo, match)).append(r);
        copiedTo = match + f.length();
        match = s.indexOf(f, copiedTo);
    }
    out.append(s.substring(copiedTo));
    return out.toString();
}

/**
 * Re-decodes a string through UTF-8.
 *
 * <p>The string is encoded to bytes with the platform default charset, those
 * bytes are read back as UTF-8, the low 8 bits of every decoded character
 * are collected, and the collected bytes are turned into a String again
 * with the platform default charset. The outcome therefore depends on the
 * platform default charset; plain ASCII text comes back unchanged.
 * NOTE(review): presumably meant to repair UTF-8 text that was decoded with
 * a single-byte charset - confirm the intended use with the caller.
 *
 * @param str  the string to re-decode; may be {@code null}
 * @return     the re-decoded string, {@code null} for a {@code null} input,
 *             or {@code str} unchanged if decoding fails
 */
public static String cleanEncodedString(String str) {
    // The sibling helpers pass null through; do the same instead of throwing.
    if (str == null) {
        return null;
    }
    try {
        InputStreamReader reader =
                new InputStreamReader(new ByteArrayInputStream(str.getBytes()), "UTF-8");
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        for (int ch = reader.read(); ch != -1; ch = reader.read()) {
            // Only the low 8 bits of each decoded character are kept.
            buf.write((byte) ch);
        }
        return buf.toString();
    } catch (Exception uee) {
        uee.printStackTrace();
        return str;
    }
}
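/**
 * Method removes HTML tags from given string.
 *
 * @param text  Input parameter containing HTML tags (eg. <b>cat</b>)
 * @return      String without HTML tags (eg. cat)
 */
public static String removeHtml(String text) {
    try {
        int idx = text.indexOf("<");
        // ... (lines lost during extraction; the complete listing follows below) ...
            int htmlEndIndex = htmlText.indexOf(">", htmlStartIndex);
            htmlText = htmlText.substring(htmlEndIndex + 1);
            htmlStartIndex = htmlText.indexOf("<", 0);
        // ... (remainder lost during extraction) ...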
// Wrap the input stream in a Reader that decodes its bytes as UTF-8.
// NOTE(review): `in` is presumably the InputStream obtained from the
// HttpConnection - confirm against the calling code.
String encoding = "UTF-8";
Reader reader = new InputStreamReader(in, encoding);
 /**
 * Removes HTML tags from the given string and decodes the entities known to
 * {@link #decodeEntities(String)}.
 *
 * <p>Paragraph ends ({@code </p>}) and line breaks ({@code <br>},
 * {@code <br/>}, {@code <br />}) are turned into {@code "\r\n"}; every other
 * {@code <...>} span is dropped. The result is trimmed before the entities
 * are decoded, so a decoded {@code &nbsp;} at either end is preserved.
 *
 * @param text  Input parameter containing HTML tags (eg. {@code <b>cat</b>})
 * @return      String without HTML tags (eg. cat); if anything goes wrong
 *              (for instance {@code text} is {@code null}) the input is
 *              returned unchanged
 */
public static String removeHtml(String text) {
    try {
        if (text.indexOf("<") == -1) {
            // No markup at all: only entities may need decoding.
            return decodeEntities(text);
        }

        // Convert block/line separators to CRLF first, so they survive the
        // tag stripping below. "<br />" is the XHTML spelling commonly found
        // in RSS feeds.
        String htmlText = StringUtils.replace(text, "</p>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br/>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br />", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br>", "\r\n");

        // StringBuffer (not StringBuilder) keeps this usable on CLDC/J2ME.
        StringBuffer plainText = new StringBuffer(htmlText.length());
        int pos = 0;
        // Search the string *after* the replacements above, so the index
        // always refers to the text actually being scanned.
        int tagStart = htmlText.indexOf('<', pos);
        while (tagStart >= 0) {
            int tagEnd = htmlText.indexOf('>', tagStart);
            if (tagEnd == -1) {
                // Unterminated '<': keep the remainder as literal text
                // instead of looping forever.
                break;
            }
            plainText.append(htmlText.substring(pos, tagStart));
            pos = tagEnd + 1;
            tagStart = htmlText.indexOf('<', pos);
        }
        // Text following the last tag belongs to the output as well.
        plainText.append(htmlText.substring(pos));

        return decodeEntities(plainText.toString().trim());
    } catch (Exception e) {
        System.err.println("Error while removing HTML: " + e.toString());
        return text;
    }
}

/**
 * Decodes a small, fixed set of HTML character entities into plain
 * characters.
 *
 * <p>Typographic quotes ({@code &lsquo;}, {@code &rsquo;}, {@code &ldquo;},
 * {@code &rdquo;}) are mapped to their ASCII look-alikes so the result
 * contains no special quote characters. The non-standard names
 * {@code &lquot;} and {@code &rquot;} are still accepted for backward
 * compatibility.
 *
 * @param html  text that may contain entities; {@code null} is passed
 *              through by {@code StringUtils.replace}
 * @return      the text with the supported entities decoded
 */
public static String decodeEntities(String html) {
    String result = StringUtils.replace(html, "&lt;", "<");
    result = StringUtils.replace(result, "&gt;", ">");
    result = StringUtils.replace(result, "&nbsp;", " ");
    // Unicode escapes keep the literals independent of the source encoding.
    result = StringUtils.replace(result, "&auml;", "\u00e4");
    result = StringUtils.replace(result, "&ouml;", "\u00f6");
    // &quot; is the double quote (U+0022).
    result = StringUtils.replace(result, "&quot;", "\"");
    result = StringUtils.replace(result, "&ldquo;", "\"");
    result = StringUtils.replace(result, "&rdquo;", "\"");
    result = StringUtils.replace(result, "&lsquo;", "'");
    result = StringUtils.replace(result, "&rsquo;", "'");
    result = StringUtils.replace(result, "&lquot;", "'");
    result = StringUtils.replace(result, "&rquot;", "'");
    result = StringUtils.replace(result, "&#xd;", "\r");
    // &amp; must be decoded last: otherwise an escaped entity such as
    // "&amp;quot;" would be decoded twice and come out as a quote.
    result = StringUtils.replace(result, "&amp;", "&");
    return result;
}

/**
 * Replaces all non-overlapping instances of a String in a String, scanning
 * from left to right. Replacement text is never re-scanned.
 *
 * @param  s  String to alter; {@code null} is returned as is.
 * @param  f  String to look for; if {@code null} or empty, {@code s} is
 *            returned unchanged.
 * @param  r  String to replace it with, or null to just remove it.
 * @return    the altered string, or {@code s} itself when nothing matched
 */
public static String replace(String s, String f, String r) {
    // An empty search string matches at every index and would never let the
    // scan advance, so it is treated like "nothing to look for".
    if (s == null || f == null || f.length() == 0) {
        return s;
    }
    if (r == null) {
        r = "";
    }
    int match = s.indexOf(f);
    if (match == -1) {
        return s;
    }
    // Build the result in one pass instead of re-creating the whole string
    // for every match. StringBuffer keeps this usable on CLDC/J2ME.
    StringBuffer out = new StringBuffer(s.length());
    int copiedTo = 0;
    while (match != -1) {
        out.append(s.substring(copiedTo, match)).append(r);
        copiedTo = match + f.length();
        match = s.indexOf(f, copiedTo);
    }
    out.append(s.substring(copiedTo));
    return out.toString();
}

/**
 * Re-decodes a string through UTF-8.
 *
 * <p>The string is encoded to bytes with the platform default charset, those
 * bytes are read back as UTF-8, the low 8 bits of every decoded character
 * are collected, and the collected bytes are turned into a String again
 * with the platform default charset. The outcome therefore depends on the
 * platform default charset; plain ASCII text comes back unchanged.
 * NOTE(review): presumably meant to repair UTF-8 text that was decoded with
 * a single-byte charset - confirm the intended use with the caller.
 *
 * @param str  the string to re-decode; may be {@code null}
 * @return     the re-decoded string, {@code null} for a {@code null} input,
 *             or {@code str} unchanged if decoding fails
 */
public static String cleanEncodedString(String str) {
    // The sibling helpers pass null through; do the same instead of throwing.
    if (str == null) {
        return null;
    }
    try {
        InputStreamReader reader =
                new InputStreamReader(new ByteArrayInputStream(str.getBytes()), "UTF-8");
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        for (int ch = reader.read(); ch != -1; ch = reader.read()) {
            // Only the low 8 bits of each decoded character are kept.
            buf.write((byte) ch);
        }
        return buf.toString();
    } catch (Exception uee) {
        uee.printStackTrace();
        return str;
    }
}