Android 阿拉伯文PDF文本提取器

Android 阿拉伯文PDF文本提取器,android,itext,Android,Itext,是否有任何PDF文本提取器API可以从PDF中提取阿拉伯文本?我使用的是itextpdf API,它可以很好地提取英语,但不能提取阿拉伯语文本。这是我在PDF中提取文本的代码:private String extractPDF(String path) throws IOException { String parsedText = ""; PdfReader reader = new PdfReader(path); int n = rea…


是否有任何PDF文本提取器API可以从PDF中提取阿拉伯文本?我使用的是itextpdf API,它可以很好地提取英语文本,但不能提取阿拉伯语文本。这是我从PDF中提取文本的代码:

private String extractPDF(String path) throws IOException {

        String parsedText = "";
        PdfReader reader = new PdfReader(path);
        int n = reader.getNumberOfPages();
        for (int page = 0; page < n; page++) {
            parsedText = parsedText + PdfTextExtractor.getTextFromPage(reader, page + 1).trim() + "\n"; //Extracting the content from the different pages

        return parsedText;
private String extractPDF(字符串路径)引发IOException{
PdfReader reader=新PdfReader(路径);
int n=reader.getNumberOfPages();
对于(int page=0;page



private String extractPDF(String name) throws IOException {

    PdfReader reader = new PdfReader(name);
    StringBuilder text = new StringBuilder();
    for (int i=1;i<=reader.getNumberOfPages();i++){
        String data = PdfTextExtractor.getTextFromPage(reader,i,new SimpleTextExtractionStrategy());
    return text.toString();
private String extractPDF(字符串名称)引发IOException{
PdfReader reader=新PdfReader(名称);
StringBuilder text=新的StringBuilder();

对于(int i=1;i

您的示例PDF根本不包含任何文本,它只包含嵌入的文本位图图像。





public static BidiResult BidiText(String str, int startLevel)
    boolean isLtr = true;
    int strLength = str.length();
    if (strLength == 0)
        return new BidiResult(str, false);

    // get types, fill arrays

    char[] chars = new char[strLength];
    String[] types = new String[strLength];
    String[] oldtypes = new String[strLength];
    int numBidi = 0;

    for (int i = 0; i < strLength; ++i)
        chars[i] = str.charAt(i);

        char charCode = str.charAt(i);
        String charType = "L";
        if (charCode <= 0x00ff)
            charType = BaseTypes[charCode];
        else if (0x0590 <= charCode && charCode <= 0x05f4)
            charType = "R";
        else if (0x0600 <= charCode && charCode <= 0x06ff)
            charType = ArabicTypes[charCode & 0xff];
        else if (0x0700 <= charCode && charCode <= 0x08AC)
            charType = "AL";

        if (charType.equals("R") || charType.equals("AL") || charType.equals("AN"))

        oldtypes[i] = types[i] = charType;

    if (numBidi == 0)
        return new BidiResult(str, true);

    if (startLevel == -1)
        if ((strLength / numBidi) < 0.3)
            startLevel = 0;
            isLtr = false;
            startLevel = 1;

    int[] levels = new int[strLength];

    for (int i = 0; i < strLength; ++i)
        levels[i] = startLevel;

    String e = IsOdd(startLevel) ? "R" : "L";
    String sor = e;
    String eor = sor;

    String lastType = sor;
    for (int i = 0; i < strLength; ++i)
        if (types[i].equals("NSM"))
            types[i] = lastType;
            lastType = types[i];

    lastType = sor;
    for (int i = 0; i < strLength; ++i)

        String t = types[i];
        if (t.equals("EN"))
            types[i] = (lastType.equals("AL")) ? "AN" : "EN";
        else if (t.equals("R") || t.equals("L") || t.equals("AL"))
            lastType = t;

    for (int i = 0; i < strLength; ++i)

        String t = types[i];
        if (t.equals("AL"))
            types[i] = "R";

    for (int i = 1; i < strLength - 1; ++i)
        if (types[i].equals("ES") && types[i - 1].equals("EN") && types[i + 1].equals("EN"))
            types[i] = "EN";
        if (types[i].equals("CS") && (types[i - 1].equals("EN") || types[i - 1].equals("AN")) && types[i + 1] == types[i - 1])
            types[i] = types[i - 1];

    for (int i = 0; i < strLength; ++i)
        if (types[i].equals("EN"))
            // do before
            for (int j = i - 1; j >= 0; --j)
                if (!types[j].equals("ET"))
                types[j] = "EN";
            // do after
            for (int j = i + 1; j < strLength; --j)
                if (!types[j].equals("ET"))
                types[j] = "EN";

    for (int i = 0; i < strLength; ++i)

        String t = types[i];
        if (t.equals("WS") || t.equals("ES") || t.equals("ET") || t.equals("CS"))
            types[i] = "ON";

    lastType = sor;
    for (int i = 0; i < strLength; ++i)

        String t = types[i];
        if (t.equals("EN"))
            types[i] = (lastType.equals("L")) ? "L" : "EN";
        else if (t.equals("R") || t.equals("L"))
            lastType = t;

    for (int i = 0; i < strLength; ++i)
        if (types[i].equals("ON"))

            int end = FindUnequal(types, i + 1, "ON");

            String before = sor;
            if (i > 0)
                before = types[i - 1];

            String after = eor;
            if (end + 1 < strLength)
                after = types[end + 1];
            if (!before.equals("L"))
                before = "R";
            if (!after.equals("L"))
                after = "R";
            if (before == after)
                SetValues(types, i, end, before);
            i = end - 1; // reset to end (-1 so next iteration is ok)

    for (int i = 0; i < strLength; ++i)
        if (types[i].equals("ON"))
            types[i] = e;

    for (int i = 0; i < strLength; ++i)

        String t = types[i];
        if (IsEven(levels[i]))
            if (t.equals("R"))
                levels[i] += 1;
            else if (t.equals("AN") || t.equals("EN"))
                levels[i] += 2;
            if (t.equals("L") || t.equals("AN") || t.equals("EN"))
                levels[i] += 1;

    int highestLevel = -1;
    int lowestOddLevel = 99;
    int ii = levels.length;
    for (int i = 0; i < ii; ++i)

        int level = levels[i];
        if (highestLevel < level)
            highestLevel = level;
        if (lowestOddLevel > level && IsOdd(level))
            lowestOddLevel = level;

    for (int level = highestLevel; level >= lowestOddLevel; --level)

        int start = -1;
        ii = levels.length;
        for (int i = 0; i < ii; ++i)
            if (levels[i] < level)
                if (start >= 0)
                    chars = ReverseValues(chars, start, i);
                    start = -1;
            else if (start < 0)
                start = i;
        if (start >= 0)
            chars = ReverseValues(chars, start, levels.length);

    String result = "";
    ii = chars.length;
    for (int i = 0; i < ii; ++i)

        char ch = chars[i];
        if (ch != '<' && ch != '>')
            result += ch;

    return new BidiResult(result, isLtr);