Есть ли pdf pdf для извлечения текста, который извлекает арабский текст из pdf.
Я использую itextpdf api, он отлично работает при извлечении английского, но не извлекает арабский текст.
Это мой коддля извлечения текста в pdf:
private String extractPDF(String path) throws IOException {
String parsedText = "";
PdfReader reader = new PdfReader(path);
int n = reader.getNumberOfPages();
for (int page = 0; page < n; page++) {
parsedText = parsedText + PdfTextExtractor.getTextFromPage(reader, page + 1).trim() + "\n"; //Extracting the content from the different pages
}
reader.close();
return parsedText;
}
и это входные данные pdf: arabic.pdf
Обновление:
я могу извлечь арабскийтекст, но не сохраняет порядок строк, и это мой код:
private String extractPDF(String name) throws IOException {
PdfReader reader = new PdfReader(name);
StringBuilder text = new StringBuilder();
for (int i=1;i<=reader.getNumberOfPages();i++){
String data = PdfTextExtractor.getTextFromPage(reader,i,new SimpleTextExtractionStrategy());
text.append(Bidi.BidiText(data,1).getText());
}
return text.toString();
}
pdf текст:
بسم الله الرحمن الرحيم
السلام عليكمورحمة الله وبركاته
* тысяча двадцать-одина * سبحان الله
выход:
1 025 * سبحان الله * одна тысяча двадцать шесть *
السلام عليكم ورحمة الله وبركاته * тысяча двадцать восемь *
بسم الله الرحمن الرحيم
это мой код для метода BidiText:
public static BidiResult BidiText(String str, int startLevel)
{
boolean isLtr = true;
int strLength = str.length();
if (strLength == 0)
{
return new BidiResult(str, false);
}
// get types, fill arrays
char[] chars = new char[strLength];
String[] types = new String[strLength];
String[] oldtypes = new String[strLength];
int numBidi = 0;
for (int i = 0; i < strLength; ++i)
{
chars[i] = str.charAt(i);
char charCode = str.charAt(i);
String charType = "L";
if (charCode <= 0x00ff)
{
charType = BaseTypes[charCode];
}
else if (0x0590 <= charCode && charCode <= 0x05f4)
{
charType = "R";
}
else if (0x0600 <= charCode && charCode <= 0x06ff)
{
charType = ArabicTypes[charCode & 0xff];
}
else if (0x0700 <= charCode && charCode <= 0x08AC)
{
charType = "AL";
}
if (charType.equals("R") || charType.equals("AL") || charType.equals("AN"))
{
numBidi++;
}
oldtypes[i] = types[i] = charType;
}
if (numBidi == 0)
{
return new BidiResult(str, true);
}
if (startLevel == -1)
{
if ((strLength / numBidi) < 0.3)
{
startLevel = 0;
}
else
{
isLtr = false;
startLevel = 1;
}
}
int[] levels = new int[strLength];
for (int i = 0; i < strLength; ++i)
{
levels[i] = startLevel;
}
String e = IsOdd(startLevel) ? "R" : "L";
String sor = e;
String eor = sor;
String lastType = sor;
for (int i = 0; i < strLength; ++i)
{
if (types[i].equals("NSM"))
{
types[i] = lastType;
}
else
{
lastType = types[i];
}
}
lastType = sor;
for (int i = 0; i < strLength; ++i)
{
String t = types[i];
if (t.equals("EN"))
{
types[i] = (lastType.equals("AL")) ? "AN" : "EN";
}
else if (t.equals("R") || t.equals("L") || t.equals("AL"))
{
lastType = t;
}
}
for (int i = 0; i < strLength; ++i)
{
String t = types[i];
if (t.equals("AL"))
{
types[i] = "R";
}
}
for (int i = 1; i < strLength - 1; ++i)
{
if (types[i].equals("ES") && types[i - 1].equals("EN") && types[i + 1].equals("EN"))
{
types[i] = "EN";
}
if (types[i].equals("CS") && (types[i - 1].equals("EN") || types[i - 1].equals("AN")) && types[i + 1] == types[i - 1])
{
types[i] = types[i - 1];
}
}
for (int i = 0; i < strLength; ++i)
{
if (types[i].equals("EN"))
{
// do before
for (int j = i - 1; j >= 0; --j)
{
if (!types[j].equals("ET"))
{
break;
}
types[j] = "EN";
}
// do after
for (int j = i + 1; j < strLength; --j)
{
if (!types[j].equals("ET"))
{
break;
}
types[j] = "EN";
}
}
}
for (int i = 0; i < strLength; ++i)
{
String t = types[i];
if (t.equals("WS") || t.equals("ES") || t.equals("ET") || t.equals("CS"))
{
types[i] = "ON";
}
}
lastType = sor;
for (int i = 0; i < strLength; ++i)
{
String t = types[i];
if (t.equals("EN"))
{
types[i] = (lastType.equals("L")) ? "L" : "EN";
}
else if (t.equals("R") || t.equals("L"))
{
lastType = t;
}
}
for (int i = 0; i < strLength; ++i)
{
if (types[i].equals("ON"))
{
int end = FindUnequal(types, i + 1, "ON");
String before = sor;
if (i > 0)
{
before = types[i - 1];
}
String after = eor;
if (end + 1 < strLength)
{
after = types[end + 1];
}
if (!before.equals("L"))
{
before = "R";
}
if (!after.equals("L"))
{
after = "R";
}
if (before == after)
{
SetValues(types, i, end, before);
}
i = end - 1; // reset to end (-1 so next iteration is ok)
}
}
for (int i = 0; i < strLength; ++i)
{
if (types[i].equals("ON"))
{
types[i] = e;
}
}
for (int i = 0; i < strLength; ++i)
{
String t = types[i];
if (IsEven(levels[i]))
{
if (t.equals("R"))
{
levels[i] += 1;
}
else if (t.equals("AN") || t.equals("EN"))
{
levels[i] += 2;
}
}
else
{
if (t.equals("L") || t.equals("AN") || t.equals("EN"))
{
levels[i] += 1;
}
}
}
int highestLevel = -1;
int lowestOddLevel = 99;
int ii = levels.length;
for (int i = 0; i < ii; ++i)
{
int level = levels[i];
if (highestLevel < level)
{
highestLevel = level;
}
if (lowestOddLevel > level && IsOdd(level))
{
lowestOddLevel = level;
}
}
for (int level = highestLevel; level >= lowestOddLevel; --level)
{
int start = -1;
ii = levels.length;
for (int i = 0; i < ii; ++i)
{
if (levels[i] < level)
{
if (start >= 0)
{
chars = ReverseValues(chars, start, i);
start = -1;
}
}
else if (start < 0)
{
start = i;
}
}
if (start >= 0)
{
chars = ReverseValues(chars, start, levels.length);
}
}
String result = "";
ii = chars.length;
for (int i = 0; i < ii; ++i)
{
char ch = chars[i];
if (ch != '<' && ch != '>')
{
result += ch;
}
}
return new BidiResult(result, isLtr);
}