diff options
Diffstat (limited to 'src/main/java')
-rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java | 107 |
1 files changed, 100 insertions, 7 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 04b96fc..7f567c4 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -21,20 +21,25 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-
-import at.gv.egiz.pdfas.performance.PerformanceCounters;
-import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
-import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
import org.apache.log4j.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.pdmodel.PDResources;
+import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.pdfbox.util.PDFTextStripper;
+import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
+import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
-import at.knowcenter.wag.egov.egiz.exceptions.PresentableException;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
@@ -62,7 +67,7 @@ public class TextualSignature * @param pdf_stream
* The pdf_input stream.
* @return Returns the extracted document text.
- * @throws PresentableException
+ * @throws TextExtractionException
* Forwarded exception.
*/
public static String extractTextTextual(PdfDataSource pdfDataSource) throws TextExtractionException
@@ -168,6 +173,7 @@ public class TextualSignature logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
String text = stripper.getText(doc);
logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
+
doc.close();
//logger_.debug("TextualSignator extractTextTextual="+text);
return text;
@@ -182,6 +188,93 @@ public class TextualSignature throw new TextExtractionException(e);
}
}
+
+ /**
+  * Extracts information about the non-textual objects (images) of a PDF.
+  * <p>
+  * The PDF is loaded from the given data source, inspected via
+  * {@link #extractNonTextInfo(PDDocument)} and closed again. An
+  * {@link IOException} raised while loading or inspecting the document is
+  * logged and results in an empty list; it is not propagated to the caller.
+  * A failure while closing the document is only logged and does not discard
+  * the result that was already extracted.
+  *
+  * @param pdfDataSource
+  *          the data source the PDF document is loaded from
+  *
+  * @see org.pdfbox.ExtractImages
+  *
+  * @return List of {@link NonTextObjectInfo}; empty if the PDF could not be read
+  */
+ public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
+
+    PDDocument doc = null;
+    try {
+       doc = PDDocument.load(pdfDataSource.createInputStream());
+       // No explicit close here: the finally block below is the single place
+       // where the document is closed, on success as well as on failure.
+       return extractNonTextInfo(doc);
+    } catch (IOException e) {
+       logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
+       return new ArrayList();
+    } finally {
+       if (doc != null) {
+          try {
+             doc.close();
+          } catch (IOException e) {
+             logger_.error("error closing pddocument", e);
+          }
+       }
+    }
+
+ }
+
+ /**
+  * Extracts information about the images referenced by the pages of a PDF.
+  * <p>
+  * Every page of the document is visited in order. For each image found in a
+  * page's resources a {@link NonTextObjectInfo} is created carrying the image
+  * width and height, a name built from the resource key and the image suffix,
+  * the type {@link NonTextObjectInfo#TYPE_IMAGE} and the 1-based page number.
+  * Pages without resources or without images contribute nothing. If reading
+  * the images of a page fails with an {@link IOException}, the error is
+  * logged and the entries collected so far are returned.
+  *
+  * @see org.pdfbox.ExtractImages
+  *
+  * @param document
+  *          the already opened document to inspect; it is not closed here
+  * @return List of {@link NonTextObjectInfo}, one entry per image found
+  */
+ public static List extractNonTextInfo(PDDocument document) {
+    // extraction does not work with the normalized pdf from extractTextTextual
+    logger_.debug("going to extract non text objects");
+    List objectInfos = new ArrayList();
+    List pages = document.getDocumentCatalog().getAllPages();
+    Iterator iter = pages.iterator();
+    int pageNr = 0;
+    while (iter.hasNext()) {
+       pageNr++;
+       PDPage page = (PDPage) iter.next();
+       PDResources resources = page.getResources();
+       if (resources == null) {
+          // Guard against pages that carry no resources of their own.
+          // NOTE(review): resources inherited from a parent page node are not
+          // considered here - confirm whether that is intended.
+          continue;
+       }
+
+       Map images;
+       try {
+          images = resources.getImages();
+       } catch (IOException e) {
+          logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
+          return objectInfos;
+       }
+       if (images == null) {
+          continue;
+       }
+
+       // Iterate over the entries so key and image are obtained in one step.
+       Iterator imageIter = images.entrySet().iterator();
+       while (imageIter.hasNext()) {
+          Map.Entry entry = (Map.Entry) imageIter.next();
+          String key = (String) entry.getKey();
+          PDXObjectImage image = (PDXObjectImage) entry.getValue();
+
+          NonTextObjectInfo objInfo = new NonTextObjectInfo();
+          objInfo.setHeight(image.getHeight());
+          objInfo.setWidth(image.getWidth());
+          objInfo.setName(key + image.getSuffix());
+          objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
+          objInfo.setPageNr(pageNr);
+          objectInfos.add(objInfo);
+
+          if (logger_.isDebugEnabled()) {
+             logger_.debug("Found non text object: " + objInfo.toString());
+          }
+       }
+    }
+    if (logger_.isDebugEnabled()) {
+       logger_.debug("extracted non textual objects count: " + objectInfos.size());
+    }
+    // The original repeated the start message here, which misreported progress.
+    logger_.debug("finished extracting non text objects");
+    return objectInfos;
+
+ }
+
/**
* Normalizes a given binary PDF to a version PDFbox can handle correctly.
|