diff options
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov')
-rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java new file mode 100644 index 0000000..f19a2f4 --- /dev/null +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java @@ -0,0 +1,171 @@ +package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+
+import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * Method for object extraction from pdf documents.
+ * This uses pdf-box 0.8.0, not 0.7.2. Packages org.apache.pdfbox instead org.pdfbox!
+ * @author dferbas
+ *
+ */
+public class ObjectExtractor {
+ private static Logger log = Logger.getLogger(ObjectExtractor.class);
+
+ /**
+ * Find annotation objects in pdf documents
+ * @param objectInfos
+ * @param pageNr
+ * @param page
+ */
+ private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
+ List annotations;
+ try {
+ annotations = page.getAnnotations();
+ } catch (IOException e) {
+ log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
+ return;
+ }
+ for (Iterator it = annotations.iterator(); it.hasNext();) {
+ try {
+ PDAnnotation anno = (PDAnnotation) it.next();
+ log.debug("found annotation: " +anno);
+ String ft = anno.getDictionary().getNameAsString("FT");
+ if (ft != null && ft.equals("Sig")) { // skip signature widgets
+ log.debug("found signature widged, skip further extraction");
+ break;
+ }
+ NonTextObjectInfo objInfo = new NonTextObjectInfo();
+ objInfo.setName(anno.getDictionary().getString( "NM" ));
+ objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
+
+ String subtype = anno.getDictionary().getNameAsString("Subtype");
+ String subj = anno.getDictionary().getString("Subj");
+ if (subj != null) {
+ subtype+= "/" + subj;
+ }
+ objInfo.setSubType(subtype);
+
+ objInfo.setPageNr(pageNr);
+ objInfo.setHeight(anno.getRectangle().getHeight());
+ objInfo.setWidth(anno.getRectangle().getWidth());
+ objectInfos.add(objInfo);
+ } catch (Exception ex) {
+ log.info("error reading non text object info key " + ex);
+ }
+ }
+ }
+
+ /**
+ * Find resources (images) in pdf documents
+ * @param objectInfos
+ * @param pageNr
+ * @param page
+ */
+ private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
+ PDResources resources = page.getResources();
+
+ Map images;
+ try {
+ images = resources.getImages();
+ } catch (IOException e) {
+ log.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
+ return;
+ }
+ if (images != null) {
+ Iterator imageIter = images.keySet().iterator();
+ while (imageIter.hasNext()) {
+ NonTextObjectInfo objInfo = new NonTextObjectInfo();
+ String key = (String) imageIter.next();
+ PDXObjectImage image = (PDXObjectImage) images.get(key);
+ System.err.println(image);
+
+ objInfo.setHeight(image.getHeight());
+ objInfo.setWidth(image.getWidth());
+ objInfo.setName(key + image.getSuffix());
+ objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
+ objInfo.setPageNr(pageNr);
+ objectInfos.add(objInfo);
+ objInfo.setSubType(image.getSuffix());
+
+ if (log.isDebugEnabled()) {
+ log.debug("Found non text object: " + objInfo.toString());
+ }
+ }
+ }
+ }
+
+ /**
+ * Extract non textual data from pdf.
+ *
+ * @see org.pdfbox.ExtractImages
+ *
+ * @param document
+ * @return List of {@link NonTextObjectInfo}
+ */
+ public static List extractNonTextInfo(PDDocument document) {
+ // extraction does not work with the normalized pdf from extractTextTextual
+ log.debug("going to extract non text objects");
+ List objectInfos = new ArrayList();
+ List pages = document.getDocumentCatalog().getAllPages();
+ Iterator iter = pages.iterator();
+ int pageNr = 0;
+ while (iter.hasNext()) {
+ pageNr++;
+ PDPage page = (PDPage) iter.next();
+ doExtractFromResources(objectInfos, pageNr, page);
+ doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
+ }
+ if (log.isDebugEnabled()) {
+ log.debug("extracted non textual objects count: " + objectInfos.size());
+ }
+ log.debug("going to extract non text objects");
+ return objectInfos;
+
+ }
+
+ /**
+ * Extract non textual data from pdf.
+ * @param pdfDataSource
+ *
+ * @see org.pdfbox.ExtractImages
+ *
+ * @return List of {@link NonTextObjectInfo}
+ */
+ public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
+
+ PDDocument doc = null;
+ try {
+ doc = PDDocument.load(pdfDataSource.createInputStream());
+ List res = extractNonTextInfo(doc);
+ doc.close();
+ return res;
+ } catch (IOException e) {
+ log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
+ return new ArrayList();
+ } finally {
+ if (doc != null) {
+ try {
+ doc.close();
+ } catch (IOException e) {
+ log.error("error closing pddocument", e);
+ }
+ }
+ }
+
+ }
+
+}
|