/**
* Copyright (c) 2006 by Know-Center, Graz, Austria
*
* This software is the confidential and proprietary information of Know-Center,
* Graz, Austria. You shall not disclose such Confidential Information and shall
* use it only in accordance with the terms of the license agreement you entered
* into with Know-Center.
*
* KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
* NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
* LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
* DERIVATIVES.
*
* $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
*/
package at.knowcenter.wag.egov.egiz.pdf;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;
import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.pdfbox.util.PDFTextStripper;
import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
import at.gv.egiz.pdfas.framework.input.PdfDataSource;
import at.gv.egiz.pdfas.performance.PerformanceCounters;
import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;
/**
* Contains helper functions for textual signatures.
*
* @author wprinz
*/
public class TextualSignature
{
  /**
   * The logger definition.
   */
  private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);

  /**
   * Extracts the document text from a given pdf.
   *
   * The pdf is first re-written page by page with iText (the same way a
   * document is rebuilt when being signed, but without adding content) and the
   * re-written document is then parsed with PDFBox. The rotation of the first
   * page, as reported by iText, is handed to the text stripper.
   *
   * @param pdfDataSource
   *          The pdf to extract the text from.
   * @return Returns the extracted document text.
   * @throws TextExtractionException
   *           Wraps any IOException or DocumentException raised while
   *           re-writing, parsing or stripping the document.
   */
  public static String extractTextTextual(PdfDataSource pdfDataSource) throws TextExtractionException
  {
    PerformanceCounters.textExtractions.increment();
    try
    {
      // iText
      byte[] pdf_data = pdfDataSource.getAsByteArray();
      PdfReader reader = new PdfReader(pdf_data);

      // PERF: PDF normalization needs byte array - this is costly
      ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
      int first_page_rotation = copyPagesNormalized(reader, baos);
      byte[] normalizedPDF = baos.toByteArray();

      // PDFBox parser
      PDFParser parser = new PDFParser(new ByteArrayInputStream(normalizedPDF));
      File temporary_dir = SettingsReader.getTemporaryDirectory();
      parser.setTempDirectory(temporary_dir);
      parser.parse();

      PDDocument doc = parser.getPDDocument();
      boolean extracted = false;
      try
      {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(false);
        stripper.setGetFirstPageRotationFromThis(true);
        stripper.setFirstPageRotation(first_page_rotation);

        logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
        String text = stripper.getText(doc);
        logger_.debug("TextualSignator extractTextTextual: Stripping text ended");

        extracted = true;
        return text;
      }
      finally
      {
        if (extracted)
        {
          // Success path: a failing close is still reported to the caller
          // (it ends up in the IOException handler below).
          doc.close();
        }
        else
        {
          // Failure path: release the document without masking the
          // exception that is already propagating.
          closeQuietly(doc);
        }
      }
    }
    catch (IOException e)
    {
      throw new TextExtractionException(e);
    }
    catch (DocumentException e)
    {
      throw new TextExtractionException(e);
    }
  }

  /**
   * Extract non textual data from pdf.
   *
   * Loads the pdf with PDFBox, delegates to
   * {@link #extractNonTextInfo(PDDocument)} and closes the document again.
   *
   * @param pdfDataSource
   *          The pdf to inspect.
   *
   * @see org.pdfbox.ExtractImages
   *
   * @return List of {@link NonTextObjectInfo}; an empty list if the pdf could
   *         not be loaded.
   */
  public static List extractNonTextInfo(PdfDataSource pdfDataSource)
  {
    PDDocument doc = null;
    try
    {
      doc = PDDocument.load(pdfDataSource.createInputStream());
      return extractNonTextInfo(doc);
    }
    catch (IOException e)
    {
      logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
      return new ArrayList();
    }
    finally
    {
      // The document is closed exactly once, here.
      closeQuietly(doc);
    }
  }

  /**
   * Extract non textual data from pdf.
   *
   * Walks all pages of the document and records one
   * {@link NonTextObjectInfo} per image found in the page resources. If
   * reading the images of a page fails, the error is logged and the objects
   * collected so far are returned.
   *
   * @see org.pdfbox.ExtractImages
   *
   * @param document
   *          The already loaded document; it is not closed by this method.
   * @return List of {@link NonTextObjectInfo}
   */
  public static List extractNonTextInfo(PDDocument document)
  {
    // extraction does not work with the normalized pdf from extractTextTextual
    logger_.debug("going to extract non text objects");

    List objectInfos = new ArrayList();
    int pageNr = 0;
    for (Iterator pageIter = document.getDocumentCatalog().getAllPages().iterator(); pageIter.hasNext();)
    {
      pageNr++;
      PDPage page = (PDPage) pageIter.next();

      PDResources resources = page.getResources();
      if (resources == null)
      {
        // A page without a resources dictionary of its own has nothing to
        // report here; skip it instead of failing with a NullPointerException.
        continue;
      }

      Map images;
      try
      {
        images = resources.getImages();
      }
      catch (IOException e)
      {
        logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
        return objectInfos;
      }
      if (images == null)
      {
        continue;
      }

      for (Iterator imageIter = images.entrySet().iterator(); imageIter.hasNext();)
      {
        Map.Entry entry = (Map.Entry) imageIter.next();
        String key = (String) entry.getKey();
        PDXObjectImage image = (PDXObjectImage) entry.getValue();

        NonTextObjectInfo objInfo = new NonTextObjectInfo();
        objInfo.setHeight(image.getHeight());
        objInfo.setWidth(image.getWidth());
        objInfo.setName(key + image.getSuffix());
        objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
        objInfo.setPageNr(pageNr);
        objectInfos.add(objInfo);

        if (logger_.isDebugEnabled())
        {
          logger_.debug("Found non text object: " + objInfo.toString());
        }
      }
    }

    if (logger_.isDebugEnabled())
    {
      logger_.debug("extracted non textual objects count: " + objectInfos.size());
    }
    logger_.debug("finished extracting non text objects");
    return objectInfos;
  }

  /**
   * Normalizes a given binary PDF to a version PDFbox can handle correctly.
   *
   * PDFbox has serious problems with documents that use incremental updates or
   * XObject forms. Therefore use this to remove incremental updates and create
   * a streamlined document.
   *
   * Note that this has nothing to do with text normalization. It just unifies
   * the PDF documents that are fed into PDFbox for text extraction and page
   * length determination.
   *
   * @param pdfDataSource
   *          The input pdf to be normalized.
   * @return Returns the normalized pdf.
   * @throws IOException
   *           If the input pdf cannot be read.
   * @throws DocumentException
   *           If iText fails to create the normalized document.
   */
  public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException
  {
    // iText
    byte[] pdf_data = pdfDataSource.getAsByteArray();
    PdfReader reader = new PdfReader(pdf_data);

    // PERF: PDF Normalization needs byte array
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    copyPagesNormalized(reader, baos);
    return baos.toByteArray();
  }

  /**
   * Re-writes every page of the given reader into a fresh iText document that
   * is written to the target stream.
   *
   * Each page is imported and added as a template at the origin, using the
   * page size including rotation as reported by the reader. Note that this
   * adds an XObject form per page to the new document; the XObject form
   * contains the content of the page.
   *
   * For some reason the Reader -> ImportPage -> Writer mechanism produces
   * problems en masse. The text extractor may not be able to extract proper
   * text from documents created with this method (although it works when a
   * Table is appended)... very fragile.
   *
   * @param reader
   *          The reader providing the source pages.
   * @param target
   *          The in-memory stream receiving the re-written pdf.
   * @return Returns the rotation of the first page, or 0 if the reader reports
   *         no pages.
   * @throws IOException
   *           Declared so callers keep a uniform error handling for the
   *           normalization step.
   * @throws DocumentException
   *           If the iText writer cannot be created.
   */
  private static int copyPagesNormalized(PdfReader reader, ByteArrayOutputStream target) throws IOException, DocumentException
  {
    int first_page_rotation = 0;

    Document document = new Document();
    PdfWriter writer = PdfWriter.getInstance(document, target);
    document.open();
    PdfContentByte cb = writer.getDirectContent();

    int page_count = reader.getNumberOfPages();
    for (int page_num = 1; page_num <= page_count; page_num++)
    {
      Rectangle size_with_rotation = reader.getPageSizeWithRotation(page_num);
      if (page_num == 1)
      {
        first_page_rotation = size_with_rotation.getRotation();
      }

      // The page size has to be set before the page is started.
      document.setPageSize(size_with_rotation);
      document.newPage();

      PdfImportedPage page = writer.getImportedPage(reader, page_num);
      cb.addTemplate(page, 0, 0);
    }

    // Closing the document flushes the complete pdf into the target stream.
    document.close();
    return first_page_rotation;
  }

  /**
   * Closes the given document and logs a failure instead of propagating it.
   *
   * @param doc
   *          The document to close; may be null, in which case nothing
   *          happens.
   */
  private static void closeQuietly(PDDocument doc)
  {
    if (doc == null)
    {
      return;
    }
    try
    {
      doc.close();
    }
    catch (IOException e)
    {
      logger_.error("error closing pddocument", e);
    }
  }
}