From c68ad0ec056b37c82debebcecfcde1866d61b4d9 Mon Sep 17 00:00:00 2001 From: tknall Date: Tue, 25 Nov 2008 12:03:13 +0000 Subject: Removing pdfbox from source. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@301 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../org/pdfbox/searchengine/lucene/IndexFiles.java | 308 ---------------- .../searchengine/lucene/LucenePDFDocument.java | 387 --------------------- .../org/pdfbox/searchengine/lucene/package.html | 9 - src/main/java/org/pdfbox/searchengine/package.html | 9 - 4 files changed, 713 deletions(-) delete mode 100644 src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java delete mode 100644 src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java delete mode 100644 src/main/java/org/pdfbox/searchengine/lucene/package.html delete mode 100644 src/main/java/org/pdfbox/searchengine/package.html (limited to 'src/main/java/org/pdfbox/searchengine') diff --git a/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java b/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java deleted file mode 100644 index dd36dd9..0000000 --- a/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java +++ /dev/null @@ -1,308 +0,0 @@ -package org.pdfbox.searchengine.lucene; - -/* - * This source was originally written as an example for the lucene project. - * It has been modified to use PDFBox as a lucene document creator. - * -Ben Litchfield - * - *==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -import org.apache.lucene.analysis.standard.StandardAnalyzer; - -import org.apache.lucene.demo.HTMLDocument; - -import org.apache.lucene.document.Document; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; - -import java.util.Arrays; - - -import java.io.File; -import java.io.IOException; - -import java.util.Date; - - -/** - * This is a class that will index some files on a local filesystem. This code - * was modified from a demo that comes with the lucene search engine. - * - * @author Lucene team - * @author Ben Litchfield (ben@csh.rit.edu) - * - * @version $Revision: 1.6 $ - */ -public class IndexFiles -{ - private boolean deleting = false; // true during deletion pass - private IndexReader reader; // existing index - private IndexWriter writer; // new index being built - private TermEnum uidIter; // document id iterator - - /** - * This is the main entry point for the indexer. - * - * @param argv The command line arguments. 
- */ - public static void main(String[] argv) - { - - String index = "index"; - boolean create = false; - File root = null; - - String usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index ] "; - - if (argv.length == 0) - { - System.err.println("Usage: " + usage); - return; - } - - for (int i = 0; i < argv.length; i++) - { - if (argv[i].equals("-index")) - { // parse -index option - index = argv[++i]; - } - else if (argv[i].equals("-create")) - { // parse -create option - create = true; - } - else if (i != argv.length-1) - { - System.err.println("Usage: " + usage); - return; - } - else - { - System.out.println( "root=" +argv[i] ); - root = new File(argv[i]); - } - } - IndexFiles indexer = new IndexFiles(); - indexer.index( root, create, index ); - } - - /** - * This will index a directory. - * - * @param root The root directory to start indexing. - * @param create Should we create a new index? - * @param index The name of the index. - */ - public void index( File root, boolean create, String index ) - { - - try - { - Date start = new Date(); - - writer = new IndexWriter(index, new StandardAnalyzer(), create); - writer.maxFieldLength = 1000000; - - if (!create) - { // delete stale docs - deleting = true; - indexDocs(root, index, create); - } - - indexDocs(root, index, create); // add new docs - - System.out.println("Optimizing index..."); - writer.optimize(); - writer.close(); - - Date end = new Date(); - - System.out.print(end.getTime() - start.getTime()); - System.out.println(" total milliseconds"); - - } - catch( Exception e ) - { - e.printStackTrace(); - } - } - - /** - * Walk directory hierarchy in uid order, while keeping uid iterator from - * existing index in sync. Mismatches indicate one of: (a) old documents to - * be deleted; (b) unchanged documents, to be left alone; or (c) new - * documents, to be indexed. - * - * @param file The directory to index. - * @param index The index to add the file to. 
- * @param create A flag telling if we should create the index. - * - * @throws Exception If there is any error indexing the directory. - */ - private void indexDocs(File file, String index, boolean create) throws Exception - { - if (!create) - { // incrementally update - - reader = IndexReader.open(index); // open existing index - uidIter = reader.terms(new Term("uid", "")); // init uid iterator - - indexDocs(file); - - if (deleting) - { // delete rest of stale docs - while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) ) - { - System.out.println("deleting " + - HTMLDocument.uid2url(uidIter.term().text())); - reader.delete(uidIter.term()); - uidIter.next(); - } - deleting = false; - } - - uidIter.close(); // close uid iterator - reader.close(); // close existing index - - } - else - { - indexDocs(file); - } - } - - - private void indexDocs(File file) throws Exception - { - if (file.isDirectory()) - { // if a directory - String[] files = file.list(); // list its files - Arrays.sort(files); // sort the files - for (int i = 0; i < files.length; i++) // recursively index them - { - indexDocs(new File(file, files[i])); - } - } - else - { - if (uidIter != null) - { - String uid = HTMLDocument.uid(file); // construct uid for doc - - while( uidIter.term() != null && - uidIter.term().field().equals( "uid" ) && - uidIter.term().text().compareTo(uid) < 0) - { - if (deleting) - { // delete stale docs - System.out.println("deleting " + - HTMLDocument.uid2url(uidIter.term().text())); - reader.delete(uidIter.term()); - } - uidIter.next(); - } - if( uidIter.term() != null && - uidIter.term().field().equals( "uid" ) && - uidIter.term().text().compareTo(uid) == 0) - { - System.out.println( "Next uid=" +uidIter ); - uidIter.next(); // keep matching docs - } - } - else - { - try - { - addDocument( file ); - } - catch( IOException e ) - { - //catch exception and move onto the next document - System.out.println( e.getMessage() ); - } - } - } - } - - private void 
addDocument( File file ) throws IOException, InterruptedException - { - String path = file.getName().toUpperCase(); - Document doc = null; - //Gee, this would be a great place for a command pattern - if( path.endsWith(".HTML") || // index .html files - path.endsWith(".HTM") || // index .htm files - path.endsWith(".TXT")) - { - System.out.println( "Indexing Text document: " + file ); - doc = HTMLDocument.Document(file); - } - else if( path.endsWith( ".PDF" ) ) - { - System.out.println( "Indexing PDF document: " + file ); - doc = LucenePDFDocument.getDocument( file ); - } - else - { - System.out.println( "Skipping " + file ); - } - - if( doc != null ) - { - writer.addDocument(doc); - } - } -} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java b/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java deleted file mode 100644 index 1bb8d0c..0000000 --- a/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java +++ /dev/null @@ -1,387 +0,0 @@ -/** - * Copyright (c) 2003, www.pdfbox.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. Neither the name of pdfbox; nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * http://www.pdfbox.org - * - */ -package org.pdfbox.searchengine.lucene; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.io.StringReader; -import java.io.StringWriter; - -import java.net.URL; -import java.net.URLConnection; - -import java.util.Date; - -import org.apache.lucene.document.DateField; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import org.pdfbox.pdmodel.PDDocument; -import org.pdfbox.pdmodel.PDDocumentInformation; - -import org.pdfbox.exceptions.CryptographyException; -import org.pdfbox.exceptions.InvalidPasswordException; - -import org.pdfbox.util.PDFTextStripper; - -/** - * This class is used to create a document for the lucene search engine. - * This should easily plug into the IndexHTML or IndexFiles that comes with - * the lucene project. This class will populate the following fields. - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
Lucene Field Name | Description
path | File system path if loaded from a file
url | URL to PDF document
contents | Entire contents of PDF document, indexed but not stored
summary | First 500 characters of content
modified | The modified date/time according to the url or path
uid | A unique identifier for the Lucene document.
CreationDate | From PDF meta-data if available
Creator | From PDF meta-data if available
Keywords | From PDF meta-data if available
ModificationDate | From PDF meta-data if available
Producer | From PDF meta-data if available
Subject | From PDF meta-data if available
Trapped | From PDF meta-data if available
- * - * @author Ben Litchfield - * @version $Revision: 1.18 $ - */ -public final class LucenePDFDocument -{ - private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); - - - /** - * private constructor because there are only static methods. - */ - private LucenePDFDocument() - { - //utility class should not be instantiated - } - - /** - * This will get a lucene document from a PDF file. - * - * @param is The stream to read the PDF from. - * - * @return The lucene document. - * - * @throws IOException If there is an error parsing or indexing the document. - */ - public static Document getDocument( InputStream is ) throws IOException - { - Document document = new Document(); - addContent( document, is, "" ); - return document; - } - - /** - * This will get a lucene document from a PDF file. - * - * @param file The file to get the document for. - * - * @return The lucene document. - * - * @throws IOException If there is an error parsing or indexing the document. - */ - public static Document getDocument( File file ) throws IOException - { - Document document = new Document(); - - // Add the url as a field named "url". Use an UnIndexed field, so - // that the url is just stored with the document, but is not searchable. - document.add( Field.UnIndexed("path", file.getPath() ) ); - document.add(Field.UnIndexed("url", file.getPath().replace(FILE_SEPARATOR, '/'))); - - // Add the last modified date of the file a field named "modified". Use a - // Keyword field, so that it's searchable, but so that no attempt is made - // to tokenize the field into words. - document.add(Field.Keyword("modified", DateField.timeToString( file.lastModified() ))); - - String uid = file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + - DateField.timeToString(file.lastModified() ); - - // Add the uid as a field, so that index can be incrementally maintained. 
- // This field is not stored with document, it is indexed, but it is not - // tokenized prior to indexing. - document.add(new Field("uid", uid, false, true, false)); - - FileInputStream input = null; - try - { - input = new FileInputStream( file ); - addContent( document, input, file.getPath() ); - } - finally - { - if( input != null ) - { - input.close(); - } - } - - - // return the document - - return document; - } - - /** - * This will get a lucene document from a PDF file. - * - * @param url The file to get the document for. - * - * @return The lucene document. - * - * @throws IOException If there is an error parsing or indexing the document. - */ - public static Document getDocument( URL url ) throws IOException - { - Document document = new Document(); - URLConnection connection = url.openConnection(); - connection.connect(); - // Add the url as a field named "url". Use an UnIndexed field, so - // that the url is just stored with the document, but is not searchable. - document.add( Field.UnIndexed("url", url.toExternalForm() ) ); - - // Add the last modified date of the file a field named "modified". Use a - // Keyword field, so that it's searchable, but so that no attempt is made - // to tokenize the field into words. - document.add(Field.Keyword("modified", DateField.timeToString( connection.getLastModified()))); - - String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + - DateField.timeToString( connection.getLastModified() ); - - // Add the uid as a field, so that index can be incrementally maintained. - // This field is not stored with document, it is indexed, but it is not - // tokenized prior to indexing. 
- document.add(new Field("uid", uid, false, true, false)); - - InputStream input = null; - try - { - input = connection.getInputStream(); - addContent( document, input,url.toExternalForm() ); - } - finally - { - if( input != null ) - { - input.close(); - } - } - - // return the document - return document; - } - - /** - * This will add the contents to the lucene document. - * - * @param document The document to add the contents to. - * @param is The stream to get the contents from. - * @param documentLocation The location of the document, used just for debug messages. - * - * @throws IOException If there is an error parsing the document. - */ - private static void addContent( Document document, InputStream is, String documentLocation ) throws IOException - { - PDDocument pdfDocument = null; - try - { - pdfDocument = PDDocument.load( is ); - - - if( pdfDocument.isEncrypted() ) - { - //Just try using the default password and move on - pdfDocument.decrypt( "" ); - } - - //create a writer where to append the text content. - StringWriter writer = new StringWriter(); - PDFTextStripper stripper = new PDFTextStripper(); - stripper.writeText( pdfDocument, writer ); - - // Note: the buffer to string operation is costless; - // the char array value of the writer buffer and the content string - // is shared as long as the buffer content is not modified, which will - // not occur here. - String contents = writer.getBuffer().toString(); - - StringReader reader = new StringReader( contents ); - - // Add the tag-stripped contents as a Reader-valued Text field so it will - // get tokenized and indexed. 
- document.add( Field.Text( "contents", reader ) ); - - PDDocumentInformation info = pdfDocument.getDocumentInformation(); - if( info.getAuthor() != null ) - { - document.add(Field.Text( "Author", info.getAuthor() ) ); - } - if( info.getCreationDate() != null ) - { - Date date = info.getCreationDate().getTime(); - //for some reason lucene cannot handle dates before the epoch - //and throws a nasty RuntimeException, so we will check and - //verify that this does not happen - if( date.getTime() >= 0 ) - { - document.add(Field.Text("CreationDate", DateField.dateToString( date ) ) ); - } - } - if( info.getCreator() != null ) - { - document.add( Field.Text( "Creator", info.getCreator() ) ); - } - if( info.getKeywords() != null ) - { - document.add( Field.Text( "Keywords", info.getKeywords() ) ); - } - if( info.getModificationDate() != null ) - { - Date date = info.getModificationDate().getTime(); - //for some reason lucene cannot handle dates before the epoch - //and throws a nasty RuntimeException, so we will check and - //verify that this does not happen - if( date.getTime() >= 0 ) - { - document.add(Field.Text("ModificationDate", DateField.dateToString( date ) ) ); - } - } - if( info.getProducer() != null ) - { - document.add( Field.Text( "Producer", info.getProducer() ) ); - } - if( info.getSubject() != null ) - { - document.add( Field.Text( "Subject", info.getSubject() ) ); - } - if( info.getTitle() != null ) - { - document.add( Field.Text( "Title", info.getTitle() ) ); - } - if( info.getTrapped() != null ) - { - document.add( Field.Text( "Trapped", info.getTrapped() ) ); - } - - int summarySize = Math.min( contents.length(), 500 ); - String summary = contents.substring( 0, summarySize ); - // Add the summary as an UnIndexed field, so that it is stored and returned - // with hit documents for display. 
- document.add( Field.UnIndexed( "summary", summary ) ); - } - catch( CryptographyException e ) - { - throw new IOException( "Error decrypting document(" + documentLocation + "): " + e ); - } - catch( InvalidPasswordException e ) - { - //they didn't suppply a password and the default of "" was wrong. - throw new IOException( "Error: The document(" + documentLocation + - ") is encrypted and will not be indexed." ); - } - finally - { - if( pdfDocument != null ) - { - pdfDocument.close(); - } - } - } - - /** - * This will test creating a document. - * - * usage: java pdfparser.searchengine.lucene.LucenePDFDocument <pdf-document> - * - * @param args command line arguments. - * - * @throws IOException If there is an error. - */ - public static void main( String[] args ) throws IOException - { - if( args.length != 1 ) - { - System.err.println( "usage: java org.pdfbox.searchengine.lucene.LucenePDFDocument " ); - System.exit( 1 ); - } - System.out.println( "Document=" + getDocument( new File( args[0] ) ) ); - } -} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/searchengine/lucene/package.html b/src/main/java/org/pdfbox/searchengine/lucene/package.html deleted file mode 100644 index fbf3a38..0000000 --- a/src/main/java/org/pdfbox/searchengine/lucene/package.html +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - -This package holds classes that are used to integrate the PDFBox project with lucene. - - diff --git a/src/main/java/org/pdfbox/searchengine/package.html b/src/main/java/org/pdfbox/searchengine/package.html deleted file mode 100644 index 1cb4629..0000000 --- a/src/main/java/org/pdfbox/searchengine/package.html +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - -Classes that are used to integrate PDFBox with a search engine are located here. - - -- cgit v1.2.3