package org.eaglei.lexical.lucene;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

/**
 * Wrapper class for a Lucene index that is used for entity extraction and auto-suggestion.
 * 
 * The following index schema is used:
 * <ul>
 * <li>uri: Stored, indexed, non-analyzed field that holds the URI of the resource in the index.
 *          If an ontology is being indexed, this is the class URI of the class. For an index of
 *          instances, this is the resource instance. Other types of objects (e.g. historical
 *          searches) will be represented as RDF resources with URIs. This field is the key used
 *          to replace an existing document on update, so it must be indexed.
 * <li>label: Stored, analyzed, multi-valued field that holds all labels for the resource. 
 *            This is the source of text against which entity extraction and auto-suggestions
 *            are made. Term vectors with positions and offsets are stored for this field.
 * <li>pref_label: Stored, non-indexed field that holds the preferred label of the resource.
 *                 Omitted when no preferred label is supplied.
 * <li>type: Stored, non-analyzed, multi-valued field that holds all type URIs for the
 *           resource. There may be no values here.  
 * </ul>
 * 
 * @author rfrost
 */
public abstract class LuceneEntityExtractionIndexer {

    /** Name of the field holding the resource URI (the document key). */
    public static final String URI = "uri";
    /** Name of the multi-valued field holding all labels of the resource. */
    public static final String LABEL = "label";
    /** Name of the field holding the preferred label of the resource. */
    public static final String PREF_LABEL = "pref_label";
    /** Name of the multi-valued field holding the type URIs of the resource. */
    public static final String TYPE = "type";
    
    private final Analyzer indexAnalyzer;
    private final Directory directory;

    /**
     * Creates a new indexer that writes to the given directory using the given analyzer.
     * 
     * @param analyzer Analyzer handed to the IndexWriter when the index is built
     * @param directory Directory in which the index is created
     */
    public LuceneEntityExtractionIndexer(final Analyzer analyzer, final Directory directory) {
        this.indexAnalyzer = analyzer;
        this.directory = directory;
    }

    /**
     * @return the analyzer used when building the index
     */
    public Analyzer getIndexAnalyzer() {
        return this.indexAnalyzer;
    }
    
    /**
     * @return the directory the index is written to
     */
    public Directory getDirectory() {
        return this.directory;
    }

    /**
     * Populates the Lucene index. A new index is created in the directory (replacing any
     * existing one), {@link #addDocuments(IndexWriter)} is invoked to fill it, and the index
     * is then optimized and closed.
     * <p>
     * If any step fails, the writer is rolled back so that the write lock on the directory is
     * released and no partially built index is committed; the original failure is rethrown.
     * 
     * @throws IOException Thrown if an error occurs while building the index
     */
    public void index() throws IOException {
        final IndexWriter iwriter = new IndexWriter(directory, indexAnalyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);
        boolean success = false;
        try {
            addDocuments(iwriter);
            iwriter.optimize();
            iwriter.close();
            success = true;
        } finally {
            if (!success) {
                try {
                    // rollback() discards uncommitted changes and closes the writer,
                    // which releases the write lock held on the directory
                    iwriter.rollback();
                } catch (IOException ignored) {
                    // deliberately ignored so the failure that brought us here propagates
                }
            }
        }
    }

    /**
     * Adds the documents to the IndexWriter.
     * 
     * @param iwriter Index writer to add the documents to
     * @throws IOException Thrown if an error occurs
     */
    protected abstract void addDocuments(IndexWriter iwriter) throws IOException;

    
    /**
     * Adds or updates the document for the specified URI. Any document previously indexed
     * under the same URI is replaced by the newly built document.
     * 
     * @param uri URI of the resource
     * @param prefLabel The preferred label; may be null, in which case no pref_label field is added
     * @param labels All labels to index; may be null
     * @param typeURIs All type URIs; may be null
     * @param iwriter Index writer
     * @throws IOException Thrown if an error occurs
     */
    protected void updateDocument(final String uri, final String prefLabel, 
            final List<String> labels, final List<String> typeURIs, final IndexWriter iwriter) throws IOException {
        
        final Document doc = new Document();

        // The URI must be indexed (as a single, non-analyzed term): it is the key used by
        // iwriter.updateDocument() below to find and delete the previous version of this
        // document. A non-indexed field would never match, leaving duplicates in the index.
        doc.add(new Field(URI, uri, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // create a stored, non-indexed field for the pref label
        if (prefLabel != null) {
            doc.add(new Field(PREF_LABEL, prefLabel, Field.Store.YES, Field.Index.NO));
        }
        if (labels != null) {
            for (String label: labels) {
                // create an indexed field with position offsets for computation of 
                // highlights
                doc.add(new Field(LABEL, label, Field.Store.YES,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            }
        }
        if (typeURIs != null) {
            for (String type: typeURIs) {
                // create an indexed, non-analyzed field for each type
                doc.add(new Field(TYPE, type, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
        }
        
        // replaces any existing document whose indexed uri term equals this URI
        iwriter.updateDocument(new Term(URI, uri), doc);
    }
        

}
