package org.eaglei.search.provider.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

import org.eaglei.model.EIClass;
import org.eaglei.model.EIEntity;
import org.eaglei.model.EIOntModel;
import org.eaglei.model.EIURI;
import org.eaglei.model.EagleIOntConstants;
import org.eaglei.model.jena.EagleIOntUtils;
import org.eaglei.model.jena.JenaEIOntModel;
import org.eaglei.search.provider.SearchResult;
import org.eaglei.search.datagen.AbstractGenerator;

import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.vocabulary.RDF;

/**
 * Creates a Lucene index for eagle-i RDF resource data. A Lucene document is created for each resource.
 * Each document contains the following fields corresponding to the direct resource datatype and object properties,
 * as well as fields corresponding to indirect relationships: inferred types and textual descriptions of resources
 * related by object properties.
 * <ul>
 * <li>uri: The unique URI of the resource. Stored and indexed but not analyzed.
 * <li>&lt;property_uri&gt;: The text value for each object and datatype property is stored in a multi-valued field
 *                     named by the property URI. Stored, indexed and analyzed. For datatype properties, the value
 *                     is the property value. For object properties, the value is the label of the referenced resource.
 * <li>&lt;property_uri&gt;_uri: For object properties, this holds the resource URI in a multi-valued field.
 *                     Stored and indexed but not analyzed.
 * <li>pref_text: Multi-valued field that holds the lexical form of all preferred datatype properties.
 *          A boost is given to this field. Stored, indexed and analyzed. 
 * <li>text: Multi-valued field that holds the lexical form of non-preferred datatype properties. Stored, indexed and analyzed.
 * <li>indirect_text: Multi-valued field that holds the lexical form of the preferred
 *          datatype properties of resources related by object properties.
 *          Stored, indexed and analyzed.
 * <li>resourceFlag: Field that holds a boolean value indicating whether the resource is an eagle-i resource
 *           (i.e. subclass of a top-level resource) or an instance of a non-resource class (e.g. technique). 
 *           Stored and indexed but not analyzed.
 * <li>inferredType: Multi-valued field that holds the URIs of all inferred classes. 
 *           Stored and indexed but not analyzed.
 * <li>related: Multi-valued field that holds the URIs of all object properties. 
 *              Stored and indexed but not analyzed.
 * <li>institution_uri: Holds the URI of the institution. Stored and indexed but not analyzed.
 * <li>institution_label: Holds the name of the institution. Stored, indexed and analyzed.
 * </ul>
 * 
 * @author frost
 */
public class LuceneSearchProviderIndexer {

    private static final Log logger = LogFactory.getLog(LuceneSearchProviderIndexer.class);
    private static final boolean DEBUG = logger.isDebugEnabled();

    /**
     * Flag that controls whether the preferred labels of object properties are indexed/updated
     */
    protected static final boolean INDEX_OBJECT_PROP_LABELS = true;

    // Lucene field names (see class docs above for details)

    public static final String URI = "uri";
    public static final String PREF_TEXT = "pref_text";
    public static final String TEXT = "text";
    public static final String OBJECT_URI_POSTFIX = "_uri";
    public static final String INDIRECT_TEXT = "indirect_text";
    public static final String RESOURCE_FLAG = "resourceFlag";
    public static final String INFERRED_TYPE = "inferredType";
    public static final String RELATED = "related";
    public static final String INSTITUTION_URI = "institution_uri";
    public static final String INSTITUTION_LABEL = "institution_label";

    // Field boost values

    public static final float LOW_BOOST = 0.5f;
    public static final float STANDARD_BOOST = 1.0f;
    public static final float MEDIUM_BOOST = 2.0f;
    public static final float HIGH_BOOST = 5.0f;

    // URIs of the properties that are treated as preferred labels
    private final List<EIURI> prefLabelProperties = new ArrayList<EIURI>();

    private final EIOntModel eagleiOntModel;
    private final IndexWriter iwriter;

    /**
     * Creates the LuceneSearchProviderIndexer.
     *
     * @param eagleiOntModel Reference to the eagle-i ontology. It is cast to
     *                       {@link JenaEIOntModel} to look up the preferred label properties.
     * @param analyzer The Lucene analyzer that is used for indexing and searching.
     * @param directory The directory that holds the index.
     *
     * @throws IOException Thrown if an error is encountered opening the index writer.
     */
    public LuceneSearchProviderIndexer(final EIOntModel eagleiOntModel, final Analyzer analyzer, final Directory directory) throws IOException {
        this.eagleiOntModel = eagleiOntModel;
        // Load the ontology metadata BEFORE opening the IndexWriter: if the lookup
        // fails (e.g. the model is not a JenaEIOntModel) we must not leave an open
        // writer, and the index write lock it holds, behind.
        retrieveOntologyMetadata();
        this.iwriter = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    }

    /*
     * Caches the URIs of the ontology's preferred label properties so that
     * they can be reused while indexing.
     */
    private void retrieveOntologyMetadata() {
        final List<Property> props = ((JenaEIOntModel) eagleiOntModel).getPrefLabelProperties();
        for (Property prop : props) {
            this.prefLabelProperties.add(EIURI.create(prop.getURI()));
        }
    }

    /**
     * Retrieves the IndexWriter used by this indexer.
     * @return The underlying IndexWriter.
     */
    public IndexWriter getIndexWriter() {
        return this.iwriter;
    }

    /**
     * Optimizes the index and then commits any pending changes.
     * @throws IOException Thrown if the optimize or commit fails.
     */
    public void commit() throws IOException {
        iwriter.optimize();
        iwriter.commit();
    }

    /**
     * Gets the EIURIs of all documents that reference the specified resource via an
     * object property (i.e. whose "related" field contains the URI).
     *
     * @param uri URI of the resource whose referencing documents are being retrieved.
     * @return URIs of the referencing documents; empty if there are none.
     * @throws IOException Thrown if the index cannot be searched.
     */
    public List<EIURI> getRelatedDocuments(final EIURI uri) throws IOException {
        // create a query
        final BooleanQuery query = new BooleanQuery();
        final PhraseQuery propQuery = new PhraseQuery();
        propQuery.add(new Term(RELATED, uri.toString()));
        query.add(propQuery, BooleanClause.Occur.MUST);

        final List<EIURI> uris = new ArrayList<EIURI>();

        // collector that grabs the URIs of all retrieved Documents
        final Collector collector = new Collector() {
            private IndexReader reader;

            @Override
            public void setNextReader(final IndexReader reader, final int docBase) throws IOException {
                this.reader = reader;
            }

            @Override
            public void collect(final int doc) throws IOException {
                // 'doc' is relative to the reader handed to setNextReader, so it
                // must NOT be re-based with docBase when reading from that reader.
                final Document document = this.reader.document(doc);
                uris.add(EIURI.create(document.get(URI)));
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }

            @Override
            public void setScorer(final Scorer scorer) throws IOException {
                // no-op: scores are not needed
            }
        };

        final IndexSearcher searcher = new IndexSearcher(this.iwriter.getDirectory(), true);
        try {
            searcher.setDefaultFieldSortScoring(true, true);
            // execute the search
            searcher.search(query, null, collector);
        } finally {
            // release the reader opened by the searcher
            searcher.close();
        }
        return uris;
    }

    /**
     * Removes the _uri postfix from the document field name.
     * @param fieldWithPostfix Field with the _uri postfix
     * @return Field name without the _uri postfix; the name is returned unchanged
     *         if it does not end with the postfix.
     */
    protected static String stripObjectURIPostfix(final String fieldWithPostfix) {
        assert fieldWithPostfix != null;
        if (!fieldWithPostfix.endsWith(OBJECT_URI_POSTFIX)) {
            return fieldWithPostfix;
        }
        return fieldWithPostfix.substring(0, fieldWithPostfix.length() - OBJECT_URI_POSTFIX.length());
    }

    /**
     * Updates the document with the specified URI to add object property labels.
     * For every object property value stored on the document, the pref_text values
     * of the referenced document are copied into the indirect_text field and into
     * the property-specific label field. Existing indirect_text and label fields
     * are replaced. Does nothing if the document is not found in the index.
     *
     * @param uri URI of the resource whose document is updated.
     * @throws IOException Thrown if the index cannot be read or updated.
     */
    public void addIndirectProperties(final EIURI uri) throws IOException {
        final Document document = getDocumentByURI(uri);
        if (document == null) {
            logger.debug("Failed to find " + uri + " in index");
            return;
        }

        // remove the indirect text field
        document.removeFields(INDIRECT_TEXT);

        // Collect the object property URI -> value URIs. Object properties are
        // multi-valued, so every value must be kept (not just the last one seen).
        final Map<String, List<String>> objectPropURIToValues = new HashMap<String, List<String>>();
        for (Fieldable f : document.getFields()) {
            final String name = f.name();
            final String strValue = f.stringValue();
            // only un-tokenized property fields hold object property URIs
            if (strValue == null || !LuceneSearchProvider.isPropertyField(name) || f.isTokenized()) {
                continue;
            }
            final String objectPropURI = stripObjectURIPostfix(name);
            List<String> values = objectPropURIToValues.get(objectPropURI);
            if (values == null) {
                values = new ArrayList<String>();
                objectPropURIToValues.put(objectPropURI, values);
            }
            values.add(strValue);
        }

        // remove all properties that hold object prop resource text
        for (String propURI : objectPropURIToValues.keySet()) {
            document.removeFields(propURI);
        }

        // update the object prop resource text
        for (Map.Entry<String, List<String>> entry : objectPropURIToValues.entrySet()) {
            final String propURI = entry.getKey();
            for (String objectURI : entry.getValue()) {
                // find the document for the object prop resource in the index
                final Document objectDoc = getDocumentByURI(EIURI.create(objectURI));
                if (objectDoc == null) {
                    continue;
                }
                for (Fieldable prefTextField : objectDoc.getFieldables(PREF_TEXT)) {
                    final String prefText = prefTextField.stringValue();

                    // add the pref text of the referenced resource to the indirect_text field
                    final Field indirectText = new Field(INDIRECT_TEXT, prefText, Field.Store.YES, Field.Index.ANALYZED);
                    indirectText.setBoost(LOW_BOOST);
                    document.add(indirectText);

                    // add the pref text of the referenced resource to a prop-specific field
                    final Field objectPropLabel = new Field(propURI, prefText, Field.Store.YES, Field.Index.ANALYZED);
                    objectPropLabel.setBoost(LOW_BOOST);
                    document.add(objectPropLabel);
                }
            }
        }

        // update the document
        iwriter.updateDocument(new Term(URI, uri.toString()), document);
    }

    /*
     * Retrieves the stored document with the specified URI, or null if the
     * index contains no such document. A searcher is opened on the writer's
     * directory for the lookup and closed again before returning.
     */
    private Document getDocumentByURI(final EIURI uri) throws IOException {
        // create a query
        final PhraseQuery propQuery = new PhraseQuery();
        propQuery.add(new Term(URI, uri.toString()));

        final IndexSearcher searcher = new IndexSearcher(this.iwriter.getDirectory(), true);
        try {
            searcher.setDefaultFieldSortScoring(true, true);

            final TopDocs docs = searcher.search(propQuery, 1);
            if (docs.totalHits == 0) {
                return null;
            }
            final ScoreDoc scoreDoc = docs.scoreDocs[0];
            // the stored fields are loaded here, so the document stays usable
            // after the searcher is closed
            return searcher.doc(scoreDoc.doc);
        } finally {
            searcher.close();
        }
    }

    /*
     * If there is a document with the specified URI, remove it from the index.
     */
    private void deleteDocumentByURI(final EIURI uri) throws IOException {
        final PhraseQuery query = new PhraseQuery();
        query.add(new Term(URI, uri.toString()));
        this.iwriter.deleteDocuments(query);
    }

    /**
     * Checks if this SearchResult represents a deleted resource, i.e. whether
     * its type URI equals {@link EagleIOntConstants#IS_DELETED}.
     * @param result SearchResult to check.
     * @return True if it represents a deleted resource.
     */
    protected static boolean isDeletedSearchResult(final SearchResult result) {
        return result.getType().getURI().toString().equals(EagleIOntConstants.IS_DELETED);
    }

    /**
     * Indexes the specified SearchResult. Any existing document with the same URI
     * is deleted first. Nothing is added back if the result represents a deleted
     * resource or if its type is not a class known to the eagle-i ontology.
     *
     * @param result SearchResult
     * @param materializeTypes True if the types should be materialized.
     * @throws IOException Thrown if an error is encountered indexing the result
     */
    // TODO change to index EIInstances
    public void indexSearchResult(final SearchResult result, final boolean materializeTypes) throws IOException {

        final EIURI uri = result.getEntity().getURI();

        deleteDocumentByURI(uri);

        // if the type of the result is "isDeleted", don't add again
        if (isDeletedSearchResult(result)) {
            return;
        }

        // create a Lucene document for the resource
        final Document doc = new Document();

        // index the URI
        doc.add(new Field(URI, uri.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // add the institution URI and label
        final EIEntity institutionEntity = result.getInstitution();
        doc.add(new Field(INSTITUTION_URI, institutionEntity.getURI().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(INSTITUTION_LABEL, institutionEntity.getLabel(), Field.Store.YES, Field.Index.ANALYZED));

        // make the institution label searchable through the general text field
        doc.add(new Field(TEXT, institutionEntity.getLabel(), Field.Store.YES, Field.Index.ANALYZED));

        final EIEntity typeEntity = result.getType();

        // is this an eagle-i resource?
        final EIClass typeClass = eagleiOntModel.getClass(typeEntity.getURI());
        if (typeClass == null) {
            // typeClass is null here, so report the asserted type entity instead
            logger.error("Resource " + result.getEntity() + " with type " + typeEntity + " is not a valid eagle-i class");
            return;
        }
        doc.add(new Field(RESOURCE_FLAG, String.valueOf(typeClass.isEagleIResource()), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // index the type
        // TODO handle multiple asserted types
        doc.add(new Field(RDF.type.getURI() + OBJECT_URI_POSTFIX, typeEntity.getURI().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

        addTypeFields(doc, typeEntity, materializeTypes);
        addDataTypePropertyFields(doc, result, typeClass);
        addObjectPropertyFields(doc, result, typeClass);

        // add the document to the index
        this.iwriter.addDocument(doc);
    }

    /*
     * Adds an inferredType field for each type of the resource (potentially with
     * materialization) and adds each type's preferred label as text. The label of
     * the directly asserted type goes to pref_text with a high boost.
     */
    private void addTypeFields(final Document doc, final EIEntity typeEntity, final boolean materializeTypes) {
        final String directTypeURI = typeEntity.getURI().toString();
        for (EIClass type : AbstractGenerator.getTypes(eagleiOntModel, typeEntity.getURI(), materializeTypes)) {
            final String typeURI = type.getEntity().getURI().toString();
            doc.add(new Field(INFERRED_TYPE, typeURI, Field.Store.YES, Field.Index.NOT_ANALYZED));

            // add the label to the text field
            final String label = eagleiOntModel.getPreferredLabel(type.getEntity().getURI());
            if (label == null) {
                continue;
            }
            // boost the direct type
            final boolean isDirectType = typeURI.equals(directTypeURI);
            final Field field = new Field(isDirectType ? PREF_TEXT : TEXT, label, Field.Store.YES, Field.Index.ANALYZED);
            field.setBoost(isDirectType ? HIGH_BOOST : STANDARD_BOOST);
            doc.add(field);
        }
    }

    /*
     * Indexes each datatype property value under the property URI and also under
     * pref_text (preferred label properties, high boost) or text (everything else).
     */
    private void addDataTypePropertyFields(final Document doc, final SearchResult result, final EIClass typeClass) {
        for (EIURI propURI : result.getDataTypeProperties()) {
            if (shouldIgnore(typeClass, propURI)) {
                continue;
            }

            // preferred label properties go to the pref_text field with a high boost
            final boolean isPrefLabel = prefLabelProperties.contains(propURI);
            final String fieldName = isPrefLabel ? PREF_TEXT : TEXT;
            final float boost = isPrefLabel ? HIGH_BOOST : STANDARD_BOOST;

            final Set<String> values = result.getDataTypeProperty(propURI);
            for (String value : values) {
                // index the property value using the URI
                doc.add(new Field(propURI.toString(), value, Field.Store.YES, Field.Index.ANALYZED));

                // add literal props to the text field
                final Field field = new Field(fieldName, value, Field.Store.YES, Field.Index.ANALYZED);
                field.setBoost(boost);
                doc.add(field);
            }
        }
    }

    /*
     * Indexes each object property value URI under "<property_uri>_uri" and
     * under the related field.
     */
    private void addObjectPropertyFields(final Document doc, final SearchResult result, final EIClass typeClass) {
        for (EIURI propURI : result.getObjectProperties()) {
            if (shouldIgnore(typeClass, propURI)) {
                continue;
            }
            final Set<EIURI> values = result.getObjectProperty(propURI);
            for (EIURI value : values) {
                // index the property value using the URI
                doc.add(new Field(propURI.toString() + OBJECT_URI_POSTFIX, value.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                // add it to the related field
                // TODO boost high-value properties
                doc.add(new Field(RELATED, value.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
        }
    }

    /*
     * Checks if the specified property URI for the specified class should be ignored.
     * Currently nothing is ignored.
     */
    private boolean shouldIgnore(EIClass typeClass, EIURI uri) {
        // the repository is now excluding these in the /harvest API
        return false;
        // TODO can replace with this when JenaEIOntModel.getProperties() supports annotation props
        /*
        logger.debug("Checking property: " + uri);
        List<EIProperty> props = ((JenaEIOntModel) eagleiOntModel).getProperties(typeClass.getEntity().getURI(), EagleIOntConstants.ADMIN_DATA_GROUP);
        for (EIProperty prop: props) {
            logger.debug("Admin data property: " + prop);
            if (prop.getEntity().getURI().equals(uri)) {
                return true;
            }
        }
        return false;
        */
        //return EagleIOntUtils.isPropertyInGroup(((JenaEIOntModel) eagleiOntModel).getOntModel(), EagleIOntConstants.ADMIN_DATA_GROUP, uri.toString());
    }
}
