package org.eaglei.repository.servlet;

import java.io.File;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.TimeZone;
import javax.xml.datatype.XMLGregorianCalendar;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.ServletException;

import org.apache.log4j.Logger;
import org.apache.log4j.LogManager;

import org.openrdf.query.BindingSet;
import org.openrdf.query.impl.DatasetImpl;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.TupleQueryResultHandlerBase;
import org.openrdf.query.TupleQueryResultHandlerException;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.GraphQuery;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.impl.BooleanLiteralImpl;
import org.openrdf.model.BNode;
import org.openrdf.model.URI;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import org.openrdf.OpenRDFException;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.Rio;
import org.openrdf.query.resultio.TupleQueryResultFormat;
import org.openrdf.query.resultio.QueryResultIO;
import org.openrdf.query.TupleQueryResultHandler;
import org.openrdf.query.impl.MapBindingSet;

import org.jdom.Namespace;
import org.jdom.Element;
import org.jdom.Document;
import org.jdom.DocType;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.transform.XSLTransformer;
import org.jdom.transform.XSLTransformException;

import org.eaglei.repository.Access;
import org.eaglei.repository.DataRepository;
import org.eaglei.repository.Formats;
import org.eaglei.repository.NamedGraph;
import org.eaglei.repository.Provenance;
import org.eaglei.repository.View;
import org.eaglei.repository.status.BadRequestException;
import org.eaglei.repository.status.HttpStatusException;
import org.eaglei.repository.status.NotFoundException;
import org.eaglei.repository.status.InternalServerErrorException;
import org.eaglei.repository.util.Utils;
import org.eaglei.repository.util.SPARQL;
import org.eaglei.repository.vocabulary.DATAMODEL;
import org.eaglei.repository.vocabulary.DCTERMS;
import org.eaglei.repository.vocabulary.REPO;

/**
 * Harvest service - retrieves listing of eagle-i resource instances in order
 * to "harvest" metadata to build and maintain external indexes.
 * The optional "from" argument helps the client maintain an index efficiently
 * by reporting ONLY the resource instances which have changed from the
 * indicated timestamp onward, e.g. since the last index update.  The most
 * common case, where nothing has changed since the last update, is
 * heavily optimized to avoid querying the triplestore at all.
 *
 * When given time bounds, /harvest reports not only which resources have
 * been updated, but also on resources which have been deleted or otherwise
 * removed from view (e.g. withdrawn).
 *
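 * Query Args:
 *   format - override content negotiation with this MIME type
 *   view - use this view as source of RDF data; when neither view nor
 *          workspace is given, View.PUBLISHED_RESOURCES is used
 *   workspace - URI of workspace, mutually excl with view
 *   inferred - boolean, include inferred statements in the report;
 *          not allowed when detail=identifier
 *   from - optional start date
 *   until - NOT implemented; supplying it yields a "501 Not Implemented" status
 *   detail=(identifier|full) - default is full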
 *
 * @author Larry Stone
 * @version $Id: $
 * Started June 2010
 */
public class Harvest extends RepositoryServlet
{
    private static final Logger log = LogManager.getLogger(Harvest.class);

    /** Values of the 'detail' arg. */
    public enum DetailArg { identifier, full }

    // URI prefix to indicate a deleted URI; just append the whole URI.
    private static final String DELETED_PREFIX = "info:/deleted#";

    // Names of the result columns, in output order.  A detail=identifier
    // report only carries the first one (see resultColumns()).
    private static final String column[] = { "subject", "predicate", "object"};
    private static final List<String> columnNames = Arrays.asList(column);

    // query to get *all* resources at detail=identifier
    private static final String detailIdNoTimeQuery =
        "SELECT DISTINCT ?subject WHERE \n"+
        "{ ?subject a ?typ}";

    // query to get *all* resources at detail=full
    // all the misery with ?graph is to get the instances on a specific
    // published/workspace graph along with JUST THEIR inferred types.
    // Need to bind: ?graph
    private static final String detailFullNoTimeQuery =
        "SELECT ?subject ?predicate ?object WHERE \n"+
        "{{GRAPH ?graph {?subject a ?typ . GRAPH <"+REPO.NG_INFERRED+"> {?subject ?predicate ?object}}}\n"+
        " UNION {GRAPH ?graph {?subject a ?typ; ?predicate ?object \n"+
        "  OPTIONAL { GRAPH <"+DATAMODEL.GRAPH_NAME+"> { ?predicate <"+DATAMODEL.IN_PROPERTY_GROUP+"> ?pgad \n"+
        "    FILTER(?pgad = <"+DATAMODEL.PROPERTY_GROUP_ADMIN_DATA+">)}}\n"+
        "  FILTER(!(BOUND(?pgad))) }}} ORDER BY ?subject";

    // First portion of queries to get DELETED mod-time-bounded
    // resources at detail=identifier.  See versions below with and without
    // Withdrawn to accommodate workspace=:NG_Withdrawn.
    // Need to bind: ?from
    private static final String deletedFromTimeQueryProlog =
        "SELECT DISTINCT ?subject WHERE \n"+
        "{ GRAPH <"+REPO.NG_METADATA+"> { ?subject <"+DCTERMS.MODIFIED+"> ?mod}\n"+
        " FILTER(?mod >= ?from)\n"+
        " OPTIONAL{ GRAPH ?g {?subject a ?t}} \n";

    // Get both deleted AND withdrawn instances in time-bound query
    // Need to bind: ?from
    private static final String deletedAndWithdrawnFromTimeQuery =
        deletedFromTimeQueryProlog +
        "  FILTER(!bound(?t) || ?g = <"+REPO.NG_WITHDRAWN+">) }\n"+
        "ORDER BY ?subject";

    // Get ONLY deleted instances in time-bound query (NOT withdrawn)
    // Need to bind: ?from
    private static final String deletedNotWithdrawnFromTimeQuery =
        deletedFromTimeQueryProlog +
        "  FILTER(!bound(?t)) }\n"+
        "ORDER BY ?subject";

    // query to get mod-time-bounded resources at detail=identifier
    // Need to bind: ?graph, ?from
    private static final String identifierFromTimeQuery =
        "SELECT DISTINCT ?subject WHERE \n"+
        "{ GRAPH <"+REPO.NG_METADATA+"> { ?subject <"+DCTERMS.MODIFIED+"> ?mod}\n"+
        " FILTER( ?mod >= ?from )"+
        " GRAPH ?graph {?subject a ?type}} ORDER BY ?mod";


    // query to get mod-time-bounded resources at detail=full
    // Need to bind: ?graph, ?from
    private static final String fullFromTimeQuery =
        "SELECT DISTINCT ?subject ?predicate ?object WHERE \n"+
        "{ GRAPH <"+REPO.NG_METADATA+"> { ?subject <"+DCTERMS.MODIFIED+"> ?mod}\n"+
        " FILTER( ?mod >= ?from )"+
        " {{GRAPH <"+REPO.NG_INFERRED+"> {?subject ?predicate ?object}} UNION\n"+
        "  {GRAPH ?graph {?subject a ?type; ?predicate ?object} \n"+
        "   OPTIONAL { GRAPH <"+DATAMODEL.GRAPH_NAME+"> { ?predicate <"+DATAMODEL.IN_PROPERTY_GROUP+"> ?pgad \n"+
        "     FILTER(?pgad = <"+DATAMODEL.PROPERTY_GROUP_ADMIN_DATA+">)}}\n"+
        "   FILTER(!(BOUND(?pgad))) }}} ORDER BY ?subject";

    /**
     * Handles POST exactly like GET.
     *
     * @param request the servlet request carrying the query args
     * @param response the servlet response the report is written to
     * @throws ServletException see {@link #doGet}
     * @throws java.io.IOException see {@link #doGet}
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, java.io.IOException
    {
        doGet(request, response);
    }

    /**
     * Produces the harvest report as a tuple query result document.
     * Validates the query args, sets the Last-Modified header from the
     * repository's last-modified time, and then either short-circuits
     * with an empty result (when 'from' is later than that time) or runs
     * the SPARQL queries selected by the 'from' and 'detail' args.
     *
     * @param request the servlet request carrying the query args
     * @param response the servlet response the report is written to
     * @throws ServletException wrapping any OpenRDFException raised while querying
     * @throws java.io.IOException on failure to read the request or write the response
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, java.io.IOException
    {
        request.setCharacterEncoding("UTF-8");
        String format = request.getParameter("format");
        String rawview = request.getParameter("view");
        String rawws = request.getParameter("workspace");
        boolean inferred = Utils.parseBooleanParameter(request.getParameter("inferred"), "inferred", false, false);
        String rawfrom = request.getParameter("from");
        String rawdetail = request.getParameter("detail");

        // sanity check - 'until' not impl. yet
        if (request.getParameter("until") != null)
            throw new HttpStatusException(HttpServletResponse.SC_NOT_IMPLEMENTED, "The 'until' arg is not implemented yet.");

        // 'detail' is documented as optional with a default of 'full'.
        // NOTE(review): assumes the trailing parseKeywordArg args are
        // (required, defaultValue), like the other Utils parsers used here.
        DetailArg detail = (DetailArg)Utils.parseKeywordArg(DetailArg.class, rawdetail, "detail", false, DetailArg.full);
        if (detail == null)
            detail = DetailArg.full;

        // sanity-check from
        XMLGregorianCalendar from = null;
        if (rawfrom != null)
            from = Utils.parseXMLDate(rawfrom);

        // sanity check - 'inferred' not allowed here
        if (detail == DetailArg.identifier && inferred)
            throw new BadRequestException("The 'inferred' arg is not allowed when detail = identifier.");

        // sanity check: cannot specify view and workspace
        if (rawws != null && rawview != null)
            throw new BadRequestException("The 'view' and 'workspace' arguments are mutually exclusive.  Choose only one.");
        URI workspace = Utils.parseURI(rawws, "workspace", false);

        long startMs = System.currentTimeMillis();
        boolean didQuery = false;
        try {
            // 'out' is final output handler - chosen by 'format' arg
            String mimeType = Formats.negotiateTupleContent(request, format);
            TupleQueryResultFormat tqf = QueryResultIO.getWriterFormatForMIMEType(mimeType);
            if (tqf == null) {
                throw new InternalServerErrorException("Failed to get tuple query format that SHOULD have been available, for mime="+mimeType);
            }
            response.setContentType(Utils.makeContentType(mimeType, "UTF-8"));
            TupleQueryResultHandler out = QueryResultIO.createWriter(tqf, response.getOutputStream());

            // Set last-modified header to repo's last-mod time so
            // client can gauge their next incremental request from this time
            Date lm = DataRepository.getInstance().getLastModified();
            response.addDateHeader("Last-Modified", lm.getTime());

            // Optimization: if nothing has changed since the 'from' time,
            // return an empty document.  It advertises the same columns a
            // non-empty reply to the same request would.
            if (from != null) {
                if (lm.before(from.toGregorianCalendar().getTime())) {
                    log.debug("Optimizing result since last-modified mark is earlier than from: last-mod = "+lm);
                    out.startQueryResult(resultColumns(detail));
                    out.endQueryResult();
                    return;
                } else
                    log.debug("Going ahead with query, last-modified mark is after 'from': last-mod = "+lm);
            }

            RepositoryConnection rc = WithRepositoryConnection.get(request);
            DatasetImpl resDS = buildResourceDataset(request, workspace, rawview);

            didQuery = true;
            if (from == null)
                harvestAll(rc, resDS, detail, inferred, out);
            else
                harvestSince(request, rc, resDS, workspace, from, detail, inferred, out);
        } catch (OpenRDFException e) {
            // pass the exception as the throwable so the stack trace is logged
            log.error("Failed while executing /harvest query: "+e, e);
            throw new ServletException(e);
        } finally {
            // XXX Maybe make this log.debug if polling bloats logs..
            // but it shouldn't, since most /harvest calls with from=X
            // will be optimized out without a SPARQL query and it is
            // valuable to have this in the log to watch for excessive
            // query overhead.
            if (didQuery)
                log.info("SPARQL query for /harvest request completed in "+
                         String.format("%,d mSec.", System.currentTimeMillis()-startMs));
        }
    }

    // Result columns reported for the given detail level: just "subject"
    // for detail=identifier, all three columns otherwise.
    private static List<String> resultColumns(DetailArg detail)
    {
        return detail == DetailArg.identifier ?
                   columnNames.subList(0,1) : columnNames;
    }

    // Build the dataset of resource graphs to report on: start from the
    // graphs of the workspace or view (published resources by default) and
    // keep only those whose named graph type is published or workspace.
    private static DatasetImpl buildResourceDataset(HttpServletRequest request, URI workspace, String rawview)
        throws ServletException, java.io.IOException, OpenRDFException
    {
        DatasetImpl ds = new DatasetImpl();
        if (workspace != null) {
            View.addWorkspaceGraphs(request, ds, workspace);
        } else {
            View view = (rawview == null) ? View.PUBLISHED_RESOURCES :
                                            View.parseView(rawview);
            if (view == null)
                throw new BadRequestException("Unknown view: "+rawview);
            View.addGraphs(request, ds, view);
        }
        //  Use prettyPrint because ds.toString() breaks when a default graph is null
        //  which is possible for the 'null' view.  don't ask, it's ugly.
        if (log.isDebugEnabled())
            log.debug("Dataset derived from initial 'view' or 'workspace' args = "+Utils.prettyPrint(ds));

        // Now, further filter that dataset down to just Published and Workspace graphs
        DatasetImpl resDS = new DatasetImpl();
        for (URI g : ds.getDefaultGraphs()) {
            // XXX KLUDGE: skip NG_Users graph, we don't want repo-users.
            //  unfortunately, it comes with Workspace datasets too.
            if (REPO.NG_USERS.equals(g))
                continue;
            NamedGraph ng = NamedGraph.find(request, g);
            NamedGraph.Type ngt = ng == null ? null : ng.getType();
            if (ngt == NamedGraph.Type.published || ngt == NamedGraph.Type.workspace)
                SPARQL.addGraph(resDS, g);
        }
        return resDS;
    }

    // No time bounds: report *all* resource identifiers (detail=identifier)
    // or all of their statements (detail=full) from the resource dataset.
    private static void harvestAll(RepositoryConnection rc, DatasetImpl resDS,
                                   DetailArg detail, boolean inferred,
                                   TupleQueryResultHandler out)
        throws ServletException, java.io.IOException, OpenRDFException
    {
        if (log.isDebugEnabled())
            log.debug("Resource Dataset = "+Utils.prettyPrint(resDS));

        if (detail == DetailArg.identifier) {
            if (log.isDebugEnabled())
                log.debug("HARVEST QUERY NO TIME (detail="+detail+") = \n"+detailIdNoTimeQuery);
            TupleQuery q = rc.prepareTupleQuery(QueryLanguage.SPARQL,  detailIdNoTimeQuery);
            q.setDataset(resDS);
            q.setIncludeInferred(false);
            // single query, so the output handler gets its start/end calls
            // from the evaluation itself
            q.evaluate(out);
            return;
        }

        // 'full' pattern is more complex, since we have to query
        // each graph of interest separately; get the graphs
        // from resource dataset (BEFORE adding inferred)
        Set<URI> dg = resDS.getDefaultGraphs();
        URI resGraphs[] = dg.toArray(new URI[dg.size()]);
        if (inferred)
            SPARQL.addGraph(resDS, REPO.NG_INFERRED);
        // need datamodel for property test in query..
        SPARQL.addGraph(resDS, DATAMODEL.GRAPH_NAME_URI);
        if (log.isDebugEnabled())
            log.debug("HARVEST QUERY NO TIME (detail="+detail+") = \n"+detailFullNoTimeQuery);
        TupleQuery q = rc.prepareTupleQuery(QueryLanguage.SPARQL, detailFullNoTimeQuery);
        q.setDataset(resDS);
        q.setIncludeInferred(inferred);
        // several evaluations feed one document: start/end it here and
        // hide the per-query start/end calls behind wrappedHandler
        out.startQueryResult(columnNames);
        for (URI graph : resGraphs) {
            if (log.isDebugEnabled())
                log.debug("Getting FULL Resource Instances from graph="+graph);
            q.clearBindings();
            q.setBinding("graph", graph);
            q.evaluate(new wrappedHandler(out));
        }
        out.endQueryResult();
    }

    // Time-bounded report: first the deleted (and, unless the workspace is
    // NG_Withdrawn itself, withdrawn) resources modified since 'from', then
    // the resources in each resource graph modified since 'from'.
    private static void harvestSince(HttpServletRequest request, RepositoryConnection rc,
                                     DatasetImpl resDS, URI workspace,
                                     XMLGregorianCalendar from, DetailArg detail,
                                     boolean inferred, TupleQueryResultHandler out)
        throws ServletException, java.io.IOException, OpenRDFException
    {
        ValueFactory vf = rc.getValueFactory();
        Literal lfrom = vf.createLiteral(from);

        // First, query for URIs of Deleted and Withdrawn resources;
        // need to construct a dataset of *ALL* workspace and published
        // graphs to prevent false-positive delete notifications.
        // XXX NOTE This violates access control but it leaks *less*
        // data this way, should not be a problem.
        DatasetImpl dds = new DatasetImpl();
        for (NamedGraph ng : NamedGraph.findAll(request)) {
            NamedGraph.Type ngt = ng.getType();
            if (log.isDebugEnabled())
                log.debug("for DDS, Trying NamedGraph type="+ngt+", name="+ng.getName());
            if (ngt == NamedGraph.Type.published ||
                ngt == NamedGraph.Type.workspace ||
                ngt == NamedGraph.Type.metadata)
                SPARQL.addGraph(dds, ng.getName());
        }

        // choose query based on whether NG_Withdrawn is the workspace:
        String qs =  REPO.NG_WITHDRAWN.equals(workspace) ?
                     deletedNotWithdrawnFromTimeQuery :
                     deletedAndWithdrawnFromTimeQuery;
        if (log.isDebugEnabled()) {
            log.debug("Resource DELETE Dataset = "+Utils.prettyPrint(dds));
            log.debug("HARVEST QUERY DELETED/WITHDRAWN (detail="+detail+") = \n"+qs);
        }
        TupleQuery q = rc.prepareTupleQuery(QueryLanguage.SPARQL, qs);
        q.setDataset(dds);
        q.setIncludeInferred(false);
        q.setBinding("from", lfrom);
        out.startQueryResult(resultColumns(detail));
        q.evaluate(new deletedHandler(out, detail));

        // Now get the recently-modified resource instances.
        // Requires adding Metadata graph to resDS dataset, but first
        // save the original resDS graphs to iterate:
        Set<URI> dg = resDS.getDefaultGraphs();
        URI resGraphs[] = dg.toArray(new URI[dg.size()]);
        SPARQL.addGraph(resDS, REPO.NG_METADATA);
        if (inferred)
            SPARQL.addGraph(resDS, REPO.NG_INFERRED);
        // need datamodel for property test in query..
        if (detail == DetailArg.full)
            SPARQL.addGraph(resDS, DATAMODEL.GRAPH_NAME_URI);
        q = rc.prepareTupleQuery(QueryLanguage.SPARQL,
              detail == DetailArg.identifier ? identifierFromTimeQuery : fullFromTimeQuery);
        q.setDataset(resDS);
        q.setIncludeInferred(inferred);
        for (URI graph : resGraphs) {
            if (log.isDebugEnabled())
                log.debug("Getting non-deleted Resource Instances mod. since="+from+", from graph="+graph);
            q.clearBindings();
            q.setBinding("from", lfrom);
            q.setBinding("graph", graph);
            q.evaluate(new wrappedHandler(out));
        }
        out.endQueryResult();
    }

    // Transform query results for query that returns DELETED subjects.
    // At detail=identifier each subject is reported as a URI made of
    // DELETED_PREFIX plus the original URI; otherwise it is reported as the
    // statement (subject, REPO.IS_DELETED, true).  Start/end of the result
    // document are NOT forwarded; the caller brackets the document itself.
    private static class deletedHandler extends TupleQueryResultHandlerBase
    {
        private final TupleQueryResultHandler defer;
        private final DetailArg detail;
        // scratch binding set, refilled for every solution; allocated up
        // front so it never depends on startQueryResult() having been called
        private final MapBindingSet bs;

        public deletedHandler(TupleQueryResultHandler d, DetailArg dt)
        {
            super();
            defer = d;
            detail = dt;
            bs = new MapBindingSet(resultColumns(dt).size());
        }

        public void handleSolution(BindingSet nbs)
            throws TupleQueryResultHandlerException
        {
            Value sub = nbs.getValue("subject");
            // blank nodes are not reported; an unbound subject is skipped too
            if (sub == null || sub instanceof BNode)
                return;
            bs.clear();
            if (log.isDebugEnabled())
                log.debug("deletedHandler.handleSolution: Got result subject="+sub);
            if (detail == DetailArg.identifier)
                bs.addBinding(column[0], new URIImpl(DELETED_PREFIX+sub.stringValue()));
            else {
                bs.addBinding(column[0], sub);
                bs.addBinding(column[1], REPO.IS_DELETED);
                bs.addBinding(column[2], BooleanLiteralImpl.TRUE);
            }
            defer.handleSolution(bs);
        }
    }

    // Wrap a tuple handler and skip its start/stop methods, so the results
    // of several queries can be written into one result document.
    private static class wrappedHandler extends TupleQueryResultHandlerBase
    {
        private final TupleQueryResultHandler defer;

        public wrappedHandler(TupleQueryResultHandler d)
        {
            super();
            defer = d;
        }

        public void handleSolution(BindingSet bs)
            throws TupleQueryResultHandlerException
        {
            defer.handleSolution(bs);
        }
    }
}
