net.sf.taverna.t2.provenance.lineageservice
Class ProvenanceQuery

java.lang.Object
  extended by net.sf.taverna.t2.provenance.lineageservice.ProvenanceQuery

public abstract class ProvenanceQuery
extends java.lang.Object

Handles all the querying of provenance items in the database layer. Uses standard SQL, so that database-specific subclasses can extend this class to handle all of the db queries.

Author:
Paolo Missier, Ian Dunlop, Stuart Owen

Field Summary
static java.lang.String DATAFLOW_TYPE
           
protected  org.apache.log4j.Logger logger
           
 
Constructor Summary
ProvenanceQuery()
           
 
Method Summary
protected  java.lang.String addOrderByToQuery(java.lang.String q0, java.util.List<java.lang.String> orderAttr, boolean terminate)
           
protected  java.lang.String addWhereClauseToQuery(java.lang.String q0, java.util.Map<java.lang.String,java.lang.String> queryConstraints, boolean terminate)
          implements a set of query constraints of the form var = value into a WHERE clause
 java.sql.Statement execQuery(java.lang.String q)
          pass-through query method
 LineageSQLQuery generateSQL(java.lang.String wfInstance, java.lang.String proc, java.lang.String effectivePath, boolean returnOutputs)
          If effectivePath is not null: query VarBinding using wfInstanceRef = wfInstance, iteration = effectivePath, PNameRef = proc; if input vars is null, then use the output var. This returns the bindings for the set of input vars at the correct iteration. If effectivePath is null: fetch VarBindings for all input vars, without constraint on the iteration.
added outer join with Data
additionally, try querying the collection table first -- if the query succeeds, it means the path is pointing to an internal node in the collection, and we just got the right node.
protected  LineageSQLQuery generateSQL2(java.lang.String wfInstance, java.lang.String proc, java.lang.String var, java.lang.String path, boolean returnInput)
           
 java.util.List<java.lang.String> getAllWFnames()
           
 java.util.ArrayList<Workflow> getAllWorkflowRecords(java.util.ArrayList<java.lang.String> workflowIDList)
           
 java.util.List<Arc> getArcs(java.util.Map<java.lang.String,java.lang.String> queryConstraints)
          selects all Arcs
 java.util.List<java.lang.String> getChildrenOfWorkflow(java.lang.String parentWFName)
           
 java.util.List<Collection> getCollectionsForRun(java.lang.String wfInstanceID)
           
 java.sql.Connection getConnection()
           
 java.util.List<java.lang.String> getContainedProcessors(java.lang.String dataflowName)
          returns the set of all processors that are structurally contained within the wf corresponding to the input dataflow name
 java.lang.String getContainingCollection(LineageQueryResultRecord record)
           
 java.util.List<Workflow> getContainingWorkflowsForProcessor(java.lang.String pname)
           
 java.lang.String getDataValue(java.lang.String valueRef)
           
 java.util.List<Var> getInputVars(java.lang.String pname, java.lang.String wfID, java.lang.String wfInstanceID)
          return the input variables for a given processor and a wfInstanceId
 java.lang.String getLatestRunID()
           
 java.util.List<NestedListNode> getNestedListNodes(java.util.Map<java.lang.String,java.lang.String> constraints)
           
 java.util.List<Var> getOutputVars(java.lang.String pname, java.lang.String wfID, java.lang.String wfInstanceID)
          return the output variables for a given processor and a wfInstanceId
 java.lang.String getParentOfWorkflow(java.lang.String childWFName)
          fetch the parent of childWFName from the Workflow table
 java.util.Map<java.lang.String,java.lang.Integer> getPredecessorsCount(java.lang.String wfInstanceID)
           
 java.util.Map<java.lang.String,java.lang.Integer> getPredecessorsCountOld(java.lang.String wfInstanceID)
          new impl of getProcessorsIncomingLinks which avoids complications due to nesting, and relies on the wfInstanceID rather than the wfnameRef
 java.util.List<ProcBinding> getProcBindings(java.util.Map<java.lang.String,java.lang.String> constraints)
          all ProcBinding records that satisfy the input constraints
 java.util.List<ProvenanceProcessor> getProcessors(java.util.Map<java.lang.String,java.lang.String> constraints)
          generic method to fetch processors subject to additional query constraints
 java.util.Map<java.lang.String,java.util.List<ProvenanceProcessor>> getProcessorsDeep(java.lang.String type, java.lang.String wfnameRef)
          this is similar to getProcessorsShallow(String, String) but it recursively fetches all processors within nested workflows.
 java.util.List<ProvenanceProcessor> getProcessorsForWorkflow(java.lang.String workflowID)
           
 java.util.Map<java.lang.String,java.lang.Integer> getProcessorsIncomingLinks(java.lang.String wfnameRef)
          used in the toposort phase -- propagation of anl() values through the graph
 java.util.List<ProvenanceProcessor> getProcessorsShallow(java.lang.String type, java.lang.String wfnameRef)
          get all processors of a given type within a structure identified by wfnameRef (reference to dataflow).
 java.util.List<WorkflowInstance> getRuns(java.lang.String dataflowID, java.util.Map<java.lang.String,java.lang.String> conditions)
           
 java.util.List<java.lang.String> getSuccProcessors(java.lang.String pName, java.lang.String wfNameRef, java.lang.String wfInstanceId)
           
 java.util.List<Var> getSuccVars(java.lang.String pName, java.lang.String vName, java.lang.String wfInstanceRef)
           
 java.lang.String getTopDataflow(java.lang.String wfInstanceID)
           
 java.lang.String getTopLevelDataflowName(java.lang.String wfInstanceID)
           
 java.lang.String getTopLevelWfName(java.lang.String runID)
           
 java.util.List<VarBinding> getVarBindings(java.util.Map<java.lang.String,java.lang.String> constraints)
          TODO this currently returns the data value as a string, which is incorrect as it is an untyped byte array
 java.util.List<Var> getVars(java.util.Map<java.lang.String,java.lang.String> queryConstraints)
          select Var records that satisfy constraints
 java.util.List<java.lang.String> getVarValues(java.lang.String wfInstance, java.lang.String pname, java.lang.String vname)
           
 Workflow getWfFromDataflowID(java.lang.String dataflowID)
           
 java.lang.String getWfNameForDataflow(java.lang.String dataflowName)
          returns the internal ID of a dataflow given its external name
 java.util.List<java.lang.String> getWfNames(java.lang.String runID)
          returns the names of all workflows (top level + nested) for a given runID
 java.util.List<java.lang.String> getWFNamesByTime()
           
 Workflow getWorkflow(java.lang.String dataflowID)
          returns a Workflow record from the DB given the workflow internal ID
 java.util.List<Workflow> getWorkflowForRun(java.lang.String runID)
          returns the workflows associated to a single runID
 boolean isDataflow(java.lang.String procName)
           
 boolean isRootProcessorOfWorkflow(java.lang.String procName, java.lang.String wfName, java.lang.String wfInstanceId)
           
 boolean isTopLevelDataflow(java.lang.String wfNameID)
           
 java.util.List<LineageSQLQuery> lineageQueryGen(java.lang.String wfInstanceID, java.lang.String proc, java.util.Map<Var,java.lang.String> var2Path, Var outputVar, java.lang.String path, boolean returnOutputs)
          if var2Path is null this generates a trivial query for the current output var and current path
 java.util.Set<DDRecord> queryAllFromValues(java.lang.String wfInstance)
           
 java.util.Set<DDRecord> queryArcsForDD(java.lang.String p, java.lang.String v, java.lang.String val, java.lang.String wfInstance)
           
 java.util.List<DDRecord> queryDD(java.lang.String p, java.lang.String var, java.lang.String value, java.lang.String iteration, java.lang.String wfInstance)
           
 org.jdom.Document recordsToCollection(Dependencies lqr)
          takes an ordered set of records for the same variable with iteration indexes and builds a collection out of it
 Dependencies runCollectionQuery(LineageSQLQuery lq)
           
 java.util.List<Dependencies> runLineageQueries(java.util.List<LineageSQLQuery> lqList, boolean includeDataValue)
           
 Dependencies runLineageQuery(LineageSQLQuery lq, boolean includeDataValue)
          executes one of the lineage queries produced by the graph visit algorithm.
 Dependencies runVBQuery(LineageSQLQuery lq, boolean includeDataValue)
           
 LineageSQLQuery simpleLineageQuery(java.lang.String wfInstance, java.lang.String wfNameRef, java.lang.String pname, java.lang.String vname, java.lang.String iteration)
          simplest possible pinpoint query.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

logger

protected org.apache.log4j.Logger logger

DATAFLOW_TYPE

public static java.lang.String DATAFLOW_TYPE
Constructor Detail

ProvenanceQuery

public ProvenanceQuery()
Method Detail

getConnection

public java.sql.Connection getConnection()
                                  throws java.lang.InstantiationException,
                                         java.lang.IllegalAccessException,
                                         java.lang.ClassNotFoundException,
                                         java.sql.SQLException
Throws:
java.lang.InstantiationException
java.lang.IllegalAccessException
java.lang.ClassNotFoundException
java.sql.SQLException

addWhereClauseToQuery

protected java.lang.String addWhereClauseToQuery(java.lang.String q0,
                                                 java.util.Map<java.lang.String,java.lang.String> queryConstraints,
                                                 boolean terminate)
implements a set of query constraints of the form var = value into a WHERE clause

Parameters:
q0 -
queryConstraints -
Returns:

addOrderByToQuery

protected java.lang.String addOrderByToQuery(java.lang.String q0,
                                             java.util.List<java.lang.String> orderAttr,
                                             boolean terminate)

execQuery

public java.sql.Statement execQuery(java.lang.String q)
                             throws java.lang.InstantiationException,
                                    java.lang.IllegalAccessException,
                                    java.lang.ClassNotFoundException,
                                    java.sql.SQLException
pass-through query method

Parameters:
q - valid JDBC query string for the T2provenance schema
Returns:
the executed Statement if successful, null otherwise
Throws:
java.lang.InstantiationException
java.lang.IllegalAccessException
java.lang.ClassNotFoundException
java.sql.SQLException

getVars

public java.util.List<Var> getVars(java.util.Map<java.lang.String,java.lang.String> queryConstraints)
                            throws java.sql.SQLException
select Var records that satisfy constraints

Throws:
java.sql.SQLException

getVarValues

public java.util.List<java.lang.String> getVarValues(java.lang.String wfInstance,
                                                     java.lang.String pname,
                                                     java.lang.String vname)
                                              throws java.sql.SQLException
Throws:
java.sql.SQLException

getInputVars

public java.util.List<Var> getInputVars(java.lang.String pname,
                                        java.lang.String wfID,
                                        java.lang.String wfInstanceID)
                                 throws java.sql.SQLException
return the input variables for a given processor and a wfInstanceId

Parameters:
pname -
wfInstanceID -
Returns:
list of input variables
Throws:
java.sql.SQLException

getOutputVars

public java.util.List<Var> getOutputVars(java.lang.String pname,
                                         java.lang.String wfID,
                                         java.lang.String wfInstanceID)
                                  throws java.sql.SQLException
return the output variables for a given processor and a wfInstanceId

Parameters:
pname -
wfInstanceID -
Returns:
list of output variables
Throws:
java.sql.SQLException

getArcs

public java.util.List<Arc> getArcs(java.util.Map<java.lang.String,java.lang.String> queryConstraints)
                            throws java.sql.SQLException
selects all Arcs

Parameters:
queryConstraints -
Returns:
Throws:
java.sql.SQLException

getTopLevelWfName

public java.lang.String getTopLevelWfName(java.lang.String runID)
                                   throws java.sql.SQLException
Throws:
java.sql.SQLException

getWfNames

public java.util.List<java.lang.String> getWfNames(java.lang.String runID)
                                            throws java.sql.SQLException
returns the names of all workflows (top level + nested) for a given runID

Parameters:
runID -
Returns:
Throws:
java.sql.SQLException

getWorkflowForRun

public java.util.List<Workflow> getWorkflowForRun(java.lang.String runID)
                                           throws java.sql.SQLException
returns the workflows associated to a single runID

Parameters:
runID -
Returns:
Throws:
java.sql.SQLException

getLatestRunID

public java.lang.String getLatestRunID()
                                throws java.sql.SQLException
Throws:
java.sql.SQLException

getRuns

public java.util.List<WorkflowInstance> getRuns(java.lang.String dataflowID,
                                                java.util.Map<java.lang.String,java.lang.String> conditions)
                                         throws java.sql.SQLException
Parameters:
dataflowID -
conditions - currently only understands "from" and "to" as timestamps for range queries
Returns:
Throws:
java.sql.SQLException

getWFNamesByTime

public java.util.List<java.lang.String> getWFNamesByTime()
                                                  throws java.sql.SQLException
Throws:
java.sql.SQLException

getProcBindings

public java.util.List<ProcBinding> getProcBindings(java.util.Map<java.lang.String,java.lang.String> constraints)
                                            throws java.sql.SQLException
all ProcBinding records that satisfy the input constraints

Parameters:
constraints -
Returns:
Throws:
java.sql.SQLException

getVarBindings

public java.util.List<VarBinding> getVarBindings(java.util.Map<java.lang.String,java.lang.String> constraints)
                                          throws java.sql.SQLException
TODO this currently returns the data value as a string, which is incorrect as it is an untyped byte array

Parameters:
constraints - a Map columnName -> value that defines the query constraints. Note: columnName must be fully qualified. This is not done well at the moment, i.e., PNameRef should be VarBinding.PNameRef to avoid ambiguities
Returns:
Throws:
java.sql.SQLException

getNestedListNodes

public java.util.List<NestedListNode> getNestedListNodes(java.util.Map<java.lang.String,java.lang.String> constraints)
                                                  throws java.sql.SQLException
Throws:
java.sql.SQLException

getPredecessorsCount

public java.util.Map<java.lang.String,java.lang.Integer> getPredecessorsCount(java.lang.String wfInstanceID)

getPredecessorsCountOld

public java.util.Map<java.lang.String,java.lang.Integer> getPredecessorsCountOld(java.lang.String wfInstanceID)
new impl of getProcessorsIncomingLinks which avoids complications due to nesting, and relies on the wfInstanceID rather than the wfnameRef

Parameters:
wfInstanceID -
Returns:

getProcessorsIncomingLinks

public java.util.Map<java.lang.String,java.lang.Integer> getProcessorsIncomingLinks(java.lang.String wfnameRef)
                                                                             throws java.sql.SQLException
used in the toposort phase -- propagation of anl() values through the graph

Parameters:
wfnameRef - reference to static wf name
Returns:
a map: processor name --> number of incoming links for that processor, without counting the arcs from the dataflow input to processors. So a processor is at the root of the graph if it has no incoming links, or all of its incoming links are from dataflow inputs.
Note: this must be checked for processors that are roots of sub-flows... are these counted as top-level root nodes??
Throws:
java.sql.SQLException

getSuccVars

public java.util.List<Var> getSuccVars(java.lang.String pName,
                                       java.lang.String vName,
                                       java.lang.String wfInstanceRef)
                                throws java.sql.SQLException
Throws:
java.sql.SQLException

getSuccProcessors

public java.util.List<java.lang.String> getSuccProcessors(java.lang.String pName,
                                                          java.lang.String wfNameRef,
                                                          java.lang.String wfInstanceId)
                                                   throws java.sql.SQLException
Throws:
java.sql.SQLException

getProcessorsShallow

public java.util.List<ProvenanceProcessor> getProcessorsShallow(java.lang.String type,
                                                                java.lang.String wfnameRef)
                                                         throws java.sql.SQLException
get all processors of a given type within a structure identified by wfnameRef (reference to dataflow). type constraint is ignored if value is null.
this only returns the processor for the input wfNameRef, without going into any nested workflows

Parameters:
wfnameRef -
type -
Returns:
a list, that contains at most one element
Throws:
java.sql.SQLException

getProcessorsDeep

public java.util.Map<java.lang.String,java.util.List<ProvenanceProcessor>> getProcessorsDeep(java.lang.String type,
                                                                                             java.lang.String wfnameRef)
this is similar to getProcessorsShallow(String, String) but it recursively fetches all processors within nested workflows. The result is collected in the form of a map: wfName -> {ProvenanceProcessor}

Parameters:
type -
wfnameRef -
Returns:
a map: wfName -> {ProvenanceProcessor} where wfName is the name of a (possibly nested) workflow, and the values are the processors within that workflow

getDataValue

public java.lang.String getDataValue(java.lang.String valueRef)

getProcessors

public java.util.List<ProvenanceProcessor> getProcessors(java.util.Map<java.lang.String,java.lang.String> constraints)
                                                  throws java.sql.SQLException
generic method to fetch processors subject to additional query constraints

Parameters:
constraints -
Returns:
Throws:
java.sql.SQLException

getProcessorsForWorkflow

public java.util.List<ProvenanceProcessor> getProcessorsForWorkflow(java.lang.String workflowID)

simpleLineageQuery

public LineageSQLQuery simpleLineageQuery(java.lang.String wfInstance,
                                          java.lang.String wfNameRef,
                                          java.lang.String pname,
                                          java.lang.String vname,
                                          java.lang.String iteration)
simplest possible pinpoint query. Uses iteration info straight away. Assumes result is in VarBinding not in Collection

Parameters:
wfInstance -
pname -
vname -
iteration -
Returns:

lineageQueryGen

public java.util.List<LineageSQLQuery> lineageQueryGen(java.lang.String wfInstanceID,
                                                       java.lang.String proc,
                                                       java.util.Map<Var,java.lang.String> var2Path,
                                                       Var outputVar,
                                                       java.lang.String path,
                                                       boolean returnOutputs)
if var2Path is null this generates a trivial query for the current output var and current path

Parameters:
wfInstanceID -
proc -
var2Path -
outputVar -
path -
returnOutputs - returns inputs *and* outputs if set to true
Returns:

generateSQL2

protected LineageSQLQuery generateSQL2(java.lang.String wfInstance,
                                       java.lang.String proc,
                                       java.lang.String var,
                                       java.lang.String path,
                                       boolean returnInput)

generateSQL

public LineageSQLQuery generateSQL(java.lang.String wfInstance,
                                   java.lang.String proc,
                                   java.lang.String effectivePath,
                                   boolean returnOutputs)
If effectivePath is not null: query VarBinding using wfInstanceRef = wfInstance, iteration = effectivePath, PNameRef = proc; if input vars is null, then use the output var. This returns the bindings for the set of input vars at the correct iteration. If effectivePath is null: fetch VarBindings for all input vars, without constraint on the iteration.
added outer join with Data
additionally, try querying the collection table first -- if the query succeeds, it means the path is pointing to an internal node in the collection, and we just got the right node. Otherwise, query VarBinding for the leaves

Parameters:
wfInstance -
proc -
effectivePath -
returnOutputs - returns both inputs and outputs if set to true
Returns:

runCollectionQuery

public Dependencies runCollectionQuery(LineageSQLQuery lq)
                                throws java.sql.SQLException
Throws:
java.sql.SQLException

runVBQuery

public Dependencies runVBQuery(LineageSQLQuery lq,
                               boolean includeDataValue)
                        throws java.sql.SQLException
Parameters:
lq -
includeDataValue - IGNORED. always false
Returns:
Throws:
java.sql.SQLException

runLineageQuery

public Dependencies runLineageQuery(LineageSQLQuery lq,
                                    boolean includeDataValue)
                             throws java.sql.SQLException
executes one of the lineage queries produced by the graph visit algorithm. This first executes the collection query, and then if no result is returned, the varBinding query

Parameters:
lq - a lineage query computed during the graph traversal
includeDataValue - if true, then the referenced value is included in the result. This may only be necessary for testing: the data reference in field value (which is a misleading field name, and actually refers to the data reference) should be sufficient
Returns:
Throws:
java.sql.SQLException

runLineageQueries

public java.util.List<Dependencies> runLineageQueries(java.util.List<LineageSQLQuery> lqList,
                                                      boolean includeDataValue)
                                               throws java.sql.SQLException
Throws:
java.sql.SQLException

recordsToCollection

public org.jdom.Document recordsToCollection(Dependencies lqr)
takes an ordered set of records for the same variable with iteration indexes and builds a collection out of it

Parameters:
lqr -
Returns:
a jdom Document with the collection

getContainedProcessors

public java.util.List<java.lang.String> getContainedProcessors(java.lang.String dataflowName)
returns the set of all processors that are structurally contained within the wf corresponding to the input dataflow name

Parameters:
dataflowName - the name of a processor of type DataFlowActivity
Returns:

getTopLevelDataflowName

public java.lang.String getTopLevelDataflowName(java.lang.String wfInstanceID)

getWfFromDataflowID

public Workflow getWfFromDataflowID(java.lang.String dataflowID)
Parameters:
dataflowID - internal dataflow ID
Returns:
the external name of a dataflow

getWfNameForDataflow

public java.lang.String getWfNameForDataflow(java.lang.String dataflowName)
returns the internal ID of a dataflow given its external name

Parameters:
dataflowName -
instanceID -
Returns:

getChildrenOfWorkflow

public java.util.List<java.lang.String> getChildrenOfWorkflow(java.lang.String parentWFName)
                                                       throws java.sql.SQLException
Throws:
java.sql.SQLException

getParentOfWorkflow

public java.lang.String getParentOfWorkflow(java.lang.String childWFName)
                                     throws java.sql.SQLException
fetch the parent of childWFName from the Workflow table

Parameters:
childWFName -
Returns:
Throws:
java.sql.SQLException

getAllWFnames

public java.util.List<java.lang.String> getAllWFnames()
                                               throws java.sql.SQLException
Throws:
java.sql.SQLException

isDataflow

public boolean isDataflow(java.lang.String procName)
                   throws java.sql.SQLException
Parameters:
procName -
Returns:
true if procName is the external name of a dataflow, false otherwise
Throws:
java.sql.SQLException

isTopLevelDataflow

public boolean isTopLevelDataflow(java.lang.String wfNameID)

getTopDataflow

public java.lang.String getTopDataflow(java.lang.String wfInstanceID)

queryDD

public java.util.List<DDRecord> queryDD(java.lang.String p,
                                        java.lang.String var,
                                        java.lang.String value,
                                        java.lang.String iteration,
                                        java.lang.String wfInstance)
                                 throws java.sql.SQLException
Parameters:
p - pTo processor
var - vTo
value - valTo
Returns:
a set of DDRecord
Throws:
java.sql.SQLException

queryArcsForDD

public java.util.Set<DDRecord> queryArcsForDD(java.lang.String p,
                                              java.lang.String v,
                                              java.lang.String val,
                                              java.lang.String wfInstance)
                                       throws java.sql.SQLException
Throws:
java.sql.SQLException

queryAllFromValues

public java.util.Set<DDRecord> queryAllFromValues(java.lang.String wfInstance)
                                           throws java.sql.SQLException
Throws:
java.sql.SQLException

isRootProcessorOfWorkflow

public boolean isRootProcessorOfWorkflow(java.lang.String procName,
                                         java.lang.String wfName,
                                         java.lang.String wfInstanceId)

getContainingWorkflowsForProcessor

public java.util.List<Workflow> getContainingWorkflowsForProcessor(java.lang.String pname)

getWorkflow

public Workflow getWorkflow(java.lang.String dataflowID)
returns a Workflow record from the DB given the workflow internal ID

Parameters:
dataflowID -
Returns:

getContainingCollection

public java.lang.String getContainingCollection(LineageQueryResultRecord record)
Parameters:
record - a record representing a single value -- possibly within a list hierarchy
Returns:
the URI for topmost containing collection when the input record is within a list hierarchy, or null otherwise

getAllWorkflowRecords

public java.util.ArrayList<Workflow> getAllWorkflowRecords(java.util.ArrayList<java.lang.String> workflowIDList)

getCollectionsForRun

public java.util.List<Collection> getCollectionsForRun(java.lang.String wfInstanceID)