#!/d/Bin/Python/python.exe """SPARQL implementation on top of RDFLib The basic class here is supposed to be a superclass of myTripleStore; it has been separated only for a better maintainability $Date: 2005/01/23 11:36:50 $ It essentially implements the core of the SPARQL draft: http://www.w3.org/TR/rdf-sparql-query/ There is a separate description for this in the sparqlDesc.html file The directory contains a test rdf file called Talks.rdf; this file include some testing at the end (following the __name__=='__main__' idiom of python), so one can see how RDQL works in practice. """ ## # Initial implementation of the W3C SPARQL of October 2004. # A more detailed description of the module is also available. #
Ivan Herman ## ########################################################################## from rdflib.Literal import Literal from rdflib.exceptions import Error from rdflib.Store import check_predicate, check_subject, check_object import sys, sets from types import * ########################################################################## # RDQL Utilities _questChar = "?" ## # SPARQL Error Exception (subclass of the RDFLib Exceptions) class SPARQLError(Error) : """Am SPARQL error has been detected""" def __init__(self,msg): Error.__init__(self, ("SPARQL Error: %s." % msg)) def _isResQuest(r) : if r and (type(r) == str or type(r) == unicode) and r[0] == _questChar : return True return False def _unfoldNestedLists(args) : """To unfold nested lists of the sort = [t,[t1,t2],tt,ttt] into [t,t1,tt,ttt] and [t,t2,tt,ttt]. Returns the list of lists """ allBranches = [[]] for arg in args : if type(arg) is tuple : for x in allBranches : x.append(arg) elif type(arg) is list : # All lists in allBranches must be copied as many times as there are tuples in # arg, and the elements of arg must be appended to them newAllBranches = [] for x in allBranches : for z in arg: xz = [ q for q in x ] if type(z) is tuple : xz.append(z) elif type(z) is list : xz = xz + z newAllBranches.append(xz) allBranches = newAllBranches return allBranches def _separateOptions(select,where,optional) : """the 'select' block must be cut into two: to those variables that appear in the 'optiona' clause only and the others. The latter creates a simple SPARQL series, the former the optional""" # just return with the values in case optional is empty. This happens often, so it is worth doing this def getUnbounds(patterns) : # Collect the unbound variables into an array retval = [] def addPat(r) : if r and isinstance(r,basestring) and r[0] == _questChar and (not r in retval) : retval.append(r) for t in patterns : if len(t) == 3 : s,p,o = t elif len(t) == 4 : s,p,o,f = t else : raise SPARQLError("illegal pattern tuple, length should be 3 or 4 (%s)" % t) map(addPat,(s,p,o)) return retval bind_where = getUnbounds(where) if optional == None or len(optional) == 0 : # a bit of error checking should still be done... for s in select : if not s in bind_where : raise SPARQLError("query variable '%s' does not appear in non-optional patterns" % s) return (select,(),{}) bind_opts = getUnbounds(optional) select_w = [] select_o = [] index_w = {} index_o = {} for i in xrange(0,len(select)) : s = select[i] if s in bind_opts : if s in bind_where : index_w[i] = len(select_w) select_w.append(s) else : index_o[i] = len(select_o) select_o.append(s) else : if s in bind_where : index_w[i] = len(select_w) select_w.append(s) else : raise SPARQLError("query variable '%s' does not appear in non-optional patterns" % s) if len(select_w) == 0 and len(select_o) > 0 : raise SPARQLError("no query variable for pattern detected, only in optional") if len(select_o) == 0 and len(optional) != 0 : raise SPARQLError("optional patterns defined but no query variables") # a backward index table must be created that will be used to 'rearrange' the results into # the original order rearrange={} for k in index_w : rearrange[index_w[k]] = k for k in index_o : rearrange[index_o[k]+len(select_w)] = k return (tuple(select_w),tuple(select_o),rearrange) def _createInitialBindings(patterns) : """works through the patterns list by: - taking out all patterns strings, and turning them into Null-s - add the patterns strings to a binding dictionary with a None value - if an object is a string and not a constraint, replacing it with a Literal - each tuple gets a fourth entry, which either None or, if the user has supplied it, a method Returns a tuple with the binding and the final pattern list """ bindings = {} pattns = [] for t in patterns : func = None if len(t) == 3 : (s,p,o) = t elif len(t) == 4 : (s,p,o,func) = t if func != None and type(func) != FunctionType : raise SPARQLError("illegal pattern tuple, expected a function type, got %s" % func) else : raise SPARQLError("illegal pattern tuple, length should be 3 or 4 (%s)" % t) if _isResQuest(s) : bindings[s] = None s_n = s # if _isResQuest(p) : bindings[p] = None p_n = p # The case of an object is a little bit different: a string can be turned into a Literal on the fly... if type(o) == str or type(o) == unicode : if o[0] == _questChar : bindings[o] = None o_n = o else : o_n = Literal(o) else : o_n = o pattns.append((s_n,p_n,o_n,func)) return (bindings,pattns) ########################################################################## ## # A wrapper class around the RDFLib triple store implementing the Sparql utilities. This class is # a superclass of the myTripleStore # class; in other words, the methods are usualy 'seen' through an instance of that class rather than directly. class SPARQL : """A wrapper class around the original triple store, that includes some additional utility methods""" def __init__(self) : pass ## # A shorthand for the creation of a {@link #SPARQL_Results} instance and returning # the result of a {@link #SPARQL_Results.select} right away. Good for most of the usage, # when no more action (clustering, etc) is necessary # @param select tuple with a selection criteria (see {@link #SPARQL.sparqlObject} for details # @param where statement tuples (see {@link #SPARQL.sparqlObject} for details) # @keyparam constraints list of constraint functions (see {@link #SPARQL.sparqlObject} for details) # @keyparam optional list of optional statement tuples (see {@link #SPARQL.sparqlObject} for details) # @defreturn list of query results def sparqlQuery(self,selectU,where,constraints=[],optional=[]) : """Shorthand: calls sparqlObject and returns the results right away. If no further action is needed (clustering, etc), this is the right method to use... For the arguments, see the description of sparqlObject method. The method returns an array of tuple with the search results if select has more than elements; an array of the search results otherwise. """ obj = self.sparqlObject(selectU,where,constraints=constraints,optional=optional) return obj.select() ## # Creation of a {@link #SPARQL_Results} instance # @param select a tuple with the selection criteria. Each entry is a string that begins with a "?". If not, it is ignored. # If the selection is only one single string, then the parameter can also be a single string instead # of a tuple. # @param where an array of statement tuples. Each entry in the tuple is either a string or an RDFLib Identifier (ie, a Literal # or a URIRef). If a string and if the string begins by "?", it is an unbound variable, otherwise a Literal # is created on the fly. An additional tweak: an element in 'where' may be a list of tuples, instead of a tuple. This is # interpreted as an 'or', ie, several queries will be issued by replacing the list with its elements # respectively, and the resulting bindings will be concatenated. # @keyparam constraints a list of functions. All functions will be invoked with a full binding if found. The input is a dictionary for # the binding, the return value should be true or false. # The conditions are 'and'-d, ie, if one returns false, that particular binding is rejected. # @keyparam optional like the where array. The subgraph is optional: ie, if no triple are found for these # subgraph, the search goes on (and the corresponding select entry is set to None) # @defreturn {@link #SPARQL_Results} instance def sparqlObject(self,select,where,constraints=[],optional=[]) : # Be nice to the user, and allow the simple alternative to one element lists or tuples... if type(select) == str : selectA = (select,) elif type(select) == list or type(select) == tuple : selectA = select else : raise SPARQLError("'select' argument must be a string, a list or a tuple" % t) if type(constraints) == FunctionType : constraintsA = (constraints,) elif type(constraints) == list or type(constraints) == tuple : constraintsA = constraints else : raise SPARQLError("'constraints' argument must be a function, a list or a tuple" % t) if type(where) == tuple : whereA = [where] elif type(where) == list : whereA = where else : raise SPARQLError("'where' argument must be a list or a tuple" % t) if optional : if type(optional) == tuple : optionalA = [optional] elif type(optional) == list : optionalA = optional else : raise SPARQLError("'optional' argument must be a list or a tuple" % t) else : optionalA = [] ####################### def handleOnePair(w,o) : """Manage one pair of normal and optional pattern list""" (select_r,select_o,rearrange) = _separateOptions(selectA,w,o) (bindings, statements) = _createInitialBindings(w) top = _SPARQLNode(None,bindings,statements,constraintsA,self,rearrange) top.expand() if len(select_o) > 0 : # there are also optional things to do... (bindings,statements) = _createInitialBindings(o) top.expandOptions(bindings,statements) return SPARQL_Results(select_r,select_o,top,self) # The 'where' array may include 'or' type constraints. These must be separated to # yield separate but 'pure' contraint arrays. The same for the optional part allWhere = _unfoldNestedLists(whereA) allOptional = _unfoldNestedLists(optionalA) # Separate the case if there is no optional request at all retval = None if len(optionalA) == 0 : for w in allWhere : r = handleOnePair(w,[]) if retval == None : retval = r else : retval = retval + r else : for w in allWhere : for o in allOptional : r = handleOnePair(w,o) if retval == None : retval = r else : retval = retval + r return retval ############################################################################################ ## # Result of a SPARQL query. It stores to the top of the tree, and allows some subsequent # inquiries on the expanded tree that might be useful to the user. This class should not be # instantiated by the user, it is done by the {@link #SPARQL.sparqlObject} method. # class SPARQL_Results : def __init__(self,select,select_optional,rdqlnode,triples,parent1=None,parent2=None) : self.top = rdqlnode self.select_required = select self.select_optional = select_optional self.triples = triples self.results = None self.clusterF = None self.clusterB = None self.clusterFB = None self.subgraph = None # if this node is the result of a sum... self.parent1 = parent1 self.parent2 = parent2 ## # Combine the resuls of queries via an operator # @param other another instance of Query results def __add__(self,other) : """This may be useful when several queries are performed and one wants the 'union' of those. Caveat: the triple store must be the same for each argument. This method is used internally only anyway... Efficiency trick (I hope it works): the various additions on subgraphs are not done here; the results are calculated only if really necessary, ie, in a lazy evaluation manner. This is achieved by storing self and the 'other' in the new object """ return SPARQL_Results(self.select_required,self.select_optional,None,self.triples,self,other) ## # Return the result of the query. # @keyparam distinct if True, the identical results are filtered out # @keyparam limit if set to an integer value, the first 'limit' number of results are returned; all of them otherwise # @defreturn list of selection results def select(self,distinct=True,limit=None) : """This just returns the result of the query. Using this is identical to the direct call to myTripleStore.sparql""" def _uniquefyList(lst) : """Return a copy of the list but possible duplicate elements are taken out. Used to post-process the outcome of RDQL""" if len(lst) <= 1 : return lst else : return list(sets.Set(lst)) if self.results == None : if self.parent1 != None and self.parent2 != None : self.results = self.parent1.select() + self.parent2.select() else : self.results = self.top.returnResult(self.select_required,self.select_optional) if (len(self.select_required) + len(self.select_optional)) == 1 : # the result is returned as an array of values instead of an array of one element tuples # however, as a result of the summation, it may also happen that what is in self.results # is an array of non-tuples already, so one has to be a bit careful here # (grrrr... this small facility for the user gives almost unnecessary complications # oh well, I rely on that in my applications already... # (What one doesn't do for the peace of mind of a user :-) def untuplify(t) : if type(t) == tuple : return t[0] else : return t X = [ untuplify(t) for t in self.results ] self.results = X if distinct : retval = _uniquefyList(self.results) else : retval = self.results if limit != None and limit < len(retval) : return retval[0:limit] else : return retval ## # Expand the subgraph based on the bindings. Each binding is recursively put 'back' into # the where clause to get a series of tuples that satisfy the constraints in the query. # The result is a separate triple store containing the subgraph. # @defreturn a new triple store def construct(self) : """Expand the subgraph based on the bindings. Each binding is recursively put 'back' into the where clause to get a series of tuples that satisfy the constraints in the query. The result is a separate triple store containing the subgraph. returns: a new triplestore """ from rdflibUtils import myTripleStore if self.subgraph != None : return self.subgraph elif self.parent1 != None and self.parent2 != None : self.subgraph = self.parent1.construct() + self.parent2.construct() else : self.subgraph = myTripleStore() # to be on the safe side, see if the query has been properly finished self.select() self.top.expandSubgraph(self.subgraph) return self.subgraph ## # Forward clustering, using all the results of the query as seed (when appropriate). It is # based on the usage of the # cluster forward # method of myTripleStore. # @defreturn a new triple store def clusterForward(self) : """Forward clustering, using all the results of the query as seed (when appropriate)""" from rdflibUtils import myTripleStore if self.clusterF == None : self.clusterF = myTripleStore() # to be on the safe side, see if the query has been properly finished self.select() for r in reduce(lambda x,y: list(x) + list(y),self.results,()) : try : check_subject(r) self.triples.clusterForward(r,self.clusterF) except : # no real problem, this is a literal, just forget about it continue return self.clusterF ## # Backward clustering, using all the results of the query as seed (when appropriate). It is # based on the usage of the # cluster backward # method of myTripleStore. # @defreturn a new triple store def clusterBackward(self) : """Backward clustering, using all the results of the query as seed""" from rdflibUtils import myTripleStore if self.clusterB == None : self.clusterB = myTripleStore() # to be on the safe side, see if the query has been properly finished self.select() for r in reduce(lambda x,y: list(x) + list(y),self.results,()) : self.triples.clusterBackward(r,self.clusterB) return self.clusterB ## # Cluster: a combination of {@link #SPARQL_Results.clusterBackward} and # {@link #SPARQL_Results.clusterForward} def cluster(self) : """This is the combination of all forward and backward clusters""" if self.clusterFB == None : self.clusterFB = self.clusterB + self.clusterF return self.clusterFB ## # It is just an interface to the previous methods, somewhat closer to the SPARQL draft, though this will # still change... # @keyparam forward cluster forward yes or no # @keyparam backward cluster backward yes or no def describe(self,forward=True,backward=True) : """It is just an interface to the previous methods, somewhat closer to the SPARQL draft, though this will still change...""" from rdflibUtils import myTripleStore if forward and backward : return self.cluster() elif forward : return self.clusterForward() elif backward : return self.clusterBackward() else : return myTripleStore() class _SPARQLNode: """The SPARQL implementation is based on the creation of a tree, each level for each statement in the 'where' clause of SPARQL. Each node maintains a 'binding' dictionary, with the variable names and either a None if not yet bound, or the binding itself. The method 'expand' tries to make one more step of binding by looking at the next statement: it takes the statement of the current node, binds the variables if there is already a binding, and looks at the triple store for the possibilities. If it finds valid new triplets, that will bind some more variables, and children will be created with the next statement in the 'where' array with a new level of bindings. This is done for each triplet found in the store, thereby branching off the tree. If all variables are already bound but the statement, with the bound variables, is not 'true' (ie, there is no such triple in the store), the node is marked as 'clash' and no more expansion is made; this node will then be thrown away by the parent. If *all* children of a node is a clash, then it is marked as a clash itself. At the end of the process, the leaves of the tree are searched; if a leaf is such that: - all variables are bound - there is no clash then the bindings are returned as possible answers to the query. """ def __init__(self,parent,bindings,statements,constraints,tripleStore,rearrange={}) : """ Parameters: ----------- parent: parent node bindings: a dictionary with the bindinds that are already done statements: array of statements from the 'where' clause. The first element is for the current node, the rest for the children. If empty, then no expansion will occur (this is a leaf) constraints: array of functions, see description in the rdflibUtils class triplStore: the 'owner' triple store rearrange: an index transfer table if there are optional patterns: it indicates how to reorder the resulsts into the order of the original query Instance variables: self.parent, self.children: the usual tree variables self.bindings: copy of the bindings locally self.statement: the current statement self.rest: the rest of the statements (an array) self.clash: intialized to False self.bound: True or False depending on whether all variables are bound in self.binding """ self.tripleStore = tripleStore self.bindings = bindings self.constraints = constraints self.rearrange = rearrange self.optionalTree = None if None in bindings.values() : self.bound = False else : self.bound = True self.clash = False self.parent = parent self.children = [] if len(statements) > 0 : self.statement = statements[0] self.rest = statements[1:] else : self.statement = None self.rest = None def returnResult(self,select,selectOptional = None) : """Collect the result by search the leaves of the the tree. Parameters: ----------- select: the original select array The search result is an array of tuples with the possible bindings of select if select has more than one elements; an array of the bindings if select consists of one element only. """ if len(self.children) > 0 : # reminder: reduce means to take the list argument's elements one by one and apply # the lambda function there. In effect, this will concatenate all the lists into one # single list. # the list comprehension collect all the data into a list allResults = [c.returnResult(select,selectOptional) for c in self.children] # allResults is a list of lists, this has to be flattened: retval = reduce(lambda x,y: x+y, allResults, []) if len(self.rearrange) > 0 : def reorder(t) : lst = [None] * len(t) for k in self.rearrange : lst[self.rearrange[k]] = t[k] return tuple(lst) return [ reorder(t) for t in retval ] else : return retval else : retval = [] if self.bound == True and self.clash == False : if len(select) == 1 and len(selectOptional) == 0 : retval = [(self.bindings[select[0]],)] else : result = [self.bindings[a] for a in select] if len(selectOptional) == 0 : retval = [tuple(result)] else : optresults = self.optionalTree.returnResult(selectOptional,()) if len(optresults) == 0 : # the optional part did not produce anything, but the result # must be extended by as many None-s as there are in the returned # results result = result + [None for i in selectOptional] retval = [tuple(result)] else : # the result is a list of tuples. Each tuple must be added to the # current results, and create tuples that way res = tuple(result) retval = [res + t for t in optresults] return retval def expandSubgraph(self,subTriples) : """Expand the subgraph based on the bindings. Each binding is recursively put 'back' into the where clause to get a series of tuples that satisfy the constraints in the query. The result is a separate triple store containing the subgraph. """ def b(r,bind) : if type(r) == str : return bind[r] else : return r if len(self.children) > 0 : # all children return an array of bindings (each element being a dictionary) retval = reduce(lambda x,y: x+y, [x.expandSubgraph(subTriples) for x in self.children],[]) (s,p,o,func) = self.statement for bind in retval : st = (b(s,bind),b(p,bind),b(o,bind)) subTriples.add(st) return retval else : # nothing to do here, just return the bindings if self.bound == True and self.clash == False : return [self.bindings] else : return [] def expand(self) : """The expansion itself. See class comments for details""" # if there are no more statements, that means that the constraints have been fully expanded if self.statement : # decompose the statement into subject, predicate and object # default setting for the search statement # see if subject (resp. predicate and object) is already bound. This # is done by taking over the content of self.dict if not None and replacing # the subject with that binding # the (search_subject,search_predicate,search_object) is then created def bind(r) : if type(r) == str : if self.bindings[r] == None : return None else : return self.bindings[r] else : return r (s,p,o,func) = self.statement search = (bind(s),bind(p),bind(o)) #### # go to the triple store with the search pattern for result in self.tripleStore.triples(search) : # if a user defined constraint has been added, it should be checked now if func != None and func(result[0],result[1],result[2]) == False : # Oops, this result is not acceptable, jump over it! continue # create a copy of the current bindings, by also adding the new ones from result of the search new_bindings = self.bindings.copy() # add the newly found bindings def replace(search_tag,result_tag,statement) : if search_tag == None : new_bindings[statement] = result_tag map(replace,search,result,(s,p,o)) # Recursion starts here: create and expand a new child child = _SPARQLNode(self,new_bindings,self.rest,self.constraints,self.tripleStore) child.expand() # if the child is a clash then no use adding it to the tree, it can be forgotten if self.clash == False : self.children.append(child) if len(self.children) == 0 : # this means that the constraints could not be met at all with this binding!!!! self.clash = True else : # this is if all bindings are done; the conditions are still to be checked if self.bound == True and self.clash == False : for func in self.constraints : if func(self.bindings) == False : self.clash = True break def expandOptions(self,bindings,statements) : """Managing optional statements. These affect leaf nodes only, if they contain 'real' results. A separate RDQL tree is appended to such a node""" def replace(key,resource,tupl) : s,p,o,func = tupl if key == s : s = resource if key == p : p = resource if key == o : o = resource return (s,p,o,func) if len(self.children) == 0 : if self.bound == True and self.clash == False : # see if the optional bindings can be reduced because they are already # bound by this node for key in self.bindings : statements = [ replace(key,self.bindings[key],t) for t in statements ] if key in bindings : del bindings[key] self.optionalTree = _SPARQLNode(None,bindings,statements,self.constraints,self.tripleStore) self.optionalTree.expand() else : for c in self.children : c.expandOptions(bindings,statements)