Package RDFClosure :: Package parsers :: Package n3p :: Module n3proc
[hide private]
[frames | no frames]

Source Code for Module RDFClosure.parsers.n3p.n3proc

  1  #!/usr/bin/env python 
  2  """ 
  3  n3proc - An N3 Processor using n3.n3 
  4  Author: Sean B. Palmer, inamidst.com 
  5  Licence: GPL 2; share and enjoy! 
  6  License: http://www.w3.org/Consortium/Legal/copyright-software 
  7  Documentation: http://inamidst.com/n3p/ 
  8   
  9  usage: 
 10     %prog [options] <URI> 
 11  """ 
 12   
 13  from rdflib import URIRef, BNode, Literal, Variable, Namespace 
 14  from rdflib.Graph import QuotedGraph 
 15   
 16  import sys, os.path, re, time, urllib 
 17  import n3p 
 18   
 19  try: from uripath import join as urijoin 
 20  except ImportError: 
 21     print >> sys.stderr, "uripath.py not found" 
 22     from urlparse import urljoin as urijoin 
 23   
# Namespaces for the vocabularies whose terms this module refers to.
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
OWL = Namespace('http://www.w3.org/2002/07/owl#')
LOG = Namespace('http://www.w3.org/2000/10/swap/log#')
XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
N3R = Namespace('http://www.w3.org/2000/10/swap/reify#')

# Captures the lower-case hex digits that directly follow a \u (4 digits) or
# \U (8 digits) escape introducer.
r_unilower = re.compile(r'(?<=\\u)([0-9a-f]{4})|(?<=\\U)([0-9a-f]{8})')
# Matches one control character or one character in 0x7F-0xFF; tab (0x09),
# line feed (0x0A) and carriage return (0x0D) are deliberately left out.
r_hibyte = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\xFF]')
 32   
# One token of unicode-escape output that still needs attention: an escaped
# backslash (matched only so that it is skipped as a unit), a \xNN, \uNNNN or
# \UNNNNNNNN escape with lower-case hex digits, or a bare double quote.
_r_ntescape = re.compile(
    r'\\\\|\\x([0-9a-f]{2})|\\u([0-9a-f]{4})|\\U([0-9a-f]{8})|"')


def quote(s):
    """Return s as ASCII text using the escapes that unquote() understands.

    Backslash, tab, line feed and carriage return become two-character
    escapes, a double quote becomes a backslash followed by the quote, and
    every other character outside printable ASCII becomes \\uXXXX or
    \\UXXXXXXXX with upper-case hex digits.

    s may be text or a UTF-8 encoded byte string. The result is a native
    str.
    """
    try:
        text_type = unicode
    except NameError:  # Python 3 has no separate unicode type
        text_type = str
    if not isinstance(s, text_type):
        s = text_type(s, 'utf-8')

    if len(u'\\'.encode('unicode-escape')) != 2:
        # Guard kept from the original: if the codec does not double
        # backslashes itself, do it by hand so the pass below can rely on it.
        s = s.replace(u'\\', u'\\\\')

    def fix(match):
        hex2, hex4, hex8 = match.groups()
        if hex2:
            # The codec writes \xNN for control and Latin-1 characters;
            # unquote() only reads the \u00NN spelling.
            return '\\u00' + hex2.upper()
        if hex4:
            return '\\u' + hex4.upper()
        if hex8:
            return '\\U' + hex8.upper()
        if match.group(0) == '"':
            return '\\"'
        # An escaped backslash: already correct, and consuming both
        # characters keeps a following "u00ab" from being read as an escape.
        return match.group(0)

    # Escaping happens AFTER the codec has run. Doing it before (as this
    # function used to) let the codec double the freshly inserted
    # backslashes, turning '"' into '\\\\"' and '\\u00XX' into '\\\\u00XX'.
    escaped = s.encode('unicode-escape').decode('ascii')
    return str(_r_ntescape.sub(fix, escaped))
# Escape letter -> the character it stands for, for the single-character
# backslash escapes that unquote() accepts.
quot = {'t': '\t', 'n': '\n', 'r': '\r', '"': '"', '\\': '\\'}

# A backslash followed by one of the escape letters listed in quot.
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX or \UXXXXXXXX; only upper-case hex digits are accepted.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
class ParseError(Exception):
    """Signals input that cannot be interpreted.

    Raised in this module for malformed string escapes or characters in a
    literal, and for a barename used without an @keywords declaration.
    """
53
def unquote(s, triplequoted=False, r_safe=re.compile(
        u'([\\x20\\x21\\x23-\\x5B\\x5D-\\x7E\u00A0-\uFFFF]+)')):
    """Unquote an N-Triples string.

    Derived from: http://inamidst.com/proj/rdf/ntriples.py

    s is the literal's content without its surrounding quotes.
    triplequoted additionally allows a raw line feed or double quote.
    r_safe is the precompiled pattern for runs of characters that are copied
    through unchanged; it is a default argument only so that it is compiled
    once.

    Returns the unescaped text. Raises ParseError for an unknown escape, a
    code point above U+10FFFF, or a character that is not allowed raw.
    """
    try:
        text_type, to_char = unicode, unichr
    except NameError:  # Python 3
        text_type, to_char = str, chr

    if not s:
        return text_type('')

    result = []
    # Walk the string by index instead of re-slicing the remainder after
    # every token, which copied the tail each time (quadratic on long input).
    pos, end = 0, len(s)
    while pos < end:
        m = r_safe.match(s, pos)
        if m:
            pos = m.end()
            result.append(m.group(1))
            continue

        m = r_quot.match(s, pos)
        if m:
            pos = m.end()
            result.append(quot[m.group(1)])
            continue

        m = r_uniquot.match(s, pos)
        if m:
            pos = m.end()
            u, U = m.groups()
            codepoint = int(u or U, 16)
            if codepoint > 0x10FFFF:
                raise ParseError("Disallowed codepoint: %08X" % codepoint)
            try:
                result.append(to_char(codepoint))
            except ValueError:
                # Narrow Python 2 builds reject code points above U+FFFF in
                # unichr(); let the codec build the surrogate pair instead.
                # (Never reached on Python 3, where chr() accepts them.)
                escape = '\\U%08X' % codepoint
                result.append(escape.decode('unicode-escape'))
        elif s.startswith('\\', pos):
            raise ParseError("Illegal escape at: %s..." % s[pos:pos + 10])
        elif triplequoted and (s[pos] in '\n"'):
            result.append(s[pos])
            pos += 1
        else:
            raise ParseError("Illegal literal character: %r" % s[pos])
    return text_type(''.join(result))
# Tables and start symbol taken from the n3p module. N3Processor passes
# branches and regexps to the base parser's constructor and uses start as the
# default argument of its parse() method.
branches = n3p.branches
regexps = n3p.regexps
start = n3p.start
class N3Processor(n3p.N3Parser):
    """N3 parser subclass that assembles parse events into statements.

    onStart, onFinish and onToken look for a method named after the current
    production ("<production>Start", "<production>Finish",
    "<production>Token") and call it when it exists. Those handlers collect
    terms on a few stacks and hand every completed triple to
    sink.statement(subject, predicate, object, formula).

    The sink must provide a ``graph`` attribute (used as the root formula
    and, through ``graph.store`` and ``graph.identifier``, when nested
    formulae are created) and the methods ``start(formula)``,
    ``statement(s, p, o, formula)`` and ``quantify(formula, variable)``.
    """

    # Token kind seen inside a "node" production -> name of the private
    # method handling it. Kinds not listed here are ignored by nodeToken.
    _NODE_HANDLERS = {
        '[': '_openAnonymousNode',
        ']': '_closeAnonymousNode',
        '(': '_openList',
        ')': '_closeList',
        '{': '_openFormula',
        '}': '_closeFormula',
        'numericliteral': '_numericLiteral',
        'variable': '_variable',
        '@this': '_this',
    }

    def __init__(self, uri, sink, baseURI=False):
        """Set up the parser for uri and reset all processing state.

        uri is passed to the base parser together with the module-level
        branches and regexps tables. baseURI is the base for resolving
        relative references; when left at False, uri itself is used.
        """
        super(N3Processor, self).__init__(uri, branches, regexps)
        if baseURI is False:
            self.baseURI = uri
        else:
            self.baseURI = baseURI
        self.sink = sink
        # prefix -> namespace; the empty prefix starts out as "<base>#"
        self.bindings = {'': urijoin(self.baseURI, '#')}
        self.counter = 0           # bumped by univar() for unique labels
        self.prefix = False        # list of [prefix, uri] while in @prefix
        self.userkeys = False      # True once @keywords has been seen
        self.anonsubj = False      # False, True or 'path'; set when '[' opens
        self.litinfo = False       # dict describing the literal in progress
        self.forAll = False        # list of terms while in a universal
        self.forSome = False       # list of terms while in an existential
        self.universals = {}       # term -> (formula, variable)
        self.existentials = {}     # term -> (formula, blank node)
        self.formulae = []         # stack; the last entry is the current scope
        self.labels = []           # one label per open nested formula
        self.mode = []             # stack of 'list' / 'triple' markers
        self.triples = []          # stack of partially built triples
        self.pathmode = 'path'     # 'path' or 'pathtail'
        self.pathcounter = 0       # open path/pathtail depth; see pathStart
        self.paths = []            # stack of paths being collected
        self.lists = []            # stack of item lists for open "(...)"
        self.bnodes = {}           # local name of _:name -> its blank node

    def parse(self, start=start):
        """Run the base parser, by default from the module-level start."""
        super(N3Processor, self).parse(start)

    # -- event dispatch ----------------------------------------------------

    def onStart(self, prod):
        """Push prod and call its "<prod>Start" handler when one exists."""
        self.productions.append(prod)
        handler = prod + 'Start'
        if hasattr(self, handler):
            getattr(self, handler)(prod)

    def onFinish(self):
        """Pop the current production and call its "<prod>Finish" handler."""
        prod = self.productions.pop()
        handler = prod + 'Finish'
        if hasattr(self, handler):
            getattr(self, handler)()

    def onToken(self, prod, tok):
        """Route a token to the "<parent>Token" handler of its production.

        The tokens "true" and "false" inside a symbol production are turned
        into xsd:boolean literals here instead of being dispatched.
        Raises Exception when no production is open.
        """
        if not self.productions:
            raise Exception("Token has no parent production.")
        parentProd = self.productions[-1]
        # This is to handle the true/false...
        if parentProd == "symbol" and tok in ("true", "false"):
            self._appendTerm(Literal(tok, datatype=XSD.boolean))
            return
        handler = parentProd + 'Token'
        if hasattr(self, handler):
            getattr(self, handler)(prod, tok)

    # -- shared helpers ----------------------------------------------------

    def _appendOutsidePath(self, term):
        """Add term to the innermost open list, else to the current triple."""
        if self.mode and self.mode[-1] == 'list':
            self.lists[-1].append(term)
        else:
            self.triples[-1].append(term)

    def _appendTerm(self, term):
        """Add term to the open path if there is one, else list or triple."""
        if self.paths:
            self.paths[-1].append(term)
        else:
            self._appendOutsidePath(term)

    def _flushTriple(self):
        """Emit the current triple once it has three terms; drop its object.

        Subject and predicate stay in place so further objects can follow.
        """
        if self.triples and (len(self.triples[-1]) == 3):
            self.triple(*self.triples[-1])
            self.triples[-1].pop()

    # -- document and declarations -----------------------------------------

    def documentStart(self, prod):
        """Make sink.graph the root formula and announce it to the sink."""
        formula = self.sink.graph
        self.formulae.append(formula)
        self.sink.start(formula)

    def declarationToken(self, prod, tok):
        """Collect the pieces of an @prefix or @keywords declaration."""
        if prod == '@prefix':
            self.prefix = []
        elif prod == '@keywords':
            self.userkeys = True  # bah
        elif (self.prefix is not False) and prod == 'qname':
            self.prefix.append(tok[:-1])    # strip the trailing ':'
        elif prod == 'explicituri':
            self.prefix.append(tok[1:-1])   # strip the enclosing '<' '>'

    def declarationFinish(self):
        """Record the collected prefix binding, if any."""
        if self.prefix:
            self.bindings[self.prefix[0]] = self.prefix[1]
            self.prefix = False

    # -- quantification ----------------------------------------------------

    def universalStart(self, prod):
        """Start collecting the terms of a universal declaration."""
        self.forAll = []

    def universalFinish(self):
        """Map each declared term to a fresh variable in the current formula."""
        for term in self.forAll:
            v = self.univar('var')
            self.universals[term] = (self.formulae[-1], v)
            self.sink.quantify(self.formulae[-1], v)
        self.forAll = False

    def existentialStart(self, prod):
        """Start collecting the terms of an existential declaration."""
        self.forSome = []

    def existentialFinish(self):
        """Map each declared term to a fresh blank node in the current formula."""
        for term in self.forSome:
            b = BNode()
            self.existentials[term] = (self.formulae[-1], b)
            self.sink.quantify(self.formulae[-1], b)
        self.forSome = False

    # -- statements and paths ----------------------------------------------

    def simpleStatementStart(self, prod):
        """Open a new, empty triple."""
        self.triples.append([])

    def simpleStatementFinish(self):
        """Discard the triple opened for this statement."""
        if self.triples:
            self.triples.pop()

    def pathStart(self, prod):
        """Open a new path, or go one level deeper inside a path tail."""
        if (not self.paths) or (self.pathmode == 'path'):
            self.paths.append([])
            self.pathcounter = 1
        else:
            self.pathcounter += 1
        self.pathmode = 'path'

    def pathtailStart(self, prod):
        """Enter a path tail."""
        self.pathcounter += 1
        self.pathmode = 'pathtail'

    def pathtailToken(self, prod, tok):
        """Record a '!' or '^' traversal operator on the current path."""
        if prod == '!':
            self.paths[-1].append('!')
        elif prod == '^':
            self.paths[-1].append('^')

    def pathtailFinish(self):
        """Leave a path tail."""
        self.pathcounter -= 1

    def pathFinish(self):
        """Close a path and pass its resulting term to the list or triple.

        A one-element path simply yields that element. A longer path is laid
        out as term, operator, term, operator, ...; each step emits one
        triple that links the previous node to a fresh blank node, forwards
        for '!' and backwards for '^'. The last blank node is the result.
        """
        self.pathcounter -= 1
        self.pathmode = 'path'
        if not (self.paths and (self.pathcounter < 1)):
            return
        path = self.paths.pop()
        if not path:
            return
        if len(path) == 1:
            self._appendOutsidePath(path.pop())
            return
        # A path traversal
        objt, path = path[0], path[1:]
        for (i, pred) in enumerate(path):
            if (i % 2) != 0:    # odd slots hold terms, even slots operators
                subj = objt
                objt = BNode()
                if path[i - 1] == '!':
                    self.triple(subj, pred, objt)
                elif path[i - 1] == '^':
                    self.triple(objt, pred, subj)
        # @@ nested paths?
        self._appendOutsidePath(objt)

    # -- node tokens -------------------------------------------------------

    def nodeToken(self, prod, tok):
        """Dispatch a token of a node production by its kind.

        Unknown kinds are ignored. The lookup no longer wraps the handler
        call in ``except KeyError``, so a KeyError raised while a handler
        runs is reported instead of being discarded.
        """
        handler = self._NODE_HANDLERS.get(prod)
        if handler is not None:
            getattr(self, handler)(tok)

    def _openAnonymousNode(self, tok):
        """Handle '[': create a blank node and open a triple for it."""
        b = BNode()
        # Record here if it's a subject node
        if self.anonsubj:
            self.anonsubj = False
        if ((not self.triples) or
                (False not in map(lambda s: not len(s), self.triples)) or
                (len(self.triples[-1]) == 3) or
                (len(self.triples) > 1 and
                 len(self.triples[-2]) == 3 and
                 not len(self.triples[-1]))):
            self.anonsubj = True
        if (self.paths and
                self.paths[-1] and
                self.paths[-1][-1] in '!^'):
            self.anonsubj = 'path'

        if self.paths:
            self.paths[-1].append(b)
            self.triples.append([b])
        elif self.mode and self.mode[-1] == 'list':
            self.lists[-1].append(b)
            self.triples.append([b])
        elif len(self.triples[-1]) > 1:
            self.triples.append([b])
        self.mode.append('triple')

    def _closeAnonymousNode(self, tok):
        """Handle ']': close the triple opened by the matching '['."""
        if ((not self.anonsubj) or
                (self.paths and len(self.paths[-1]) == 1)):
            self.triples.pop()
        elif self.anonsubj == 'path':
            self.triples.pop()
            self.triples.append([])
        else:
            self.anonsubj = False
        self.mode.pop()

    def _openList(self, tok):
        """Handle '(': start collecting list items."""
        self.lists.append([])
        self.mode.append('list')

    def _closeList(self, tok):
        """Handle ')': emit the rdf:first / rdf:rest chain for the items.

        The list's head node (rdf:nil for an empty list) becomes the term.
        """
        items = self.lists.pop()
        if items:
            first = head = BNode()
            for (i, item) in enumerate(items):
                if i < len(items) - 1:
                    rest = BNode()
                else:
                    rest = RDF.nil
                self.triple(first, RDF.first, item)
                self.triple(first, RDF.rest, rest)
                first = rest
        else:
            head = RDF.nil
        self.mode.pop()
        self._appendTerm(head)

    def _openFormula(self, tok):
        """Handle '{': create a nested formula and make it the scope."""
        f = self.formula()
        self._appendTerm(f)
        self.formulae.append(f)
        self.labels.append('f' + str(self.counter))

    def _closeFormula(self, tok):
        """Handle '}': return to the enclosing scope and flush the triple."""
        self.formulae.pop()
        self.labels.pop()
        self._flushTriple()

    def _numericLiteral(self, tok):
        """Turn a numeric token into a typed literal.

        A token with an exponent becomes an xsd:double, written as in the
        source. Previously "1e10" raised ValueError in int() and "1.0e10"
        was typed xsd:decimal. A token with a dot becomes an xsd:decimal;
        anything else an xsd:integer.
        """
        if 'e' in tok or 'E' in tok:
            lit = Literal(tok, datatype=XSD.double)
        elif '.' in tok:
            text = str(float(tok))
            if 'e' in text:
                # float() fell back to exponent notation (e.g. 0.00001 ->
                # "1e-05"), which is not a valid xsd:decimal lexical form;
                # keep the digits as written instead.
                text = tok
            lit = Literal(text, datatype=XSD.decimal)
        else:
            lit = Literal(str(int(tok)), datatype=XSD.integer)
        self._appendTerm(lit)

    def _variable(self, tok):
        """Handle a variable token; its first character is dropped."""
        self._appendTerm(self.univar(tok[1:], sic=True))

    def _this(self, tok):
        """Handle '@this': the term is the current formula itself."""
        self._appendTerm(self.formulae[-1])

    # -- literals, symbols and verbs ---------------------------------------

    def literalStart(self, prod):
        """Start collecting the parts of a literal."""
        self.litinfo = {}

    def literalToken(self, prod, tok):
        """Remember the raw string token, quotes included."""
        if prod == 'string':
            self.litinfo['content'] = tok

    def dtlangToken(self, prod, tok):
        """Remember the language code of the literal in progress."""
        if prod == 'langcode':
            self.litinfo['language'] = tok

    def symbolToken(self, prod, tok):
        """Resolve a URI or qname and store it where the context needs it.

        In order of precedence the term is the datatype of a literal in
        progress, a term of a universal or existential declaration, or an
        ordinary term of the current path, list or triple.
        """
        if prod == 'explicituri':
            term = self.uri(tok[1:-1])
        elif prod == 'qname':
            term = self.qname(tok)
        else:
            # No other token kind yields a term; returning here avoids the
            # unbound local the code below would otherwise hit.
            return

        if self.litinfo:
            self.litinfo['datatype'] = term
        elif self.forAll is not False:
            self.forAll.append(term)
        elif self.forSome is not False:
            self.forSome.append(term)
        else:
            self._appendTerm(term)

    def literalFinish(self):
        """Build the literal from the collected parts and append it."""
        content = self.litinfo['content']
        language = self.litinfo.get('language')
        datatype = self.litinfo.get('datatype')

        self._appendTerm(self.literal(content, language, datatype))
        self.litinfo = False

    def objectFinish(self):
        """Emit the current triple if it is complete."""
        self._flushTriple()

    def propertylisttailToken(self, prod, tok):
        """Handle ';': keep only the subject of the current triple."""
        if prod == ';':
            self.triples[-1] = [self.triples[-1][0]]

    def verbToken(self, prod, tok):
        """Handle keyword verbs and the inverting forms '@of' and '<='.

        An inverted verb is stored wrapped in a one-element tuple; triple()
        swaps subject and object when it sees that wrapper.
        """
        vkwords = {'@a': RDF.type, '=': OWL.sameAs,
                   '=>': LOG.implies, '<=': LOG.implies}
        if prod in vkwords:
            self._appendOutsidePath(vkwords[prod])

        if prod in ('@of', '<='):
            # @@ test <= in CWM
            verb = (self.triples[-1][1],)
            self.triples[-1][1] = verb

    # -- term construction and output --------------------------------------

    def triple(self, subj, pred, objt):
        """Send one statement, scoped to the current formula, to the sink.

        A predicate wrapped in a tuple marks an inverted verb: subject and
        object are swapped before the statement is sent.
        """
        scp = self.formulae[-1]
        if not isinstance(pred, tuple):
            self.sink.statement(subj, pred, objt, scp)
        else:
            self.sink.statement(objt, pred[0], subj, scp)

    def qname(self, tok):
        """Resolve a prefixed name or barename to a term.

        "_:name" yields a blank node (the same one for the same name) unless
        the prefix "_" has been bound explicitly. A barename is only allowed
        after @keywords and uses the empty prefix.

        Raises ParseError for a barename without @keywords. An unbound
        prefix is reported on stderr and then raises KeyError.
        """
        if ':' in tok:
            prefix, name = tok.split(':', 1)
        elif self.userkeys:
            prefix, name = '', tok
        else:
            raise ParseError("Set user @keywords to use barenames.")

        if (prefix == '_') and ('_' not in self.bindings):
            if name not in self.bnodes:
                self.bnodes[name] = BNode()
            return self.bnodes[name]

        if prefix not in self.bindings:
            sys.stderr.write("Prefix not bound: %s\n" % prefix)
        return self.uri(self.bindings[prefix] + name)

    def uri(self, tok):
        """Resolve tok against the base URI and apply quantifier mappings.

        If the URI was declared universal or existential in a formula that
        is currently open, the variable or blank node recorded for it is
        returned instead of the URI.
        """
        u = URIRef(urijoin(self.baseURI, tok))
        if u in self.universals:
            formula, var = self.universals[u]
            if formula in self.formulae:
                return var
        if u in self.existentials:  # @@ elif?
            formula, bnode = self.existentials[u]
            if formula in self.formulae:
                return bnode
        return u

    def formula(self):
        """Create a quoted graph with a fresh blank-node identifier.

        The graph shares the store of sink.graph. Should the fresh
        identifier equal that of sink.graph, sink.graph itself is returned.
        """
        formula_id = BNode()
        if formula_id == self.sink.graph.identifier:
            return self.sink.graph
        return QuotedGraph(store=self.sink.graph.store,
                           identifier=formula_id)

    def literal(self, content, language, datatype):
        """Build a Literal from a raw string token.

        content still carries its quotes (one or three on each side) and is
        decoded as UTF-8 before its escapes are resolved by unquote().
        """
        if content.startswith('"""'):
            content = unquote(content[3:-3].decode('utf-8'),
                              triplequoted=True)
        else:
            content = unquote(content[1:-1].decode('utf-8'))
        return Literal(content, language, datatype)

    def univar(self, label, sic=False):
        """Return a Variable named label.

        Unless sic is true, the instance counter is incremented and appended
        to label so that generated names do not repeat.
        """
        if not sic:
            self.counter += 1
            label += str(self.counter)
        return Variable(label)
497 498
class NTriplesSink(object):
    """Sink that writes statements as lines of text to a stream.

    Statements in the root formula are written as "subject predicate object
    ." lines. Statements in any other formula are written as a reified
    description using the N3R vocabulary.

    NOTE(review): N3Processor.documentStart reads ``sink.graph``, which this
    class does not define; confirm how the two are meant to be combined.
    """

    def __init__(self, out=None):
        """Write to out, or to sys.stdout when out is not given."""
        self.out = out or sys.stdout
        self.counter = 0    # not read by any method of this class

    def start(self, root):
        """Remember root as the formula whose statements are written plainly."""
        self.root = root

    def statement(self, s, p, o, f):
        """Write statement (s, p, o) of formula f."""
        if f == self.root:
            self.out.write("%s %s %s .\n" % (s, p, o))
        else:
            self.flatten(s, p, o, f)

    def quantify(self, formula, var):
        """Write the quantification of var, unless formula is the root.

        The quantifier is chosen from the variable's text ('_' prefix:
        existential, '?' prefix: universal). When the text carries neither
        prefix, which used to end in an unbound local variable, the node
        type decides: a BNode is existential, a Variable is universal.

        Raises ValueError when neither rule applies.
        """
        if formula == self.root:
            return
        if var.startswith('_'):
            pred = N3R.existential
        elif var.startswith('?'):
            pred = N3R.universal
        elif isinstance(var, BNode):
            pred = N3R.existential
        elif isinstance(var, Variable):
            pred = N3R.universal
        else:
            raise ValueError("Cannot tell how %r is quantified" % (var,))
        self.out.write("%s %s %s .\n" % (formula, pred, var))

    def makeStatementID(self):
        """Return a fresh blank node naming one reified statement."""
        return BNode()

    def flatten(self, s, p, o, f):
        """Write statement (s, p, o) of formula f in reified form."""
        fs = self.makeStatementID()
        self.out.write("%s %s %s .\n" % (f, N3R.statement, fs))
        self.out.write("%s %s %s .\n" % (fs, N3R.subject, s))
        self.out.write("%s %s %s .\n" % (fs, N3R.predicate, p))
        self.out.write("%s %s %s .\n" % (fs, N3R.object, o))
527
def parse(uri, options):
    """Process the N3 document at uri and write the result via NTriplesSink.

    options must offer the attributes ``baseURI`` and ``root`` (main() passes
    its parsed command-line options). A uri or base URI without a colon is
    taken as a path relative to the current working directory and turned
    into a file:// reference. With options.root set, quantifications and
    statements outside the root formula are not written.
    """
    def as_file_uri(path):
        return 'file://' + os.path.join(os.getcwd(), path)

    sink = NTriplesSink()
    if options.root:
        # Replace the two output routes for non-root content with no-ops.
        sink.quantify = lambda *args: True
        sink.flatten = lambda *args: True

    if ':' not in uri:
        uri = as_file_uri(uri)

    baseURI = options.baseURI
    if baseURI and (':' not in baseURI):
        baseURI = as_file_uri(baseURI)

    processor = N3Processor(uri, sink, baseURI=baseURI)
    processor.parse()
540
def main(argv=None):
    """Command-line entry point.

    Accepts -b/--baseURI URI and -r/--root followed by exactly one document
    URI, which is handed to parse(). With any other number of positional
    arguments the help text is printed instead. argv is passed to optparse
    unchanged, so None means "use the process arguments".
    """
    import optparse

    class MyHelpFormatter(optparse.HelpFormatter):
        """Help layout that prints the usage text as given, without a prefix."""

        def __init__(self):
            optparse.HelpFormatter.__init__(
                self, indent_increment=2, max_help_position=25,
                width=None, short_first=1)

        def format_usage(self, usage):
            return optparse._("%s") % usage.lstrip()

        def format_heading(self, heading):
            return "%*s%s:\n" % (self.current_indent, "", heading)

    parser = optparse.OptionParser(usage=__doc__,
                                   formatter=MyHelpFormatter())
    parser.add_option("-b", "--baseURI", dest="baseURI", default=False,
                      help="set the baseURI", metavar="URI")
    parser.add_option("-r", "--root", dest="root",
                      action="store_true", default=False,
                      help="print triples in the root formula only")
    options, args = parser.parse_args(argv)

    if len(args) != 1:
        parser.print_help()
        return
    parse(args[0], options)


if __name__ == "__main__":
    main()