Package RDFClosure :: Package parsers :: Package n3p :: Module n3p
[hide private]
[frames | no frames]

Source Code for Module RDFClosure.parsers.n3p.n3p

  1  #!/usr/bin/env python 
  2  """ 
  3  N3P - An N3 Parser using n3.n3 
  4  Author: Sean B. Palmer, inamidst.com 
  5  Licence: GPL 2; share and enjoy! 
  6  License: http://www.w3.org/Consortium/Legal/copyright-software 
  7  Documentation: http://inamidst.com/n3p/ 
  8  Derived from: 
  9     http://www.w3.org/2000/10/swap/grammar/predictiveParser.py 
 10     - predictiveParser.py, Tim Berners-Lee, 2004 
 11  Issues: 
 12     http://lists.w3.org/Archives/Public/public-cwm-bugs/2005Jan/0006 
 13     http://lists.w3.org/Archives/Public/public-cwm-talk/2005JanMar/0015 
 14  """ 
 15   
 16  import sys, os, re, urllib 
 17  import cPickle as pickle 
 18   
 19  try: set() 
 20  except NameError: 
 21     from sets import Set as set 
 22   
 23  try: 
 24     import n3meta 
 25     branches = n3meta.branches 
 26     regexps = n3meta.regexps 
 27  except ImportError: 
 28     for path in sys.path: 
 29        fn = os.path.join(path, 'n3meta.pkl') 
 30        if os.path.isfile(fn): 
 31           f = open(fn, 'rb') 
 32           n3meta = pickle.load(f) 
 33           f.close() 
 34   
 35           branches = n3meta['branches'] 
 36           regexps = n3meta['regexps'] 
 37           break 
 38   
 39  start = 'http://www.w3.org/2000/10/swap/grammar/n3#document' 
 40   
 41  r_whitespace = re.compile(r'[ \t\r\n]*(?:(?:#[^\n]*)?\r?(?:$|\n))?') 
 42  singleCharacterSelectors = "\t\r\n !\"#$%&'()*.,+/;<=>?[\\]^`{|}~" 
 43  r_qname = re.compile(r'([A-Za-z0-9_:]*)') 
 44  r_name = re.compile(r'([A-Za-z0-9_]*)') 
 45  notQNameChars = singleCharacterSelectors + "@" 
 46  notNameChars = notQNameChars + ":" 
 47   
def abbr(prodURI):
    """Return the part of *prodURI* after its last ``#``.

    When the string contains no ``#`` it is returned unchanged.
    """
    # rfind() yields -1 when there is no '#', so the slice starts at 0.
    cut = prodURI.rfind('#') + 1
    return prodURI[cut:]
50
class N3Parser(object):
    """Table-driven predictive parser for N3 text.

    The parser walks ``branches`` (production -> {token: sequence of terms})
    and ``regexps`` (term -> compiled pattern) and reports progress through
    the ``onStart``, ``onFinish`` and ``onToken`` hooks, which by default
    print an indented trace.

    The class uses the Python 2 names ``unicode`` and ``urllib.urlopen``.
    """

    def __init__(self, uri, branches, regexps):
        """Read the input and reset the parser state.

        uri      -- ``'nowhere'`` reads nothing, ``'file:///dev/stdin'``
                    reads standard input, anything else is opened with
                    ``urllib.urlopen``.
        branches -- mapping of production to a ``{token: sequence}`` mapping.
        regexps  -- mapping of term to a compiled regular expression.
        """
        if uri == 'nowhere':
            # ``self.data`` is deliberately left unset here; presumably the
            # caller assigns it before parsing -- TODO confirm.
            pass
        elif uri != 'file:///dev/stdin':
            u = urllib.urlopen(uri)
            try:
                self.data = u.read()
            finally:
                # Release the connection even when read() fails.
                u.close()
        else:
            self.data = sys.stdin.read()
        self.pos = 0
        self.branches = branches
        self.regexps = regexps
        self.keywordMode = False
        self.keywords = set(("a", "is", "of", "this", "has"))
        self.productions = []
        self.memo = {}

    def parse(self, prod):
        """Parse ``self.data`` starting from production *prod*.

        Returns the empty token as soon as a production is opened at end of
        input, or None once every open production has been completed.

        Raises ValueError when the lookahead token has no branch for the
        current production, when a literal term does not match the input, or
        when a pattern term does not match.  Raises KeyError when a
        production is missing from ``self.branches``.
        """
        # Each frame is [production, remaining terms]; None means the
        # production has not been expanded yet.
        todoStack = [[prod, None]]
        while todoStack:
            if todoStack[-1][1] is None:
                todoStack[-1][1] = []
                tok = self.token()
                # Got an opened production
                self.onStart(abbr(todoStack[-1][0]))
                if not tok:
                    return tok  # EOF

                prodBranch = self.branches[todoStack[-1][0]]
                sequence = prodBranch.get(tok, None)
                if sequence is None:
                    msg = "Found %s when expecting a %s . todoStack=%r"
                    args = (tok, todoStack[-1][0], todoStack)
                    raise ValueError(msg % args)
                todoStack[-1][1].extend(sequence)

            while todoStack[-1][1]:
                term = todoStack[-1][1].pop(0)
                if isinstance(term, unicode):
                    # Literal term: compare it with the input text directly.
                    j = self.pos + len(term)
                    word = self.data[self.pos:j]
                    if word == term:
                        self.onToken(term, word)
                        self.pos = j
                    elif '@' + word[:-1] == term:
                        # Keyword written without its leading '@'.
                        self.onToken(term, word[:-1])
                        self.pos = j - 1
                    else:
                        msg = "Found %s; %s expected"
                        args = (self.data[self.pos:self.pos + 10], term)
                        raise ValueError(msg % args)
                elif term not in self.regexps:
                    # Not a pattern term: open it as a nested production.
                    # The new frame's term list is None, which ends this
                    # loop and hands control back to the outer one.
                    todoStack.append([term, None])
                    continue
                else:
                    regexp = self.regexps[term]
                    m = regexp.match(self.data, self.pos)
                    if not m:
                        msg = "Token: %r should match %s"
                        args = (self.data[self.pos:self.pos + 10],
                                regexp.pattern)
                        raise ValueError(msg % args)
                    end = m.end()
                    self.onToken(abbr(term), self.data[self.pos:end])
                    self.pos = end
                # Called for its side effects: skips whitespace and updates
                # the keyword state before the next term is examined.
                self.token()

            # Close every completed production.  The stack is checked first
            # because popping the start production empties it; indexing
            # todoStack[-1] would then raise IndexError.
            while todoStack and todoStack[-1][1] == []:
                todoStack.pop()
                self.onFinish()

    def token(self):
        """Memoizer for getToken."""
        if self.pos in self.memo:
            return self.memo[self.pos]
        result = self.getToken()
        # getToken() may have moved self.pos past whitespace; the result is
        # stored under the position reached after that skip.
        self.memo[self.pos] = result
        return result

    def getToken(self):
        """Skip whitespace and classify the token at the current position.

        Returns ``''`` at end of input, one of ``'=>'``, ``'<=='``[:2] style
        two-character operators (``'=>'``, ``'<='``, ``'^^'``), a single
        selector character, ``'0'`` for text starting with a sign or digit,
        ``'@'`` plus a name for keywords, or ``'a'`` for any other word.
        Only whitespace is consumed; the token text itself is not.
        """
        self.whitespace()
        if self.pos == len(self.data):
            return ''  # EOF!

        ch2 = self.data[self.pos:self.pos + 2]
        for double in ('=>', '<=', '^^'):
            if ch2 == double:
                return double

        ch = self.data[self.pos]
        if ch == '.' and self.keywordMode:
            # A '.' ends the keyword list being collected.
            self.keywordMode = False

        if ch in singleCharacterSelectors + '"':
            return ch
        elif ch in '+-0123456789':
            return '0'

        if ch == '@':
            if self.pos and (self.data[self.pos - 1] == '"'):
                # '@' directly after a closing quote.
                return '@'
            name = r_name.match(self.data, self.pos + 1).group(1)
            if name == 'keywords':
                self.keywords = set()
                self.keywordMode = True
            return '@' + name

        word = r_qname.match(self.data, self.pos).group(1)
        if self.keywordMode:
            self.keywords.add(word)
        elif word in self.keywords:
            if word == 'keywords':
                self.keywords = set()
                self.keywordMode = True
            return '@' + word  # implicit keyword
        return 'a'

    def whitespace(self):
        """Advance ``self.pos`` past whitespace and ``#`` comments."""
        while True:
            end = r_whitespace.match(self.data, self.pos).end()
            if end <= self.pos:
                break
            self.pos = end

    def onStart(self, prod):
        """Hook: production *prod* was opened; print it and push it."""
        print((' ' * len(self.productions)) + prod)
        self.productions.append(prod)

    def onFinish(self):
        """Hook: the innermost production was completed; pop and print it."""
        prod = self.productions.pop()
        print((' ' * len(self.productions)) + '/' + prod)

    def onToken(self, prod, tok):
        """Hook: text *tok* was matched for term *prod*; print both."""
        print('%s%s %s' % (' ' * len(self.productions), prod, tok))
180
def main(argv=None):
    """Parse the document named by the single command-line argument.

    *argv* defaults to ``sys.argv``.  Nothing happens unless it holds exactly
    two items (program name and URI).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        return
    parser = N3Parser(argv[1], branches, regexps)
    parser.parse(start)


if __name__ == "__main__":
    main()