comparison mercurial/revset.py @ 23842:91dbb98b3513

revset: make tokenize extensible to parse alias declarations and definitions. Before this patch, "tokenize" doesn't recognize a symbol starting with "$" as valid. This prevents revset alias declarations and definitions from being parsed with "tokenize", because "$" may be used as the initial letter of alias arguments. In fact, an alias argument name doesn't require a leading "$". But we have to assume that users may use "$" as the initial letter of argument names in their aliases, because the examples in "hg help revsets" have used such names for a long time. To make "tokenize" extensible enough to parse alias declarations and definitions, this patch introduces the optional arguments "syminitletters" and "symletters". Passing these sets makes it easy to change the policy of what counts as a "valid symbol" during tokenization. This patch keeps the original letter-validity checks for reviewability, even though this involves redundant conversion between "chr" and "ord" when initializing "_syminitletters" and "_symletters". At most 256 checks per initialization is cheap enough compared to revset evaluation itself. This patch is part of the preparation for parsing alias declarations and definitions more strictly.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sat, 10 Jan 2015 23:18:11 +0900
parents 3fb61fcbc4e4
children c4d0c3d05721
comparison
equal deleted inserted replaced
23841:9d25bb84cf6c 23842:91dbb98b3513
127 "end": (0, None, None), 127 "end": (0, None, None),
128 } 128 }
129 129
130 keywords = set(['and', 'or', 'not']) 130 keywords = set(['and', 'or', 'not'])
131 131
132 def tokenize(program, lookup=None): 132 # default set of valid characters for the initial letter of symbols
133 _syminitletters = set(c for c in [chr(i) for i in xrange(256)]
134 if c.isalnum() or c in '._@' or ord(c) > 127)
135
136 # default set of valid characters for non-initial letters of symbols
137 _symletters = set(c for c in [chr(i) for i in xrange(256)]
138 if c.isalnum() or c in '-._/@' or ord(c) > 127)
139
140 def tokenize(program, lookup=None, syminitletters=None, symletters=None):
133 ''' 141 '''
134 Parse a revset statement into a stream of tokens 142 Parse a revset statement into a stream of tokens
143
144 ``syminitletters`` is the set of valid characters for the initial
145 letter of symbols.
146
147 By default, character ``c`` is recognized as valid for initial
148 letter of symbols, if ``c.isalnum() or c in '._@' or ord(c) > 127``.
149
150 ``symletters`` is the set of valid characters for non-initial
151 letters of symbols.
152
153 By default, character ``c`` is recognized as valid for non-initial
154 letters of symbols, if ``c.isalnum() or c in '-._/@' or ord(c) > 127``.
135 155
136 Check that @ is a valid unquoted token character (issue3686): 156 Check that @ is a valid unquoted token character (issue3686):
137 >>> list(tokenize("@::")) 157 >>> list(tokenize("@::"))
138 [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)] 158 [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)]
139 159
140 ''' 160 '''
161 if syminitletters is None:
162 syminitletters = _syminitletters
163 if symletters is None:
164 symletters = _symletters
141 165
142 pos, l = 0, len(program) 166 pos, l = 0, len(program)
143 while pos < l: 167 while pos < l:
144 c = program[pos] 168 c = program[pos]
145 if c.isspace(): # skip inter-token whitespace 169 if c.isspace(): # skip inter-token whitespace
175 break 199 break
176 pos += 1 200 pos += 1
177 else: 201 else:
178 raise error.ParseError(_("unterminated string"), s) 202 raise error.ParseError(_("unterminated string"), s)
179 # gather up a symbol/keyword 203 # gather up a symbol/keyword
180 elif c.isalnum() or c in '._@' or ord(c) > 127: 204 elif c in syminitletters:
181 s = pos 205 s = pos
182 pos += 1 206 pos += 1
183 while pos < l: # find end of symbol 207 while pos < l: # find end of symbol
184 d = program[pos] 208 d = program[pos]
185 if not (d.isalnum() or d in "-._/@" or ord(d) > 127): 209 if d not in symletters:
186 break 210 break
187 if d == '.' and program[pos - 1] == '.': # special case for .. 211 if d == '.' and program[pos - 1] == '.': # special case for ..
188 pos -= 1 212 pos -= 1
189 break 213 break
190 pos += 1 214 pos += 1