# mako/lexer.py
# Copyright 2006-2025 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php

"""provides the Lexer class for parsing template strings into parse trees."""
import codecs
import re

from mako import exceptions
from mako import parsetree
from mako.pygen import adjust_whitespace
  12. _regexp_cache = {}
  13. class Lexer:
  14. def __init__(
  15. self, text, filename=None, input_encoding=None, preprocessor=None
  16. ):
  17. self.text = text
  18. self.filename = filename
  19. self.template = parsetree.TemplateNode(self.filename)
  20. self.matched_lineno = 1
  21. self.matched_charpos = 0
  22. self.lineno = 1
  23. self.match_position = 0
  24. self.tag = []
  25. self.control_line = []
  26. self.ternary_stack = []
  27. self.encoding = input_encoding
  28. if preprocessor is None:
  29. self.preprocessor = []
  30. elif not hasattr(preprocessor, "__iter__"):
  31. self.preprocessor = [preprocessor]
  32. else:
  33. self.preprocessor = preprocessor
  34. @property
  35. def exception_kwargs(self):
  36. return {
  37. "source": self.text,
  38. "lineno": self.matched_lineno,
  39. "pos": self.matched_charpos,
  40. "filename": self.filename,
  41. }
  42. def match(self, regexp, flags=None):
  43. """compile the given regexp, cache the reg, and call match_reg()."""
  44. try:
  45. reg = _regexp_cache[(regexp, flags)]
  46. except KeyError:
  47. reg = re.compile(regexp, flags) if flags else re.compile(regexp)
  48. _regexp_cache[(regexp, flags)] = reg
  49. return self.match_reg(reg)
  50. def match_reg(self, reg):
  51. """match the given regular expression object to the current text
  52. position.
  53. if a match occurs, update the current text and line position.
  54. """
  55. mp = self.match_position
  56. match = reg.match(self.text, self.match_position)
  57. if match:
  58. (start, end) = match.span()
  59. self.match_position = end + 1 if end == start else end
  60. self.matched_lineno = self.lineno
  61. cp = mp - 1
  62. if cp >= 0 and cp < self.textlength:
  63. cp = self.text[: cp + 1].rfind("\n")
  64. self.matched_charpos = mp - cp
  65. self.lineno += self.text[mp : self.match_position].count("\n")
  66. return match
  67. def parse_until_text(self, watch_nesting, *text):
  68. startpos = self.match_position
  69. text_re = r"|".join(text)
  70. brace_level = 0
  71. paren_level = 0
  72. bracket_level = 0
  73. while True:
  74. match = self.match(r"#.*\n")
  75. if match:
  76. continue
  77. match = self.match(
  78. r"(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1", re.S
  79. )
  80. if match:
  81. continue
  82. match = self.match(r"(%s)" % text_re)
  83. if match and not (
  84. watch_nesting
  85. and (brace_level > 0 or paren_level > 0 or bracket_level > 0)
  86. ):
  87. return (
  88. self.text[
  89. startpos : self.match_position - len(match.group(1))
  90. ],
  91. match.group(1),
  92. )
  93. elif not match:
  94. match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
  95. if match:
  96. brace_level += match.group(1).count("{")
  97. brace_level -= match.group(1).count("}")
  98. paren_level += match.group(1).count("(")
  99. paren_level -= match.group(1).count(")")
  100. bracket_level += match.group(1).count("[")
  101. bracket_level -= match.group(1).count("]")
  102. continue
  103. raise exceptions.SyntaxException(
  104. "Expected: %s" % ",".join(text), **self.exception_kwargs
  105. )
  106. def append_node(self, nodecls, *args, **kwargs):
  107. kwargs.setdefault("source", self.text)
  108. kwargs.setdefault("lineno", self.matched_lineno)
  109. kwargs.setdefault("pos", self.matched_charpos)
  110. kwargs["filename"] = self.filename
  111. node = nodecls(*args, **kwargs)
  112. if len(self.tag):
  113. self.tag[-1].nodes.append(node)
  114. else:
  115. self.template.nodes.append(node)
  116. # build a set of child nodes for the control line
  117. # (used for loop variable detection)
  118. # also build a set of child nodes on ternary control lines
  119. # (used for determining if a pass needs to be auto-inserted
  120. if self.control_line:
  121. control_frame = self.control_line[-1]
  122. control_frame.nodes.append(node)
  123. if (
  124. not (
  125. isinstance(node, parsetree.ControlLine)
  126. and control_frame.is_ternary(node.keyword)
  127. )
  128. and self.ternary_stack
  129. and self.ternary_stack[-1]
  130. ):
  131. self.ternary_stack[-1][-1].nodes.append(node)
  132. if isinstance(node, parsetree.Tag):
  133. if len(self.tag):
  134. node.parent = self.tag[-1]
  135. self.tag.append(node)
  136. elif isinstance(node, parsetree.ControlLine):
  137. if node.isend:
  138. self.control_line.pop()
  139. self.ternary_stack.pop()
  140. elif node.is_primary:
  141. self.control_line.append(node)
  142. self.ternary_stack.append([])
  143. elif self.control_line and self.control_line[-1].is_ternary(
  144. node.keyword
  145. ):
  146. self.ternary_stack[-1].append(node)
  147. elif self.control_line and not self.control_line[-1].is_ternary(
  148. node.keyword
  149. ):
  150. raise exceptions.SyntaxException(
  151. "Keyword '%s' not a legal ternary for keyword '%s'"
  152. % (node.keyword, self.control_line[-1].keyword),
  153. **self.exception_kwargs,
  154. )
  155. _coding_re = re.compile(r"#.*coding[:=]\s*([-\w.]+).*\r?\n")
  156. def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
  157. """given string/unicode or bytes/string, determine encoding
  158. from magic encoding comment, return body as unicode
  159. or raw if decode_raw=False
  160. """
  161. if isinstance(text, str):
  162. m = self._coding_re.match(text)
  163. encoding = m and m.group(1) or known_encoding or "utf-8"
  164. return encoding, text
  165. if text.startswith(codecs.BOM_UTF8):
  166. text = text[len(codecs.BOM_UTF8) :]
  167. parsed_encoding = "utf-8"
  168. m = self._coding_re.match(text.decode("utf-8", "ignore"))
  169. if m is not None and m.group(1) != "utf-8":
  170. raise exceptions.CompileException(
  171. "Found utf-8 BOM in file, with conflicting "
  172. "magic encoding comment of '%s'" % m.group(1),
  173. text.decode("utf-8", "ignore"),
  174. 0,
  175. 0,
  176. filename,
  177. )
  178. else:
  179. m = self._coding_re.match(text.decode("utf-8", "ignore"))
  180. parsed_encoding = m.group(1) if m else known_encoding or "utf-8"
  181. if decode_raw:
  182. try:
  183. text = text.decode(parsed_encoding)
  184. except UnicodeDecodeError:
  185. raise exceptions.CompileException(
  186. "Unicode decode operation of encoding '%s' failed"
  187. % parsed_encoding,
  188. text.decode("utf-8", "ignore"),
  189. 0,
  190. 0,
  191. filename,
  192. )
  193. return parsed_encoding, text
  194. def parse(self):
  195. self.encoding, self.text = self.decode_raw_stream(
  196. self.text, True, self.encoding, self.filename
  197. )
  198. for preproc in self.preprocessor:
  199. self.text = preproc(self.text)
  200. # push the match marker past the
  201. # encoding comment.
  202. self.match_reg(self._coding_re)
  203. self.textlength = len(self.text)
  204. while True:
  205. if self.match_position > self.textlength:
  206. break
  207. if self.match_end():
  208. break
  209. if self.match_expression():
  210. continue
  211. if self.match_control_line():
  212. continue
  213. if self.match_comment():
  214. continue
  215. if self.match_tag_start():
  216. continue
  217. if self.match_tag_end():
  218. continue
  219. if self.match_python_block():
  220. continue
  221. if self.match_percent():
  222. continue
  223. if self.match_text():
  224. continue
  225. if self.match_position > self.textlength:
  226. break
  227. # TODO: no coverage here
  228. raise exceptions.MakoException("assertion failed")
  229. if len(self.tag):
  230. raise exceptions.SyntaxException(
  231. "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
  232. **self.exception_kwargs,
  233. )
  234. if len(self.control_line):
  235. raise exceptions.SyntaxException(
  236. "Unterminated control keyword: '%s'"
  237. % self.control_line[-1].keyword,
  238. self.text,
  239. self.control_line[-1].lineno,
  240. self.control_line[-1].pos,
  241. self.filename,
  242. )
  243. return self.template
  244. def match_tag_start(self):
  245. reg = r"""
  246. \<% # opening tag
  247. ([\w\.\:]+) # keyword
  248. ((?:\s+\w+|\s*=\s*|"[^"]*?"|'[^']*?'|\s*,\s*)*) # attrname, = \
  249. # sign, string expression
  250. # comma is for backwards compat
  251. # identified in #366
  252. \s* # more whitespace
  253. (/)?> # closing
  254. """
  255. match = self.match(
  256. reg,
  257. re.I | re.S | re.X,
  258. )
  259. if not match:
  260. return False
  261. keyword, attr, isend = match.groups()
  262. self.keyword = keyword
  263. attributes = {}
  264. if attr:
  265. for att in re.findall(
  266. r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr
  267. ):
  268. key, val1, val2 = att
  269. text = val1 or val2
  270. text = text.replace("\r\n", "\n")
  271. attributes[key] = text
  272. self.append_node(parsetree.Tag, keyword, attributes)
  273. if isend:
  274. self.tag.pop()
  275. elif keyword == "text":
  276. match = self.match(r"(.*?)(?=\</%text>)", re.S)
  277. if not match:
  278. raise exceptions.SyntaxException(
  279. "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
  280. **self.exception_kwargs,
  281. )
  282. self.append_node(parsetree.Text, match.group(1))
  283. return self.match_tag_end()
  284. return True
  285. def match_tag_end(self):
  286. match = self.match(r"\</%[\t ]*([^\t ]+?)[\t ]*>")
  287. if match:
  288. if not len(self.tag):
  289. raise exceptions.SyntaxException(
  290. "Closing tag without opening tag: </%%%s>"
  291. % match.group(1),
  292. **self.exception_kwargs,
  293. )
  294. elif self.tag[-1].keyword != match.group(1):
  295. raise exceptions.SyntaxException(
  296. "Closing tag </%%%s> does not match tag: <%%%s>"
  297. % (match.group(1), self.tag[-1].keyword),
  298. **self.exception_kwargs,
  299. )
  300. self.tag.pop()
  301. return True
  302. else:
  303. return False
  304. def match_end(self):
  305. match = self.match(r"\Z", re.S)
  306. if not match:
  307. return False
  308. string = match.group()
  309. if string:
  310. return string
  311. else:
  312. return True
  313. def match_percent(self):
  314. match = self.match(r"(?<=^)(\s*)%%(%*)", re.M)
  315. if match:
  316. self.append_node(
  317. parsetree.Text, match.group(1) + "%" + match.group(2)
  318. )
  319. return True
  320. else:
  321. return False
  322. def match_text(self):
  323. match = self.match(
  324. r"""
  325. (.*?) # anything, followed by:
  326. (
  327. (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
  328. # comment, preceded by a
  329. # consumed newline and whitespace
  330. |
  331. (?=\${) # an expression
  332. |
  333. (?=</?%) # a substitution or block or call start or end
  334. # - don't consume
  335. |
  336. (\\\r?\n) # an escaped newline - throw away
  337. |
  338. \Z # end of string
  339. )""",
  340. re.X | re.S,
  341. )
  342. if match:
  343. text = match.group(1)
  344. if text:
  345. self.append_node(parsetree.Text, text)
  346. return True
  347. else:
  348. return False
  349. def match_python_block(self):
  350. match = self.match(r"<%(!)?")
  351. if match:
  352. line, pos = self.matched_lineno, self.matched_charpos
  353. text, end = self.parse_until_text(False, r"%>")
  354. # the trailing newline helps
  355. # compiler.parse() not complain about indentation
  356. text = adjust_whitespace(text) + "\n"
  357. self.append_node(
  358. parsetree.Code,
  359. text,
  360. match.group(1) == "!",
  361. lineno=line,
  362. pos=pos,
  363. )
  364. return True
  365. else:
  366. return False
  367. def match_expression(self):
  368. match = self.match(r"\${")
  369. if not match:
  370. return False
  371. line, pos = self.matched_lineno, self.matched_charpos
  372. text, end = self.parse_until_text(True, r"\|", r"}")
  373. if end == "|":
  374. escapes, end = self.parse_until_text(True, r"}")
  375. else:
  376. escapes = ""
  377. text = text.replace("\r\n", "\n")
  378. self.append_node(
  379. parsetree.Expression,
  380. text,
  381. escapes.strip(),
  382. lineno=line,
  383. pos=pos,
  384. )
  385. return True
  386. def match_control_line(self):
  387. match = self.match(
  388. r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\\r?\n)|[^\r\n])*)"
  389. r"(?:\r?\n|\Z)",
  390. re.M,
  391. )
  392. if not match:
  393. return False
  394. operator = match.group(1)
  395. text = match.group(2)
  396. if operator == "%":
  397. m2 = re.match(r"(end)?(\w+)\s*(.*)", text)
  398. if not m2:
  399. raise exceptions.SyntaxException(
  400. "Invalid control line: '%s'" % text,
  401. **self.exception_kwargs,
  402. )
  403. isend, keyword = m2.group(1, 2)
  404. isend = isend is not None
  405. if isend:
  406. if not len(self.control_line):
  407. raise exceptions.SyntaxException(
  408. "No starting keyword '%s' for '%s'" % (keyword, text),
  409. **self.exception_kwargs,
  410. )
  411. elif self.control_line[-1].keyword != keyword:
  412. raise exceptions.SyntaxException(
  413. "Keyword '%s' doesn't match keyword '%s'"
  414. % (text, self.control_line[-1].keyword),
  415. **self.exception_kwargs,
  416. )
  417. self.append_node(parsetree.ControlLine, keyword, isend, text)
  418. else:
  419. self.append_node(parsetree.Comment, text)
  420. return True
  421. def match_comment(self):
  422. """matches the multiline version of a comment"""
  423. match = self.match(r"<%doc>(.*?)</%doc>", re.S)
  424. if match:
  425. self.append_node(parsetree.Comment, match.group(1))
  426. return True
  427. else:
  428. return False