parser.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. import codecs
  2. import re
  3. from typing import (IO, Iterator, Match, NamedTuple, Optional, # noqa:F401
  4. Pattern, Sequence, Tuple)
  5. def make_regex(string: str, extra_flags: int = 0) -> Pattern[str]:
  6. return re.compile(string, re.UNICODE | extra_flags)
  7. _newline = make_regex(r"(\r\n|\n|\r)")
  8. _multiline_whitespace = make_regex(r"\s*", extra_flags=re.MULTILINE)
  9. _whitespace = make_regex(r"[^\S\r\n]*")
  10. _export = make_regex(r"(?:export[^\S\r\n]+)?")
  11. _single_quoted_key = make_regex(r"'([^']+)'")
  12. _unquoted_key = make_regex(r"([^=\#\s]+)")
  13. _equal_sign = make_regex(r"(=[^\S\r\n]*)")
  14. _single_quoted_value = make_regex(r"'((?:\\'|[^'])*)'")
  15. _double_quoted_value = make_regex(r'"((?:\\"|[^"])*)"')
  16. _unquoted_value = make_regex(r"([^\r\n]*)")
  17. _comment = make_regex(r"(?:[^\S\r\n]*#[^\r\n]*)?")
  18. _end_of_line = make_regex(r"[^\S\r\n]*(?:\r\n|\n|\r|$)")
  19. _rest_of_line = make_regex(r"[^\r\n]*(?:\r|\n|\r\n)?")
  20. _double_quote_escapes = make_regex(r"\\[\\'\"abfnrtv]")
  21. _single_quote_escapes = make_regex(r"\\[\\']")
  22. class Original(NamedTuple):
  23. string: str
  24. line: int
  25. class Binding(NamedTuple):
  26. key: Optional[str]
  27. value: Optional[str]
  28. original: Original
  29. error: bool
  30. class Position:
  31. def __init__(self, chars: int, line: int) -> None:
  32. self.chars = chars
  33. self.line = line
  34. @classmethod
  35. def start(cls) -> "Position":
  36. return cls(chars=0, line=1)
  37. def set(self, other: "Position") -> None:
  38. self.chars = other.chars
  39. self.line = other.line
  40. def advance(self, string: str) -> None:
  41. self.chars += len(string)
  42. self.line += len(re.findall(_newline, string))
  43. class Error(Exception):
  44. pass
  45. class Reader:
  46. def __init__(self, stream: IO[str]) -> None:
  47. self.string = stream.read()
  48. self.position = Position.start()
  49. self.mark = Position.start()
  50. def has_next(self) -> bool:
  51. return self.position.chars < len(self.string)
  52. def set_mark(self) -> None:
  53. self.mark.set(self.position)
  54. def get_marked(self) -> Original:
  55. return Original(
  56. string=self.string[self.mark.chars:self.position.chars],
  57. line=self.mark.line,
  58. )
  59. def peek(self, count: int) -> str:
  60. return self.string[self.position.chars:self.position.chars + count]
  61. def read(self, count: int) -> str:
  62. result = self.string[self.position.chars:self.position.chars + count]
  63. if len(result) < count:
  64. raise Error("read: End of string")
  65. self.position.advance(result)
  66. return result
  67. def read_regex(self, regex: Pattern[str]) -> Sequence[str]:
  68. match = regex.match(self.string, self.position.chars)
  69. if match is None:
  70. raise Error("read_regex: Pattern not found")
  71. self.position.advance(self.string[match.start():match.end()])
  72. return match.groups()
  73. def decode_escapes(regex: Pattern[str], string: str) -> str:
  74. def decode_match(match: Match[str]) -> str:
  75. return codecs.decode(match.group(0), 'unicode-escape') # type: ignore
  76. return regex.sub(decode_match, string)
  77. def parse_key(reader: Reader) -> Optional[str]:
  78. char = reader.peek(1)
  79. if char == "#":
  80. return None
  81. elif char == "'":
  82. (key,) = reader.read_regex(_single_quoted_key)
  83. else:
  84. (key,) = reader.read_regex(_unquoted_key)
  85. return key
  86. def parse_unquoted_value(reader: Reader) -> str:
  87. (part,) = reader.read_regex(_unquoted_value)
  88. return re.sub(r"\s+#.*", "", part).rstrip()
  89. def parse_value(reader: Reader) -> str:
  90. char = reader.peek(1)
  91. if char == u"'":
  92. (value,) = reader.read_regex(_single_quoted_value)
  93. return decode_escapes(_single_quote_escapes, value)
  94. elif char == u'"':
  95. (value,) = reader.read_regex(_double_quoted_value)
  96. return decode_escapes(_double_quote_escapes, value)
  97. elif char in (u"", u"\n", u"\r"):
  98. return u""
  99. else:
  100. return parse_unquoted_value(reader)
  101. def parse_binding(reader: Reader) -> Binding:
  102. reader.set_mark()
  103. try:
  104. reader.read_regex(_multiline_whitespace)
  105. if not reader.has_next():
  106. return Binding(
  107. key=None,
  108. value=None,
  109. original=reader.get_marked(),
  110. error=False,
  111. )
  112. reader.read_regex(_export)
  113. key = parse_key(reader)
  114. reader.read_regex(_whitespace)
  115. if reader.peek(1) == "=":
  116. reader.read_regex(_equal_sign)
  117. value: Optional[str] = parse_value(reader)
  118. else:
  119. value = None
  120. reader.read_regex(_comment)
  121. reader.read_regex(_end_of_line)
  122. return Binding(
  123. key=key,
  124. value=value,
  125. original=reader.get_marked(),
  126. error=False,
  127. )
  128. except Error:
  129. reader.read_regex(_rest_of_line)
  130. return Binding(
  131. key=None,
  132. value=None,
  133. original=reader.get_marked(),
  134. error=True,
  135. )
  136. def parse_stream(stream: IO[str]) -> Iterator[Binding]:
  137. reader = Reader(stream)
  138. while reader.has_next():
  139. yield parse_binding(reader)