// Tokenizer.js
  'use strict';

  // ---------------------------------------------------------------------------
  // Dependencies
  // ---------------------------------------------------------------------------
  var CssSyntaxError = require('./error');
  var constants = require('./const');
  var utils = require('./utils');

  // Tables provided by ./const
  var TYPE = constants.TYPE;
  var NAME = constants.NAME;
  var SYMBOL_TYPE = constants.SYMBOL_TYPE;

  // Scanning helpers provided by ./utils
  var firstCharOffset = utils.firstCharOffset;
  var cmpStr = utils.cmpStr;
  var isNumber = utils.isNumber;
  var findWhiteSpaceStart = utils.findWhiteSpaceStart;
  var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
  var findCommentEnd = utils.findCommentEnd;
  var findStringEnd = utils.findStringEnd;
  var findNumberEnd = utils.findNumberEnd;
  var findIdentifierEnd = utils.findIdentifierEnd;
  var findUrlRawEnd = utils.findUrlRawEnd;

  // ---------------------------------------------------------------------------
  // Token types
  // ---------------------------------------------------------------------------

  // Type reported when there is no token (see lookupType() and next())
  var NULL = 0;

  // Multi-character token types
  var WHITESPACE = TYPE.WhiteSpace;
  var IDENTIFIER = TYPE.Identifier;
  var NUMBER = TYPE.Number;
  var STRING = TYPE.String;
  var COMMENT = TYPE.Comment;
  var PUNCTUATOR = TYPE.Punctuator;
  var CDO = TYPE.CDO;
  var CDC = TYPE.CDC;
  var ATRULE = TYPE.Atrule;
  var FUNCTION = TYPE.Function;
  var URL = TYPE.Url;
  var RAW = TYPE.Raw;

  // Character codes of the line terminators compared against charCodeAt() results
  var N = 10; // line feed
  var F = 12; // form feed
  var R = 13; // carriage return

  // Single-character types; the scanner compares these directly with character codes
  var STAR = TYPE.Asterisk;
  var SLASH = TYPE.Solidus;
  var FULLSTOP = TYPE.FullStop;
  var PLUSSIGN = TYPE.PlusSign;
  var HYPHENMINUS = TYPE.HyphenMinus;
  var GREATERTHANSIGN = TYPE.GreaterThanSign;
  var LESSTHANSIGN = TYPE.LessThanSign;
  var EXCLAMATIONMARK = TYPE.ExclamationMark;
  var COMMERCIALAT = TYPE.CommercialAt;
  var QUOTATIONMARK = TYPE.QuotationMark;
  var APOSTROPHE = TYPE.Apostrophe;
  var LEFTPARENTHESIS = TYPE.LeftParenthesis;
  var RIGHTPARENTHESIS = TYPE.RightParenthesis;
  var LEFTCURLYBRACKET = TYPE.LeftCurlyBracket;
  var RIGHTCURLYBRACKET = TYPE.RightCurlyBracket;
  var LEFTSQUAREBRACKET = TYPE.LeftSquareBracket;
  var RIGHTSQUAREBRACKET = TYPE.RightSquareBracket;

  // ---------------------------------------------------------------------------
  // Buffer layout
  // ---------------------------------------------------------------------------

  // Smallest size used when allocating the line/column tables
  var MIN_BUFFER_SIZE = 16 * 1024;

  // A packed entry is `(type << TYPE_SHIFT) | offset`: the low 24 bits carry
  // the offset, the bits above them carry the type
  var OFFSET_MASK = 0x00FFFFFF;
  var TYPE_SHIFT = 24;

  // fallback on Array when TypedArray is not supported
  var SafeUint32Array = Array;
  if (typeof Uint32Array !== 'undefined') {
      SafeUint32Array = Uint32Array;
  }
  /**
   * Builds the per-offset line and column tables for `source` and stores them
   * on `tokenizer` (`lines`, `columns`), then marks them as computed.
   *
   * Numbering starts from `tokenizer.startLine` / `tokenizer.startColumn` at the
   * offset returned by `firstCharOffset(source)`. LF, CR, FF and the CR LF pair
   * each end a line; both characters of a CR LF pair are reported on the line
   * they terminate. One extra entry is written for the end-of-source offset.
   *
   * @param {Object} tokenizer - object holding `lines`, `columns`, `startLine`, `startColumn`
   * @param {string} source - text to index
   */
  function computeLinesAndColumns(tokenizer, source) {
      var length = source.length;
      var pos = firstCharOffset(source);
      var lineTable = tokenizer.lines;
      var columnTable = tokenizer.columns;
      var currentLine = tokenizer.startLine;
      var currentColumn = tokenizer.startColumn;

      // reuse the existing tables when they can hold every offset plus the end marker
      if (lineTable === null || lineTable.length < length + 1) {
          lineTable = new SafeUint32Array(Math.max(length + 1024, MIN_BUFFER_SIZE));
          columnTable = new SafeUint32Array(lineTable.length);
      }

      while (pos < length) {
          var charCode = source.charCodeAt(pos);

          lineTable[pos] = currentLine;
          columnTable[pos] = currentColumn;
          currentColumn++;

          if (charCode === N || charCode === R || charCode === F) {
              var isCRLF =
                  charCode === R &&
                  pos + 1 < length &&
                  source.charCodeAt(pos + 1) === N;

              if (isCRLF) {
                  // the LF of a CR LF pair stays on the line being closed
                  pos++;
                  lineTable[pos] = currentLine;
                  columnTable[pos] = currentColumn;
              }

              currentLine++;
              currentColumn = 1;
          }

          pos++;
      }

      // position just past the last character
      lineTable[pos] = currentLine;
      columnTable[pos] = currentColumn;

      tokenizer.linesAnsColumnsComputed = true;
      tokenizer.lines = lineTable;
      tokenizer.columns = columnTable;
  }
  /**
   * Scans `source` starting at `startPos` and stores the token table on `tokenizer`.
   *
   * Written to `tokenizer`:
   * - `offsetAndType`: entry i packs token i as `(type << TYPE_SHIFT) | endOffset`;
   *   the entry right after the last token holds the final scan offset.
   * - `balance`: for a matched bracket token, the index of its pair. When a closing
   *   bracket is matched, tokens after its opener that still hold the default are
   *   pointed at the closing token. Everything else holds `source.length`.
   * - `tokenCount`: number of tokens produced.
   *
   * Existing buffers on `tokenizer` are reused when `offsetAndType` is large enough.
   *
   * @param {Object} tokenizer - receives `offsetAndType`, `balance` and `tokenCount`
   * @param {string} source - text to tokenize
   * @param {number} startPos - offset of the first character to scan
   */
  function tokenLayout(tokenizer, source, startPos) {
      var sourceLength = source.length;
      var offsetAndType = tokenizer.offsetAndType;
      var balance = tokenizer.balance;
      var tokenCount = 0;
      var prevType = 0;
      var offset = startPos;
      var anchor = 0;             // start offset of the most recent identifier
      var balanceCloseCode = 0;   // char code that closes the innermost open bracket
      var balanceStart = 0;       // (close code << TYPE_SHIFT) | index of innermost open bracket
      var balancePrev = 0;

      if (offsetAndType === null || offsetAndType.length < sourceLength + 1) {
          offsetAndType = new SafeUint32Array(sourceLength + 1024);
          balance = new SafeUint32Array(sourceLength + 1024);
      }

      while (offset < sourceLength) {
          var code = source.charCodeAt(offset);
          var type = code < 0x80 ? SYMBOL_TYPE[code] : IDENTIFIER;

          // default: the token is not part of a matched pair
          balance[tokenCount] = sourceLength;

          switch (type) {
              case WHITESPACE:
                  offset = findWhiteSpaceEnd(source, offset + 1);
                  break;

              case PUNCTUATOR:
                  switch (code) {
                      case balanceCloseCode:
                          // pop the innermost open bracket and link the pair both ways
                          balancePrev = balanceStart & OFFSET_MASK;
                          balanceStart = balance[balancePrev];
                          balanceCloseCode = balanceStart >> TYPE_SHIFT;
                          balance[tokenCount] = balancePrev;
                          balance[balancePrev++] = tokenCount;
                          // tokens inside the pair that are still unassigned point to the closer
                          for (; balancePrev < tokenCount; balancePrev++) {
                              if (balance[balancePrev] === sourceLength) {
                                  balance[balancePrev] = tokenCount;
                              }
                          }
                          break;

                      // an opening bracket temporarily stores the previous stack head
                      // in its own balance slot and becomes the new head
                      case LEFTSQUAREBRACKET:
                          balance[tokenCount] = balanceStart;
                          balanceCloseCode = RIGHTSQUAREBRACKET;
                          balanceStart = (balanceCloseCode << TYPE_SHIFT) | tokenCount;
                          break;

                      case LEFTCURLYBRACKET:
                          balance[tokenCount] = balanceStart;
                          balanceCloseCode = RIGHTCURLYBRACKET;
                          balanceStart = (balanceCloseCode << TYPE_SHIFT) | tokenCount;
                          break;

                      case LEFTPARENTHESIS:
                          balance[tokenCount] = balanceStart;
                          balanceCloseCode = RIGHTPARENTHESIS;
                          balanceStart = (balanceCloseCode << TYPE_SHIFT) | tokenCount;
                          break;
                  }

                  // /*
                  if (code === STAR && prevType === SLASH) {
                      type = COMMENT;
                      offset = findCommentEnd(source, offset + 1);
                      tokenCount--; // rewrite prev token
                      break;
                  }

                  // edge case for -.123 and +.123
                  if (code === FULLSTOP && (prevType === PLUSSIGN || prevType === HYPHENMINUS)) {
                      if (offset + 1 < sourceLength && isNumber(source.charCodeAt(offset + 1))) {
                          type = NUMBER;
                          offset = findNumberEnd(source, offset + 2, false);
                          tokenCount--; // rewrite prev token
                          break;
                      }
                  }

                  // <!--
                  if (code === EXCLAMATIONMARK && prevType === LESSTHANSIGN) {
                      if (offset + 2 < sourceLength &&
                          source.charCodeAt(offset + 1) === HYPHENMINUS &&
                          source.charCodeAt(offset + 2) === HYPHENMINUS) {
                          type = CDO;
                          offset = offset + 3;
                          tokenCount--; // rewrite prev token
                          break;
                      }
                  }

                  // -->
                  if (code === HYPHENMINUS && prevType === HYPHENMINUS) {
                      if (offset + 1 < sourceLength && source.charCodeAt(offset + 1) === GREATERTHANSIGN) {
                          type = CDC;
                          offset = offset + 2;
                          tokenCount--; // rewrite prev token
                          break;
                      }
                  }

                  // ident(
                  if (code === LEFTPARENTHESIS && prevType === IDENTIFIER) {
                      offset = offset + 1;
                      tokenCount--; // rewrite prev token
                      // the merged token takes over the bracket bookkeeping of `(`
                      balance[tokenCount] = balance[tokenCount + 1];
                      balanceStart--;

                      // 4 char length identifier and equal to `url(` (case insensitive)
                      if (offset - anchor === 4 && cmpStr(source, anchor, offset, 'url(')) {
                          // special case for url() because it can contain any symbols sequence with few exceptions
                          anchor = findWhiteSpaceEnd(source, offset);
                          code = source.charCodeAt(anchor);
                          if (code !== LEFTPARENTHESIS &&
                              code !== RIGHTPARENTHESIS &&
                              code !== QUOTATIONMARK &&
                              code !== APOSTROPHE) {
                              // url(
                              offsetAndType[tokenCount++] = (URL << TYPE_SHIFT) | offset;
                              balance[tokenCount] = sourceLength;

                              // ws*
                              if (anchor !== offset) {
                                  offsetAndType[tokenCount++] = (WHITESPACE << TYPE_SHIFT) | anchor;
                                  balance[tokenCount] = sourceLength;
                              }

                              // raw
                              type = RAW;
                              offset = findUrlRawEnd(source, anchor);
                          } else {
                              type = URL;
                          }
                      } else {
                          type = FUNCTION;
                      }
                      break;
                  }

                  // plain punctuator: its type is its character code
                  type = code;
                  offset = offset + 1;
                  break;

              case NUMBER:
                  offset = findNumberEnd(source, offset + 1, prevType !== FULLSTOP);

                  // merge number with a preceding dot, dash or plus
                  if (prevType === FULLSTOP ||
                      prevType === HYPHENMINUS ||
                      prevType === PLUSSIGN) {
                      tokenCount--; // rewrite prev token
                  }
                  break;

              case STRING:
                  offset = findStringEnd(source, offset + 1, code);
                  break;

              default:
                  // A preceding dash is merged into this identifier below, so the
                  // name starts at that dash. Measuring from it keeps `-url(` from
                  // being mistaken for the 4-character `url(` in the `ident(` branch.
                  anchor = prevType === HYPHENMINUS ? offset - 1 : offset;
                  offset = findIdentifierEnd(source, offset);

                  // merge identifier with a preceding dash
                  if (prevType === HYPHENMINUS) {
                      // rewrite prev token
                      tokenCount--;
                      // restore prev prev token type
                      // for case @-prefix-ident
                      prevType = tokenCount === 0 ? 0 : offsetAndType[tokenCount - 1] >> TYPE_SHIFT;
                  }

                  if (prevType === COMMERCIALAT) {
                      // rewrite prev token and change type to <at-keyword-token>
                      tokenCount--;
                      type = ATRULE;
                  }
          }

          offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
          prevType = type;
      }

      // finalize arrays
      offsetAndType[tokenCount] = offset;
      balance[tokenCount] = sourceLength;
      balance[sourceLength] = sourceLength; // prevents false positive balance match with any token

      // brackets that were never closed are reset to "no pair"
      while (balanceStart !== 0) {
          balancePrev = balanceStart & OFFSET_MASK;
          balanceStart = balance[balancePrev];
          balance[balancePrev] = sourceLength;
      }

      tokenizer.offsetAndType = offsetAndType;
      tokenizer.tokenCount = tokenCount;
      tokenizer.balance = balance;
  }
  //
  // Tokenizer
  //

  /**
   * Creates a tokenizer and immediately loads `source` into it via setSource().
   *
   * @constructor
   * @param {*} source - text to tokenize, forwarded to setSource()
   * @param {number} [startOffset] - forwarded to setSource()
   * @param {number} [startLine] - forwarded to setSource()
   * @param {number} [startColumn] - forwarded to setSource()
   */
  var Tokenizer = function(source, startOffset, startLine, startColumn) {
      // token buffers, allocated by the first tokenLayout() run
      this.offsetAndType = null;
      this.balance = null;

      // location tables, allocated by computeLinesAndColumns() when first needed
      this.lines = null;
      this.columns = null;

      this.setSource(source, startOffset, startLine, startColumn);
  };
  Tokenizer.prototype = {
      /**
       * Loads a new source: resets cursor state, tokenizes the text and moves
       * to the first token.
       *
       * @param {*} source - converted with String(); falsy values become ''
       * @param {number} [startOffset=0] - added to offsets reported by getLocation()
       * @param {number} [startLine=1] - line number given to the first character
       * @param {number} [startColumn=1] - column number given to the first character
       */
      setSource: function(source, startOffset, startLine, startColumn) {
          var safeSource = String(source || '');
          var start = firstCharOffset(safeSource);

          this.source = safeSource;
          this.firstCharOffset = start;
          this.startOffset = typeof startOffset === 'undefined' ? 0 : startOffset;
          this.startLine = typeof startLine === 'undefined' ? 1 : startLine;
          this.startColumn = typeof startColumn === 'undefined' ? 1 : startColumn;
          this.linesAnsColumnsComputed = false;

          this.eof = false;
          this.currentToken = -1;
          this.tokenType = 0;
          this.tokenStart = start;
          this.tokenEnd = start;

          tokenLayout(this, safeSource, start);
          this.next();
      },

      /**
       * Type of the token `offset` positions from the current one.
       *
       * @param {number} offset - distance from the current token
       * @returns {number} token type, or NULL past the last token
       */
      lookupType: function(offset) {
          offset += this.currentToken;

          if (offset < this.tokenCount) {
              return this.offsetAndType[offset] >> TYPE_SHIFT;
          }

          return NULL;
      },

      /**
       * Type of the first non-whitespace token at or after `offset` positions
       * from the current one.
       *
       * @param {number} offset - distance from the current token
       * @returns {number} token type, or NULL when only whitespace remains
       */
      lookupNonWSType: function(offset) {
          offset += this.currentToken;

          for (var type; offset < this.tokenCount; offset++) {
              type = this.offsetAndType[offset] >> TYPE_SHIFT;

              if (type !== WHITESPACE) {
                  return type;
              }
          }

          return NULL;
      },

      /**
       * Compares the text of the token `offset` positions from the current one
       * with `referenceStr` using cmpStr().
       *
       * @param {number} offset - distance from the current token
       * @param {string} referenceStr - string handed to cmpStr()
       * @returns {boolean} cmpStr() result, or false past the last token
       */
      lookupValue: function(offset, referenceStr) {
          offset += this.currentToken;

          if (offset < this.tokenCount) {
              // token 0 has no predecessor entry: it starts at the first
              // character of the source (after a skipped prefix, if any)
              var start = offset > 0
                  ? this.offsetAndType[offset - 1] & OFFSET_MASK
                  : this.firstCharOffset;

              return cmpStr(
                  this.source,
                  start,
                  this.offsetAndType[offset] & OFFSET_MASK,
                  referenceStr
              );
          }

          return false;
      },

      /**
       * Start offset of token number `tokenNum`.
       *
       * @param {number} tokenNum - absolute token index
       * @returns {number} offset in the source
       */
      getTokenStart: function(tokenNum) {
          if (tokenNum === this.currentToken) {
              return this.tokenStart;
          }

          if (tokenNum > 0) {
              // a token starts where the previous one ends; indexes past the
              // last token map to the final scan offset
              return tokenNum < this.tokenCount
                  ? this.offsetAndType[tokenNum - 1] & OFFSET_MASK
                  : this.offsetAndType[this.tokenCount] & OFFSET_MASK;
          }

          return this.firstCharOffset;
      },

      /**
       * Start of the current token, or, when the previous token is whitespace,
       * the start of that whitespace.
       *
       * @returns {number} offset in the source
       */
      getOffsetExcludeWS: function() {
          if (this.currentToken > 0) {
              if ((this.offsetAndType[this.currentToken - 1] >> TYPE_SHIFT) === WHITESPACE) {
                  return this.currentToken > 1
                      ? this.offsetAndType[this.currentToken - 2] & OFFSET_MASK
                      : this.firstCharOffset;
              }
          }

          return this.tokenStart;
      },

      /**
       * Walks forward from `startToken` until a stop token, the end of the
       * enclosing balanced block, or the end of the token list. Balanced
       * blocks met on the way are skipped as a whole.
       *
       * NOTE: the result is measured from the current token, not from
       * `startToken`.
       *
       * @param {number} startToken - absolute index of the first token to examine
       * @param {number} endTokenType1 - stop type, never included
       * @param {number} endTokenType2 - stop type, included when `includeTokenType2` is truthy
       * @param {boolean} includeTokenType2 - whether a token of `endTokenType2` is counted
       * @returns {number} stop index minus the current token index
       */
      getRawLength: function(startToken, endTokenType1, endTokenType2, includeTokenType2) {
          var cursor = startToken;
          var balanceEnd;

          loop:
          for (; cursor < this.tokenCount; cursor++) {
              balanceEnd = this.balance[cursor];

              // balance end points to offset before start
              if (balanceEnd < startToken) {
                  break loop;
              }

              // check token is stop type
              switch (this.offsetAndType[cursor] >> TYPE_SHIFT) {
                  case endTokenType1:
                      break loop;

                  case endTokenType2:
                      if (includeTokenType2) {
                          cursor++;
                      }
                      break loop;

                  default:
                      // fast forward to the end of balanced block
                      if (this.balance[balanceEnd] === cursor) {
                          cursor = balanceEnd;
                      }
              }
          }

          return cursor - this.currentToken;
      },

      /**
       * Tells whether the balance entry of the current token is below `pos`.
       *
       * @param {number} pos - token index to compare with
       * @returns {boolean}
       */
      isBalanceEdge: function(pos) {
          var balanceStart = this.balance[this.currentToken];
          return balanceStart < pos;
      },

      /**
       * @returns {string} source text of the current token
       */
      getTokenValue: function() {
          return this.source.substring(this.tokenStart, this.tokenEnd);
      },

      /**
       * @param {number} start - offset in the source
       * @returns {string} source text from `start` up to the start of the current token
       */
      substrToCursor: function(start) {
          return this.source.substring(start, this.tokenStart);
      },

      /**
       * Moves past the run of whitespace tokens starting at the current token.
       */
      skipWS: function() {
          for (var i = this.currentToken, skipTokenCount = 0; i < this.tokenCount; i++, skipTokenCount++) {
              if ((this.offsetAndType[i] >> TYPE_SHIFT) !== WHITESPACE) {
                  break;
              }
          }

          if (skipTokenCount > 0) {
              this.skip(skipTokenCount);
          }
      },

      /**
       * Moves past any whitespace and comment tokens at the cursor.
       */
      skipSC: function() {
          while (this.tokenType === WHITESPACE || this.tokenType === COMMENT) {
              this.next();
          }
      },

      /**
       * Moves the cursor `tokenCount` tokens forward. Going past the last
       * token ends in the same end-of-input state as next().
       *
       * @param {number} tokenCount - number of tokens to move over
       */
      skip: function(tokenCount) {
          var next = this.currentToken + tokenCount;

          if (next < this.tokenCount) {
              this.currentToken = next;
              // token 0 has no predecessor entry to read its start from
              this.tokenStart = next > 0
                  ? this.offsetAndType[next - 1] & OFFSET_MASK
                  : this.firstCharOffset;
              next = this.offsetAndType[next];
              this.tokenType = next >> TYPE_SHIFT;
              this.tokenEnd = next & OFFSET_MASK;
          } else {
              this.currentToken = this.tokenCount;
              this.next();
          }
      },

      /**
       * Moves the cursor to the next token, or into the end-of-input state
       * (`eof` set, type NULL, start and end at the source length).
       */
      next: function() {
          var next = this.currentToken + 1;

          if (next < this.tokenCount) {
              this.currentToken = next;
              this.tokenStart = this.tokenEnd;
              next = this.offsetAndType[next];
              this.tokenType = next >> TYPE_SHIFT;
              this.tokenEnd = next & OFFSET_MASK;
          } else {
              this.currentToken = this.tokenCount;
              this.eof = true;
              this.tokenType = NULL;
              this.tokenStart = this.tokenEnd = this.source.length;
          }
      },

      /**
       * Requires the current token to be of `tokenType` and moves past it.
       *
       * @param {number} tokenType - expected token type
       * @throws {CssSyntaxError} via error() when the current token has another type
       */
      eat: function(tokenType) {
          if (this.tokenType !== tokenType) {
              var offset = this.tokenStart;
              var message = NAME[tokenType] + ' is expected';

              // tweak message and offset
              if (tokenType === IDENTIFIER) {
                  // when identifier is expected but there is a function or url
                  if (this.tokenType === FUNCTION || this.tokenType === URL) {
                      offset = this.tokenEnd - 1;
                      message += ' but function found';
                  }
              } else {
                  // when test type is part of another token show error for current position + 1
                  // e.g. eat(HYPHENMINUS) will fail on "-foo", but pointing on "-" is odd
                  if (this.source.charCodeAt(this.tokenStart) === tokenType) {
                      offset = offset + 1;
                  }
              }

              this.error(message, offset);
          }

          this.next();
      },

      /**
       * skipWS() followed by eat(tokenType).
       *
       * @param {number} tokenType - expected token type
       */
      eatNonWS: function(tokenType) {
          this.skipWS();
          this.eat(tokenType);
      },

      /**
       * eat(tokenType), returning the text of the token that was current.
       *
       * @param {number} tokenType - expected token type
       * @returns {string} text of the consumed token
       */
      consume: function(tokenType) {
          var value = this.getTokenValue();

          this.eat(tokenType);

          return value;
      },

      /**
       * eat(FUNCTION), returning the token text without its last character.
       *
       * @returns {string} function name
       */
      consumeFunctionName: function() {
          var name = this.source.substring(this.tokenStart, this.tokenEnd - 1);

          this.eat(FUNCTION);

          return name;
      },

      /**
       * skipWS() followed by consume(tokenType).
       *
       * @param {number} tokenType - expected token type
       * @returns {string} text of the consumed token
       */
      consumeNonWS: function(tokenType) {
          this.skipWS();

          return this.consume(tokenType);
      },

      /**
       * Requires the current token to be an identifier matching `name`
       * (compared with cmpStr()) and moves past it.
       *
       * @param {string} name - expected identifier
       * @throws {CssSyntaxError} via error() on mismatch
       */
      expectIdentifier: function(name) {
          if (this.tokenType !== IDENTIFIER || cmpStr(this.source, this.tokenStart, this.tokenEnd, name) === false) {
              this.error('Identifier `' + name + '` is expected');
          }

          this.next();
      },

      /**
       * Location of a source offset. Builds the line/column tables on first use.
       *
       * @param {number} offset - offset in the source
       * @param {string} [filename] - stored as `source` in the result
       * @returns {{source: (string|undefined), offset: number, line: number, column: number}}
       */
      getLocation: function(offset, filename) {
          if (!this.linesAnsColumnsComputed) {
              computeLinesAndColumns(this, this.source);
          }

          return {
              source: filename,
              offset: this.startOffset + offset,
              line: this.lines[offset],
              column: this.columns[offset]
          };
      },

      /**
       * Location of a source range. Builds the line/column tables on first use.
       *
       * @param {number} start - offset of the range start
       * @param {number} end - offset of the range end
       * @param {string} [filename] - stored as `source` in the result
       * @returns {{source: (string|undefined), start: Object, end: Object}}
       */
      getLocationRange: function(start, end, filename) {
          if (!this.linesAnsColumnsComputed) {
              computeLinesAndColumns(this, this.source);
          }

          return {
              source: filename,
              start: {
                  offset: this.startOffset + start,
                  line: this.lines[start],
                  column: this.columns[start]
              },
              end: {
                  offset: this.startOffset + end,
                  line: this.lines[end],
                  column: this.columns[end]
              }
          };
      },

      /**
       * Throws a CssSyntaxError. The reported position is `offset` when it is
       * given and inside the source; otherwise, at end of input, the position
       * returned by findWhiteSpaceStart() for the last character; otherwise
       * the start of the current token.
       *
       * @param {string} [message='Unexpected input'] - error text
       * @param {number} [offset] - offset to report
       * @throws {CssSyntaxError} always
       */
      error: function(message, offset) {
          var location;

          if (typeof offset !== 'undefined' && offset < this.source.length) {
              location = this.getLocation(offset);
          } else if (this.eof) {
              location = this.getLocation(findWhiteSpaceStart(this.source, this.source.length - 1));
          } else {
              location = this.getLocation(this.tokenStart);
          }

          throw new CssSyntaxError(
              message || 'Unexpected input',
              this.source,
              location.offset,
              location.line,
              location.column
          );
      },

      /**
       * Debug helper listing every token.
       *
       * @returns {Array<{idx: number, type: string, chunk: string, balance: number}>}
       */
      dump: function() {
          // the first token starts at the first tokenized character, not
          // necessarily at offset 0
          var offset = this.firstCharOffset;

          return Array.prototype.slice.call(this.offsetAndType, 0, this.tokenCount).map(function(item, idx) {
              var start = offset;
              var end = item & OFFSET_MASK;

              offset = end;

              return {
                  idx: idx,
                  type: NAME[item >> TYPE_SHIFT],
                  chunk: this.source.substring(start, end),
                  balance: this.balance[idx]
              };
          }, this);
      }
  };
  /**
   * Copies every own enumerable key of `from` onto the Tokenizer constructor.
   *
   * @param {Object} from - object whose keys become statics of Tokenizer
   */
  function extendTokenizer(from) {
      var keys = Object.keys(from);

      for (var i = 0; i < keys.length; i++) {
          Tokenizer[keys[i]] = from[keys[i]];
      }
  }

  // expose the error class as a static member
  Tokenizer.CssSyntaxError = CssSyntaxError;

  // expose the constants as static members
  extendTokenizer(constants);

  // expose the utils functions as static members
  extendTokenizer(utils);

  // warm up tokenizer to eliminate code branches that never execute
  // fix soft deoptimizations (insufficient type feedback)
  var warmUpTokenizer = new Tokenizer('\n\r\r\n\f<!---->//""\'\'/*\r\n\f*/1a;.\\31\t\+2{url(a);func();+1.2e3 -.4e-5 .6e+7}');
  warmUpTokenizer.getLocation();

  module.exports = Tokenizer;