parser.ts 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
/**
 * Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author David Sehnal <david.sehnal@gmail.com>
 */
/**
 * mmCIF parser.
 *
 * Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
 *
 * Differences I'm aware of:
 * - Except for the keywords (data_, loop_, save_), everything is case sensitive.
 * - The tokens . and ? are treated the same as the values '.' and '?'.
 * - Ignores \ in multiline values:
 *   ;abc\
 *   efg
 *   ;
 *   should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
 *   Post-processing of this is left to the consumer of the data.
 * - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
 */
  23. import * as Data from '../data-model'
  24. import Field from './field'
  25. import { Tokens, TokenBuilder } from '../../common/text/tokenizer'
  26. import Result from '../../result'
  27. import Computation from '../../../utils/computation'
/**
 * Types of supported mmCIF tokens.
 */
const enum CifTokenType {
    Data = 0,        // data_XXX block header
    Save = 1,        // save_XXX frame header, or a bare save_ frame terminator
    Loop = 2,        // the loop_ keyword
    Value = 3,       // any value: plain, quoted, or multiline (;...;)
    ColumnName = 4,  // a _category.field style data name
    Comment = 5,     // # comment; filtered out by moveNext
    End = 6          // end of input reached
}
interface TokenizerState {
    data: string;             // the full input string being tokenized
    position: number;         // current scan position in `data`
    length: number;           // cached data.length
    isEscaped: boolean;       // true when the current token came from a quoted or multiline value
    lineNumber: number;       // 1-based line number of the current position
    tokenType: CifTokenType;  // type of the current token
    tokenStart: number;       // inclusive start index of the current token in `data`
    tokenEnd: number;         // exclusive end index of the current token in `data`
    chunker: Computation.Chunker  // drives chunked asynchronous processing of large loops
}
  51. /**
  52. * Eat everything until a whitespace/newline occurs.
  53. */
  54. function eatValue(state: TokenizerState) {
  55. while (state.position < state.length) {
  56. switch (state.data.charCodeAt(state.position)) {
  57. case 9: // \t
  58. case 10: // \n
  59. case 13: // \r
  60. case 32: // ' '
  61. state.tokenEnd = state.position;
  62. return;
  63. default:
  64. ++state.position;
  65. break;
  66. }
  67. }
  68. state.tokenEnd = state.position;
  69. }
  70. /**
  71. * Eats an escaped values. Handles the "degenerate" cases as well.
  72. *
  73. * "Degenerate" cases:
  74. * - 'xx'x' => xx'x
  75. * - 'xxxNEWLINE => 'xxx
  76. *
  77. */
  78. function eatEscaped(state: TokenizerState, esc: number) {
  79. let next: number, c: number;
  80. ++state.position;
  81. while (state.position < state.length) {
  82. c = state.data.charCodeAt(state.position);
  83. if (c === esc) {
  84. next = state.data.charCodeAt(state.position + 1);
  85. switch (next) {
  86. case 9: // \t
  87. case 10: // \n
  88. case 13: // \r
  89. case 32: // ' '
  90. // get rid of the quotes.
  91. state.tokenStart++;
  92. state.tokenEnd = state.position;
  93. state.isEscaped = true;
  94. ++state.position;
  95. return;
  96. default:
  97. if (next === void 0) { // = "end of stream"
  98. // get rid of the quotes.
  99. state.tokenStart++;
  100. state.tokenEnd = state.position;
  101. state.isEscaped = true;
  102. ++state.position;
  103. return;
  104. }
  105. ++state.position;
  106. break;
  107. }
  108. } else {
  109. // handle 'xxxNEWLINE => 'xxx
  110. if (c === 10 || c === 13) {
  111. state.tokenEnd = state.position;
  112. return;
  113. }
  114. ++state.position;
  115. }
  116. }
  117. state.tokenEnd = state.position;
  118. }
  119. /**
  120. * Eats a multiline token of the form NL;....NL;
  121. */
  122. function eatMultiline(state: TokenizerState) {
  123. let prev = 59, pos = state.position + 1, c: number;
  124. while (pos < state.length) {
  125. c = state.data.charCodeAt(pos);
  126. if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
  127. state.position = pos + 1;
  128. // get rid of the ;
  129. state.tokenStart++;
  130. // remove trailing newlines
  131. pos--;
  132. c = state.data.charCodeAt(pos);
  133. while (c === 10 || c === 13) {
  134. pos--;
  135. c = state.data.charCodeAt(pos);
  136. }
  137. state.tokenEnd = pos + 1;
  138. state.isEscaped = true;
  139. return;
  140. } else {
  141. // handle line numbers
  142. if (c === 13) { // \r
  143. state.lineNumber++;
  144. } else if (c === 10 && prev !== 13) { // \r\n
  145. state.lineNumber++;
  146. }
  147. prev = c;
  148. ++pos;
  149. }
  150. }
  151. state.position = pos;
  152. return prev;
  153. }
  154. /**
  155. * Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
  156. */
  157. function skipCommentLine(state: TokenizerState) {
  158. while (state.position < state.length) {
  159. let c = state.data.charCodeAt(state.position);
  160. if (c === 10 || c === 13) {
  161. return;
  162. }
  163. ++state.position;
  164. }
  165. }
  166. /**
  167. * Skips all the whitespace - space, tab, newline, CR
  168. * Handles incrementing line count.
  169. */
  170. function skipWhitespace(state: TokenizerState): number {
  171. let prev = 10;
  172. while (state.position < state.length) {
  173. let c = state.data.charCodeAt(state.position);
  174. switch (c) {
  175. case 9: // '\t'
  176. case 32: // ' '
  177. prev = c;
  178. ++state.position;
  179. break;
  180. case 10: // \n
  181. // handle \r\n
  182. if (prev !== 13) {
  183. ++state.lineNumber;
  184. }
  185. prev = c;
  186. ++state.position;
  187. break;
  188. case 13: // \r
  189. prev = c;
  190. ++state.position;
  191. ++state.lineNumber;
  192. break;
  193. default:
  194. return prev;
  195. }
  196. }
  197. return prev;
  198. }
  199. function isData(state: TokenizerState): boolean {
  200. // here we already assume the 5th char is _ and that the length >= 5
  201. // d/D
  202. let c = state.data.charCodeAt(state.tokenStart);
  203. if (c !== 68 && c !== 100) return false;
  204. // a/A
  205. c = state.data.charCodeAt(state.tokenStart + 1);
  206. if (c !== 65 && c !== 97) return false;
  207. // t/t
  208. c = state.data.charCodeAt(state.tokenStart + 2);
  209. if (c !== 84 && c !== 116) return false;
  210. // a/A
  211. c = state.data.charCodeAt(state.tokenStart + 3);
  212. if (c !== 65 && c !== 97) return false;
  213. return true;
  214. }
  215. function isSave(state: TokenizerState): boolean {
  216. // here we already assume the 5th char is _ and that the length >= 5
  217. // s/S
  218. let c = state.data.charCodeAt(state.tokenStart);
  219. if (c !== 83 && c !== 115) return false;
  220. // a/A
  221. c = state.data.charCodeAt(state.tokenStart + 1);
  222. if (c !== 65 && c !== 97) return false;
  223. // v/V
  224. c = state.data.charCodeAt(state.tokenStart + 2);
  225. if (c !== 86 && c !== 118) return false;
  226. // e/E
  227. c = state.data.charCodeAt(state.tokenStart + 3);
  228. if (c !== 69 && c !== 101) return false;
  229. return true;
  230. }
  231. function isLoop(state: TokenizerState): boolean {
  232. // here we already assume the 5th char is _ and that the length >= 5
  233. if (state.tokenEnd - state.tokenStart !== 5) return false;
  234. // l/L
  235. let c = state.data.charCodeAt(state.tokenStart);
  236. if (c !== 76 && c !== 108) return false;
  237. // o/O
  238. c = state.data.charCodeAt(state.tokenStart + 1);
  239. if (c !== 79 && c !== 111) return false;
  240. // o/O
  241. c = state.data.charCodeAt(state.tokenStart + 2);
  242. if (c !== 79 && c !== 111) return false;
  243. // p/P
  244. c = state.data.charCodeAt(state.tokenStart + 3);
  245. if (c !== 80 && c !== 112) return false;
  246. return true;
  247. }
  248. /**
  249. * Checks if the current token shares the namespace with string at <start,end).
  250. */
  251. function isNamespace(state: TokenizerState, start: number, end: number): boolean {
  252. let i: number,
  253. nsLen = end - start,
  254. offset = state.tokenStart - start,
  255. tokenLen = state.tokenEnd - state.tokenStart;
  256. if (tokenLen < nsLen) return false;
  257. for (i = start; i < end; ++i) {
  258. if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset)) return false;
  259. }
  260. if (nsLen === tokenLen) return true;
  261. if (state.data.charCodeAt(i + offset) === 46) { // .
  262. return true;
  263. }
  264. return false;
  265. }
  266. /**
  267. * Returns the index of '.' in the current token. If no '.' is present, returns currentTokenEnd.
  268. */
  269. function getNamespaceEnd(state: TokenizerState): number {
  270. let i: number;
  271. for (i = state.tokenStart; i < state.tokenEnd; ++i) {
  272. if (state.data.charCodeAt(i) === 46) return i;
  273. }
  274. return i;
  275. }
  276. /**
  277. * Get the namespace string. endIndex is obtained by the getNamespaceEnd() function.
  278. */
  279. function getNamespace(state: TokenizerState, endIndex: number) {
  280. return state.data.substring(state.tokenStart, endIndex);
  281. }
  282. /**
  283. * String representation of the current token.
  284. */
  285. function getTokenString(state: TokenizerState) {
  286. return state.data.substring(state.tokenStart, state.tokenEnd);
  287. }
  288. /**
  289. * Move to the next token.
  290. */
  291. function moveNextInternal(state: TokenizerState) {
  292. let prev = skipWhitespace(state);
  293. if (state.position >= state.length) {
  294. state.tokenType = CifTokenType.End;
  295. return;
  296. }
  297. state.tokenStart = state.position;
  298. state.tokenEnd = state.position;
  299. state.isEscaped = false;
  300. let c = state.data.charCodeAt(state.position);
  301. switch (c) {
  302. case 35: // #, comment
  303. skipCommentLine(state);
  304. state.tokenType = CifTokenType.Comment;
  305. break;
  306. case 34: // ", escaped value
  307. case 39: // ', escaped value
  308. eatEscaped(state, c);
  309. state.tokenType = CifTokenType.Value;
  310. break;
  311. case 59: // ;, possible multiline value
  312. // multiline value must start at the beginning of the line.
  313. if (prev === 10 || prev === 13) { // /n or /r
  314. eatMultiline(state);
  315. } else {
  316. eatValue(state);
  317. }
  318. state.tokenType = CifTokenType.Value;
  319. break;
  320. default:
  321. eatValue(state);
  322. // escaped is always Value
  323. if (state.isEscaped) {
  324. state.tokenType = CifTokenType.Value;
  325. // _ always means column name
  326. } else if (state.data.charCodeAt(state.tokenStart) === 95) { // _
  327. state.tokenType = CifTokenType.ColumnName;
  328. // 5th char needs to be _ for data_ or loop_
  329. } else if (state.tokenEnd - state.tokenStart >= 5 && state.data.charCodeAt(state.tokenStart + 4) === 95) {
  330. if (isData(state)) state.tokenType = CifTokenType.Data;
  331. else if (isSave(state)) state.tokenType = CifTokenType.Save;
  332. else if (isLoop(state)) state.tokenType = CifTokenType.Loop;
  333. else state.tokenType = CifTokenType.Value;
  334. // all other tests failed, we are at Value token.
  335. } else {
  336. state.tokenType = CifTokenType.Value;
  337. }
  338. break;
  339. }
  340. }
  341. /**
  342. * Moves to the next non-comment token.
  343. */
  344. function moveNext(state: TokenizerState) {
  345. moveNextInternal(state);
  346. while (state.tokenType === CifTokenType.Comment) moveNextInternal(state);
  347. }
  348. function createTokenizer(data: string, ctx: Computation.Context): TokenizerState {
  349. return {
  350. data,
  351. length: data.length,
  352. position: 0,
  353. tokenStart: 0,
  354. tokenEnd: 0,
  355. tokenType: CifTokenType.End,
  356. lineNumber: 1,
  357. isEscaped: false,
  358. chunker: Computation.chunker(ctx, 1000000)
  359. };
  360. }
/**
 * Helper shape of the category result.
 */
interface CifCategoryResult {
    hasError: boolean;     // true when parsing the category failed
    errorLine: number;     // line on which the error occurred (0 when no error)
    errorMessage: string;  // human-readable error description ('' when no error)
}
  369. /**
  370. * Reads a category containing a single row.
  371. */
  372. function handleSingle(tokenizer: TokenizerState, categories: { [name: string]: Data.Category }): CifCategoryResult {
  373. const nsStart = tokenizer.tokenStart, nsEnd = getNamespaceEnd(tokenizer);
  374. const name = getNamespace(tokenizer, nsEnd);
  375. const fields = Object.create(null);
  376. let readingNames = true;
  377. while (readingNames) {
  378. if (tokenizer.tokenType !== CifTokenType.ColumnName || !isNamespace(tokenizer, nsStart, nsEnd)) {
  379. readingNames = false;
  380. break;
  381. }
  382. const fieldName = getTokenString(tokenizer).substring(name.length + 1);
  383. moveNext(tokenizer);
  384. if (tokenizer.tokenType as any !== CifTokenType.Value) {
  385. return {
  386. hasError: true,
  387. errorLine: tokenizer.lineNumber,
  388. errorMessage: 'Expected value.'
  389. }
  390. }
  391. fields[fieldName] = Field({ data: tokenizer.data, indices: [tokenizer.tokenStart, tokenizer.tokenEnd], count: 1 }, 1);
  392. moveNext(tokenizer);
  393. }
  394. categories[name] = Data.Category(1, fields);
  395. return {
  396. hasError: false,
  397. errorLine: 0,
  398. errorMessage: ''
  399. };
  400. }
interface LoopReadState {
    tokenizer: TokenizerState,  // shared tokenizer being advanced across chunks
    tokens: Tokens[],           // one token buffer per loop column
    fieldCount: number,         // number of columns in the loop
    tokenCount: number          // total value tokens consumed so far
}
  407. function readLoopChunk(state: LoopReadState, chunkSize: number) {
  408. const { tokenizer, tokens, fieldCount } = state;
  409. let tokenCount = state.tokenCount;
  410. let counter = 0;
  411. while (tokenizer.tokenType === CifTokenType.Value && counter < chunkSize) {
  412. TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.tokenStart, tokenizer.tokenEnd);
  413. moveNext(tokenizer);
  414. counter++;
  415. }
  416. state.tokenCount = tokenCount;
  417. return counter;
  418. }
  419. function readLoopChunks(state: LoopReadState) {
  420. return state.tokenizer.chunker.process(
  421. chunkSize => readLoopChunk(state, chunkSize),
  422. update => update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
  423. }
  424. /**
  425. * Reads a loop.
  426. */
  427. async function handleLoop(tokenizer: TokenizerState, categories: { [name: string]: Data.Category }): Promise<CifCategoryResult> {
  428. const loopLine = tokenizer.lineNumber;
  429. moveNext(tokenizer);
  430. const name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
  431. const fieldNames: string[] = [];
  432. while (tokenizer.tokenType === CifTokenType.ColumnName) {
  433. fieldNames[fieldNames.length] = getTokenString(tokenizer).substring(name.length + 1);
  434. moveNext(tokenizer);
  435. }
  436. const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
  437. const tokens: Tokens[] = [];
  438. const fieldCount = fieldNames.length;
  439. for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer, rowCountEstimate);
  440. const state: LoopReadState = {
  441. fieldCount,
  442. tokenCount: 0,
  443. tokenizer,
  444. tokens
  445. };
  446. await readLoopChunks(state);
  447. if (state.tokenCount % fieldCount !== 0) {
  448. return {
  449. hasError: true,
  450. errorLine: tokenizer.lineNumber,
  451. errorMessage: 'The number of values for loop starting at line ' + loopLine + ' is not a multiple of the number of columns.'
  452. };
  453. }
  454. const rowCount = (state.tokenCount / fieldCount) | 0;
  455. const fields = Object.create(null);
  456. for (let i = 0; i < fieldCount; i++) {
  457. fields[fieldNames[i]] = Field(tokens[i], rowCount);
  458. }
  459. categories[name] = Data.Category(rowCount, fields);
  460. return {
  461. hasError: false,
  462. errorLine: 0,
  463. errorMessage: ''
  464. };
  465. }
  466. /**
  467. * Creates an error result.
  468. */
  469. function error(line: number, message: string) {
  470. return Result.error<Data.File>(message, line);
  471. }
  472. /**
  473. * Creates a data result.
  474. */
  475. function result(data: Data.File) {
  476. return Result.success(data);
  477. }
/**
 * Parses an mmCIF file: iterates tokens, dispatching on data_ blocks,
 * save_ frames, loop_ categories, and single-row categories.
 *
 * @param data the full mmCIF file content.
 * @param ctx computation context used for progress reporting.
 * @returns CifParserResult wrapper of the result.
 */
async function parseInternal(data: string, ctx: Computation.Context) {
    const dataBlocks: Data.Block[] = [];
    const tokenizer = createTokenizer(data, ctx);
    let blockHeader: string = '';
    let blockCategories = Object.create(null);
    let inSaveFrame = false
    // the next three initial values are never used in valid files
    let saveFrames: Data.SafeFrame[] = [];
    let saveCategories = Object.create(null);
    let saveFrame: Data.SafeFrame = Data.SafeFrame(saveCategories, '');
    // NOTE(review): ctx.update is not awaited here — presumably fire-and-forget; confirm.
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });
    moveNext(tokenizer);
    while (tokenizer.tokenType !== CifTokenType.End) {
        let token = tokenizer.tokenType;
        // Data block
        if (token === CifTokenType.Data) {
            if (inSaveFrame) {
                return error(tokenizer.lineNumber, "Unexpected data block inside a save frame.");
            }
            // flush the previous block (if it had any categories) before starting a new one
            if (Object.keys(blockCategories).length > 0) {
                dataBlocks.push(Data.Block(blockCategories, blockHeader, saveFrames));
            }
            // skip the 5-char 'data_' prefix to get the block name
            blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
            blockCategories = Object.create(null);
            saveFrames = []
            moveNext(tokenizer);
        // Save frame
        } else if (token === CifTokenType.Save) {
            // skip the 5-char 'save_' prefix; an empty name means a frame terminator
            const saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
            if (saveHeader.length === 0) {
                // closing save_: keep the frame only if it collected any categories
                if (Object.keys(saveCategories).length > 0) {
                    saveFrames[saveFrames.length] = saveFrame
                }
                inSaveFrame = false;
            } else {
                if (inSaveFrame) {
                    return error(tokenizer.lineNumber, "Save frames cannot be nested.");
                }
                inSaveFrame = true;
                saveCategories = Object.create(null);
                saveFrame = Data.SafeFrame(saveCategories, saveHeader);
            }
            moveNext(tokenizer);
        // Loop
        } else if (token === CifTokenType.Loop) {
            // categories inside a save frame go to the frame, not the block
            const cat = await handleLoop(tokenizer, inSaveFrame ? saveCategories : blockCategories);
            if (cat.hasError) {
                return error(cat.errorLine, cat.errorMessage);
            }
        // Single row
        } else if (token === CifTokenType.ColumnName) {
            const cat = handleSingle(tokenizer, inSaveFrame ? saveCategories : blockCategories);
            if (cat.hasError) {
                return error(cat.errorLine, cat.errorMessage);
            }
        // Out of options
        } else {
            return error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.');
        }
    }
    // Check if the latest save frame was closed.
    if (inSaveFrame) {
        return error(tokenizer.lineNumber, "Unfinished save frame (`" + saveFrame.header + "`).");
    }
    // flush the final block
    if (Object.keys(blockCategories).length > 0) {
        dataBlocks.push(Data.Block(blockCategories, blockHeader, saveFrames));
    }
    return result(Data.File(dataBlocks));
}
  552. export default function parse(data: string) {
  553. return Computation.create<Result<Data.File>>(async ctx => {
  554. return await parseInternal(data, ctx);
  555. });
  556. }