parser.ts 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648
  1. /**
  2. * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author David Sehnal <david.sehnal@gmail.com>
  5. */
  6. /**
  7. * mmCIF parser.
  8. *
  9. * Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
  10. *
  11. * Differences I'm aware of:
  12. * - Except keywords (data_, loop_, save_) everything is case sensitive.
  13. * - The tokens . and ? are treated the same as the values '.' and '?'.
  14. * - Ignores \ in the multiline values:
  15. * ;abc\
  16. * efg
  17. * ;
  18. * should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
  19. * Post processing of this is left to the consumer of the data.
  20. * - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
  21. *
  22. */
  23. import * as Data from '../data-model'
  24. import { Tokens, TokenBuilder } from '../../common/text/tokenizer'
  25. import { ReaderResult as Result } from '../../result'
  26. import { Task, RuntimeContext, chunkedSubtask } from 'mol-task'
/**
 * Types of supported mmCIF tokens.
 */
const enum CifTokenType {
    Data = 0, // data_<name> block header
    Save = 1, // save_<name> frame header (bare save_ closes the frame)
    Loop = 2, // loop_ keyword
    Value = 3, // plain, quoted, or multiline value
    ColumnName = 4, // _category.field style data name
    Comment = 5, // #-prefixed comment (skipped by moveNext)
    End = 6 // end of input
}
/** Mutable state threaded through all tokenizer functions. */
interface TokenizerState {
    data: string, // full input text
    position: number, // current read offset into data
    length: number, // cached data.length
    isEscaped: boolean, // current token was quoted/multiline (delimiters stripped)
    lineNumber: number, // 1-based line of the current position, for error reporting
    tokenType: CifTokenType, // classification of the current token
    tokenStart: number, // inclusive start offset of the current token
    tokenEnd: number, // exclusive end offset of the current token
    runtimeCtx: RuntimeContext // used to report parsing progress
}
  50. /**
  51. * Eat everything until a whitespace/newline occurs.
  52. */
  53. function eatValue(state: TokenizerState) {
  54. while (state.position < state.length) {
  55. switch (state.data.charCodeAt(state.position)) {
  56. case 9: // \t
  57. case 10: // \n
  58. case 13: // \r
  59. case 32: // ' '
  60. state.tokenEnd = state.position;
  61. return;
  62. default:
  63. ++state.position;
  64. break;
  65. }
  66. }
  67. state.tokenEnd = state.position;
  68. }
  69. /**
  70. * Eats an escaped value. Handles the "degenerate" cases as well.
  71. *
  72. * "Degenerate" cases:
  73. * - 'xx'x' => xx'x
  74. * - 'xxxNEWLINE => 'xxx
  75. *
  76. */
  77. function eatEscaped(state: TokenizerState, esc: number) {
  78. let next: number, c: number;
  79. ++state.position;
  80. while (state.position < state.length) {
  81. c = state.data.charCodeAt(state.position);
  82. if (c === esc) {
  83. next = state.data.charCodeAt(state.position + 1);
  84. switch (next) {
  85. case 9: // \t
  86. case 10: // \n
  87. case 13: // \r
  88. case 32: // ' '
  89. // get rid of the quotes.
  90. state.tokenStart++;
  91. state.tokenEnd = state.position;
  92. state.isEscaped = true;
  93. ++state.position;
  94. return;
  95. default:
  96. if (next === void 0) { // = "end of stream"
  97. // get rid of the quotes.
  98. state.tokenStart++;
  99. state.tokenEnd = state.position;
  100. state.isEscaped = true;
  101. ++state.position;
  102. return;
  103. }
  104. ++state.position;
  105. break;
  106. }
  107. } else {
  108. // handle 'xxxNEWLINE => 'xxx
  109. if (c === 10 || c === 13) {
  110. state.tokenEnd = state.position;
  111. return;
  112. }
  113. ++state.position;
  114. }
  115. }
  116. state.tokenEnd = state.position;
  117. }
  118. /**
  119. * Eats a multiline token of the form NL;....NL;
  120. */
  121. function eatMultiline(state: TokenizerState) {
  122. let prev = 59, pos = state.position + 1, c: number;
  123. while (pos < state.length) {
  124. c = state.data.charCodeAt(pos);
  125. if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
  126. state.position = pos + 1;
  127. // get rid of the ;
  128. state.tokenStart++;
  129. // remove trailing newlines
  130. pos--;
  131. c = state.data.charCodeAt(pos);
  132. while (c === 10 || c === 13) {
  133. pos--;
  134. c = state.data.charCodeAt(pos);
  135. }
  136. state.tokenEnd = pos + 1;
  137. state.isEscaped = true;
  138. return;
  139. } else {
  140. // handle line numbers
  141. if (c === 13) { // \r
  142. state.lineNumber++;
  143. } else if (c === 10 && prev !== 13) { // \r\n
  144. state.lineNumber++;
  145. }
  146. prev = c;
  147. ++pos;
  148. }
  149. }
  150. state.position = pos;
  151. return prev;
  152. }
  153. /**
  154. * Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
  155. */
  156. function skipCommentLine(state: TokenizerState) {
  157. while (state.position < state.length) {
  158. let c = state.data.charCodeAt(state.position);
  159. if (c === 10 || c === 13) {
  160. return;
  161. }
  162. ++state.position;
  163. }
  164. }
  165. /**
  166. * Skips all the whitespace - space, tab, newline, CR
  167. * Handles incrementing line count.
  168. */
  169. function skipWhitespace(state: TokenizerState): number {
  170. let prev = 10;
  171. while (state.position < state.length) {
  172. let c = state.data.charCodeAt(state.position);
  173. switch (c) {
  174. case 9: // '\t'
  175. case 32: // ' '
  176. prev = c;
  177. ++state.position;
  178. break;
  179. case 10: // \n
  180. // handle \r\n
  181. if (prev !== 13) {
  182. ++state.lineNumber;
  183. }
  184. prev = c;
  185. ++state.position;
  186. break;
  187. case 13: // \r
  188. prev = c;
  189. ++state.position;
  190. ++state.lineNumber;
  191. break;
  192. default:
  193. return prev;
  194. }
  195. }
  196. return prev;
  197. }
  198. function isData(state: TokenizerState): boolean {
  199. // here we already assume the 5th char is _ and that the length >= 5
  200. // d/D
  201. let c = state.data.charCodeAt(state.tokenStart);
  202. if (c !== 68 && c !== 100) return false;
  203. // a/A
  204. c = state.data.charCodeAt(state.tokenStart + 1);
  205. if (c !== 65 && c !== 97) return false;
  206. // t/t
  207. c = state.data.charCodeAt(state.tokenStart + 2);
  208. if (c !== 84 && c !== 116) return false;
  209. // a/A
  210. c = state.data.charCodeAt(state.tokenStart + 3);
  211. if (c !== 65 && c !== 97) return false;
  212. return true;
  213. }
  214. function isSave(state: TokenizerState): boolean {
  215. // here we already assume the 5th char is _ and that the length >= 5
  216. // s/S
  217. let c = state.data.charCodeAt(state.tokenStart);
  218. if (c !== 83 && c !== 115) return false;
  219. // a/A
  220. c = state.data.charCodeAt(state.tokenStart + 1);
  221. if (c !== 65 && c !== 97) return false;
  222. // v/V
  223. c = state.data.charCodeAt(state.tokenStart + 2);
  224. if (c !== 86 && c !== 118) return false;
  225. // e/E
  226. c = state.data.charCodeAt(state.tokenStart + 3);
  227. if (c !== 69 && c !== 101) return false;
  228. return true;
  229. }
  230. function isLoop(state: TokenizerState): boolean {
  231. // here we already assume the 5th char is _ and that the length >= 5
  232. if (state.tokenEnd - state.tokenStart !== 5) return false;
  233. // l/L
  234. let c = state.data.charCodeAt(state.tokenStart);
  235. if (c !== 76 && c !== 108) return false;
  236. // o/O
  237. c = state.data.charCodeAt(state.tokenStart + 1);
  238. if (c !== 79 && c !== 111) return false;
  239. // o/O
  240. c = state.data.charCodeAt(state.tokenStart + 2);
  241. if (c !== 79 && c !== 111) return false;
  242. // p/P
  243. c = state.data.charCodeAt(state.tokenStart + 3);
  244. if (c !== 80 && c !== 112) return false;
  245. return true;
  246. }
  247. /**
  248. * Checks if the current token shares the namespace with string at <start,end).
  249. */
  250. function isNamespace(state: TokenizerState, start: number, end: number): boolean {
  251. let i: number,
  252. nsLen = end - start,
  253. offset = state.tokenStart - start,
  254. tokenLen = state.tokenEnd - state.tokenStart;
  255. if (tokenLen < nsLen) return false;
  256. for (i = start; i < end; ++i) {
  257. if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset)) return false;
  258. }
  259. if (nsLen === tokenLen) return true;
  260. if (state.data.charCodeAt(i + offset) === 46) { // .
  261. return true;
  262. }
  263. return false;
  264. }
  265. /**
  266. * Returns the index of '.' in the current token. If no '.' is present, returns currentTokenEnd.
  267. */
  268. function getNamespaceEnd(state: TokenizerState): number {
  269. let i: number;
  270. for (i = state.tokenStart; i < state.tokenEnd; ++i) {
  271. if (state.data.charCodeAt(i) === 46) return i;
  272. }
  273. return i;
  274. }
  275. /**
  276. * Get the namespace string. endIndex is obtained by the getNamespaceEnd() function.
  277. */
  278. function getNamespace(state: TokenizerState, endIndex: number) {
  279. return state.data.substring(state.tokenStart, endIndex);
  280. }
  281. /**
  282. * String representation of the current token.
  283. */
  284. function getTokenString(state: TokenizerState) {
  285. return state.data.substring(state.tokenStart, state.tokenEnd);
  286. }
/**
 * Move to the next token. Classifies the token from its first character;
 * `prev` (the last skipped whitespace) is needed because a multiline value
 * must begin at the start of a line.
 */
function moveNextInternal(state: TokenizerState) {
    let prev = skipWhitespace(state);
    if (state.position >= state.length) {
        state.tokenType = CifTokenType.End;
        return;
    }
    state.tokenStart = state.position;
    state.tokenEnd = state.position;
    state.isEscaped = false;
    let c = state.data.charCodeAt(state.position);
    switch (c) {
        case 35: // #, comment
            skipCommentLine(state);
            state.tokenType = CifTokenType.Comment;
            break;
        case 34: // ", escaped value
        case 39: // ', escaped value
            eatEscaped(state, c);
            state.tokenType = CifTokenType.Value;
            break;
        case 59: // ;, possible multiline value
            // multiline value must start at the beginning of the line.
            if (prev === 10 || prev === 13) { // \n or \r
                eatMultiline(state);
            } else {
                // a ; in the middle of a line is an ordinary value character
                eatValue(state);
            }
            state.tokenType = CifTokenType.Value;
            break;
        default:
            eatValue(state);
            // escaped is always Value
            if (state.isEscaped) {
                state.tokenType = CifTokenType.Value;
                // _ always means column name
            } else if (state.data.charCodeAt(state.tokenStart) === 95) { // _
                state.tokenType = CifTokenType.ColumnName;
                // 5th char needs to be _ for data_, save_ or loop_
            } else if (state.tokenEnd - state.tokenStart >= 5 && state.data.charCodeAt(state.tokenStart + 4) === 95) {
                if (isData(state)) state.tokenType = CifTokenType.Data;
                else if (isSave(state)) state.tokenType = CifTokenType.Save;
                else if (isLoop(state)) state.tokenType = CifTokenType.Loop;
                else state.tokenType = CifTokenType.Value;
                // all other tests failed, we are at Value token.
            } else {
                state.tokenType = CifTokenType.Value;
            }
            break;
    }
}
  340. /**
  341. * Moves to the next non-comment token.
  342. */
  343. function moveNext(state: TokenizerState) {
  344. moveNextInternal(state);
  345. while (state.tokenType === CifTokenType.Comment) moveNextInternal(state);
  346. }
  347. function createTokenizer(data: string, runtimeCtx: RuntimeContext): TokenizerState {
  348. return {
  349. data,
  350. length: data.length,
  351. position: 0,
  352. tokenStart: 0,
  353. tokenEnd: 0,
  354. tokenType: CifTokenType.End,
  355. lineNumber: 1,
  356. isEscaped: false,
  357. runtimeCtx
  358. };
  359. }
/**
 * Helper shape of the category result.
 */
interface CifCategoryResult {
    hasError: boolean; // true when parsing the category failed
    errorLine: number; // line the error occurred at (0 on success)
    errorMessage: string; // human readable description ('' on success)
}
// Accumulates the categories parsed for a single data block or save frame.
type FrameContext = {
    categoryNames: string[],
    categories: { [name: string]: Data.CifCategory }
}
// Creates an empty frame context; `categories` uses a null prototype so
// arbitrary category names cannot collide with Object.prototype members.
function FrameContext(): FrameContext {
    return { categoryNames: [], categories: Object.create(null) };
}
  375. /**
  376. * Reads a category containing a single row.
  377. */
  378. function handleSingle(tokenizer: TokenizerState, ctx: FrameContext): CifCategoryResult {
  379. const nsStart = tokenizer.tokenStart, nsEnd = getNamespaceEnd(tokenizer);
  380. const name = getNamespace(tokenizer, nsEnd);
  381. const fields = Object.create(null);
  382. const fieldNames: string[] = [];
  383. let readingNames = true;
  384. while (readingNames) {
  385. if (tokenizer.tokenType !== CifTokenType.ColumnName || !isNamespace(tokenizer, nsStart, nsEnd)) {
  386. readingNames = false;
  387. break;
  388. }
  389. const fieldName = getTokenString(tokenizer).substring(name.length + 1);
  390. moveNext(tokenizer);
  391. if (tokenizer.tokenType as any !== CifTokenType.Value) {
  392. return {
  393. hasError: true,
  394. errorLine: tokenizer.lineNumber,
  395. errorMessage: 'Expected value.'
  396. }
  397. }
  398. fields[fieldName] = Data.CifField.ofTokens({ data: tokenizer.data, indices: [tokenizer.tokenStart, tokenizer.tokenEnd], count: 1 });
  399. fieldNames[fieldNames.length] = fieldName;
  400. moveNext(tokenizer);
  401. }
  402. const catName = name.substr(1);
  403. ctx.categories[catName] = Data.CifCategory(catName, 1, fieldNames, fields);
  404. ctx.categoryNames.push(catName);
  405. return {
  406. hasError: false,
  407. errorLine: 0,
  408. errorMessage: ''
  409. };
  410. }
/** Mutable state shared across the chunked reading of a loop_'s values. */
interface LoopReadState {
    tokenizer: TokenizerState,
    tokens: Tokens[], // one token builder per column
    fieldCount: number, // number of columns in the loop
    tokenCount: number // total number of values consumed so far
}
  417. function readLoopChunk(chunkSize: number, state: LoopReadState) {
  418. const { tokenizer, tokens, fieldCount } = state;
  419. let tokenCount = state.tokenCount;
  420. let counter = 0;
  421. while (tokenizer.tokenType === CifTokenType.Value && counter < chunkSize) {
  422. TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.tokenStart, tokenizer.tokenEnd);
  423. moveNext(tokenizer);
  424. counter++;
  425. }
  426. state.tokenCount = tokenCount;
  427. return counter;
  428. }
  429. function updateLoopChunk(ctx: RuntimeContext, state: LoopReadState) {
  430. return ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length });
  431. }
  432. // const readLoopChunks = ChunkedSubtask(1000000,
  433. // (size, state: LoopReadState) => readLoopChunk(state, size),
  434. // (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
  435. /**
  436. * Reads a loop.
  437. */
  438. async function handleLoop(tokenizer: TokenizerState, ctx: FrameContext): Promise<CifCategoryResult> {
  439. const loopLine = tokenizer.lineNumber;
  440. moveNext(tokenizer);
  441. const name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
  442. const fieldNames: string[] = [];
  443. while (tokenizer.tokenType === CifTokenType.ColumnName) {
  444. fieldNames[fieldNames.length] = getTokenString(tokenizer).substring(name.length + 1);
  445. moveNext(tokenizer);
  446. }
  447. const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
  448. const tokens: Tokens[] = [];
  449. const fieldCount = fieldNames.length;
  450. for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer.data, rowCountEstimate);
  451. const state: LoopReadState = {
  452. fieldCount,
  453. tokenCount: 0,
  454. tokenizer,
  455. tokens
  456. };
  457. await chunkedSubtask(tokenizer.runtimeCtx, 1000000, state, readLoopChunk, updateLoopChunk);
  458. if (state.tokenCount % fieldCount !== 0) {
  459. return {
  460. hasError: true,
  461. errorLine: tokenizer.lineNumber,
  462. errorMessage: `The number of values for loop starting at line ${loopLine} is not a multiple of the number of columns.`
  463. };
  464. }
  465. const rowCount = (state.tokenCount / fieldCount) | 0;
  466. const fields = Object.create(null);
  467. for (let i = 0; i < fieldCount; i++) {
  468. fields[fieldNames[i]] = Data.CifField.ofTokens(tokens[i]);
  469. }
  470. const catName = name.substr(1);
  471. ctx.categories[catName] = Data.CifCategory(catName, rowCount, fieldNames, fields);
  472. ctx.categoryNames.push(catName);
  473. return {
  474. hasError: false,
  475. errorLine: 0,
  476. errorMessage: ''
  477. };
  478. }
/**
 * Creates an error result carrying the line number of the offending input.
 */
function error(line: number, message: string) {
    return Result.error<Data.CifFile>(message, line);
}
/**
 * Wraps a successfully parsed file in a data result.
 */
function result(data: Data.CifFile) {
    return Result.success(data);
}
  491. /**
  492. * Parses an mmCIF file.
  493. *
  494. * @returns CifParserResult wrapper of the result.
  495. */
  496. async function parseInternal(data: string, runtimeCtx: RuntimeContext) {
  497. const dataBlocks: Data.CifBlock[] = [];
  498. const tokenizer = createTokenizer(data, runtimeCtx);
  499. let blockHeader = '';
  500. let blockCtx = FrameContext();
  501. let inSaveFrame = false;
  502. // the next three initial values are never used in valid files
  503. let saveFrames: Data.CifFrame[] = [];
  504. let saveCtx = FrameContext();
  505. let saveFrame: Data.CifFrame = Data.CifSafeFrame(saveCtx.categoryNames, saveCtx.categories, '');
  506. runtimeCtx.update({ message: 'Parsing...', current: 0, max: data.length });
  507. moveNext(tokenizer);
  508. while (tokenizer.tokenType !== CifTokenType.End) {
  509. let token = tokenizer.tokenType;
  510. // Data block
  511. if (token === CifTokenType.Data) {
  512. if (inSaveFrame) {
  513. return error(tokenizer.lineNumber, 'Unexpected data block inside a save frame.');
  514. }
  515. if (blockCtx.categoryNames.length > 0) {
  516. dataBlocks.push(Data.CifBlock(blockCtx.categoryNames, blockCtx.categories, blockHeader, saveFrames));
  517. }
  518. blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
  519. blockCtx = FrameContext();
  520. saveFrames = []
  521. moveNext(tokenizer);
  522. // Save frame
  523. } else if (token === CifTokenType.Save) {
  524. const saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
  525. if (saveHeader.length === 0) {
  526. if (saveCtx.categoryNames.length > 0) {
  527. saveFrames[saveFrames.length] = saveFrame;
  528. }
  529. inSaveFrame = false;
  530. } else {
  531. if (inSaveFrame) {
  532. return error(tokenizer.lineNumber, 'Save frames cannot be nested.');
  533. }
  534. inSaveFrame = true;
  535. const safeHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
  536. saveCtx = FrameContext();
  537. saveFrame = Data.CifSafeFrame(saveCtx.categoryNames, saveCtx.categories, safeHeader);
  538. }
  539. moveNext(tokenizer);
  540. // Loop
  541. } else if (token === CifTokenType.Loop) {
  542. const cat = await handleLoop(tokenizer, inSaveFrame ? saveCtx : blockCtx);
  543. if (cat.hasError) {
  544. return error(cat.errorLine, cat.errorMessage);
  545. }
  546. // Single row
  547. } else if (token === CifTokenType.ColumnName) {
  548. const cat = handleSingle(tokenizer, inSaveFrame ? saveCtx : blockCtx);
  549. if (cat.hasError) {
  550. return error(cat.errorLine, cat.errorMessage);
  551. }
  552. // Out of options
  553. } else {
  554. return error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.');
  555. }
  556. }
  557. // Check if the latest save frame was closed.
  558. if (inSaveFrame) {
  559. return error(tokenizer.lineNumber, `Unfinished save frame (${saveFrame.header}).`);
  560. }
  561. if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
  562. dataBlocks.push(Data.CifBlock(blockCtx.categoryNames, blockCtx.categories, blockHeader, saveFrames));
  563. }
  564. return result(Data.CifFile(dataBlocks));
  565. }
  566. export default function parse(data: string) {
  567. return Task.create<Result<Data.CifFile>>('Parse CIF', async ctx => {
  568. return await parseInternal(data, ctx);
  569. });
  570. }