// parser.ts
  1. /**
  2. * Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author David Sehnal <david.sehnal@gmail.com>
  5. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  6. */
  7. /**
  8. * mmCIF parser.
  9. *
  10. * Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
  11. *
  12. * Differences I'm aware of:
* - Except for the keywords (data_, loop_, save_), everything is case sensitive.
  14. * - The tokens . and ? are treated the same as the values '.' and '?'.
  15. * - Ignores \ in the multiline values:
  16. * ;abc\
  17. * efg
  18. * ;
  19. * should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
  20. * Post processing of this is left to the consumer of the data.
  21. * - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
  22. *
  23. */
  24. import * as Data from '../data-model';
  25. import { Tokens, TokenBuilder, Tokenizer } from '../../common/text/tokenizer';
  26. import { ReaderResult as Result } from '../../result';
  27. import { Task, RuntimeContext, chunkedSubtask } from '../../../../mol-task';
/**
 * Types of supported mmCIF tokens.
 */
const enum CifTokenType {
    Data = 0,       // data_<name> block header
    Save = 1,       // save_<name> frame header (bare save_ closes a frame)
    Loop = 2,       // loop_ keyword
    Value = 3,      // plain, quoted, or multiline value
    ColumnName = 4, // token starting with '_', e.g. _category.field
    Comment = 5,    // '#' comment; skipped by moveNext
    End = 6         // end of input
}
/** Mutable state of the mmCIF tokenizer. */
interface TokenizerState {
    data: string,              // the whole input string
    position: number,          // current read offset into data
    length: number,            // cached data.length
    isEscaped: boolean,        // current token was quoted/multiline (always a Value)
    isImportGet: boolean,      // currently inside an _import.get [...] expression
    inSaveFrame: boolean,      // between save_<name> and the closing bare save_
    lineNumber: number,        // 1-based line number, used for error reporting
    tokenType: CifTokenType,   // type of the current token
    tokenStart: number,        // current token spans [tokenStart, tokenEnd) in data
    tokenEnd: number,
    runtimeCtx: RuntimeContext // used for progress updates while reading loops
}
  53. /**
  54. * Eat everything until a whitespace/newline occurs.
  55. */
  56. function eatValue(state: TokenizerState) {
  57. while (state.position < state.length) {
  58. switch (state.data.charCodeAt(state.position)) {
  59. case 9: // \t
  60. case 10: // \n
  61. case 13: // \r
  62. case 32: // ' '
  63. state.tokenEnd = state.position;
  64. return;
  65. default:
  66. ++state.position;
  67. break;
  68. }
  69. }
  70. state.tokenEnd = state.position;
  71. }
  72. /**
  73. * Eats an escaped value. Handles the "degenerate" cases as well.
  74. *
  75. * "Degenerate" cases:
  76. * - 'xx'x' => xx'x
  77. * - 'xxxNEWLINE => 'xxx
  78. *
  79. */
  80. function eatEscaped(state: TokenizerState, esc: number) {
  81. let next: number, c: number;
  82. ++state.position;
  83. while (state.position < state.length) {
  84. c = state.data.charCodeAt(state.position);
  85. if (c === esc) {
  86. next = state.data.charCodeAt(state.position + 1);
  87. switch (next) {
  88. case 9: // \t
  89. case 10: // \n
  90. case 13: // \r
  91. case 32: // ' '
  92. // get rid of the quotes.
  93. state.tokenStart++;
  94. state.tokenEnd = state.position;
  95. state.isEscaped = true;
  96. ++state.position;
  97. return;
  98. default:
  99. if (next === void 0) { // = "end of stream"
  100. // get rid of the quotes.
  101. state.tokenStart++;
  102. state.tokenEnd = state.position;
  103. state.isEscaped = true;
  104. ++state.position;
  105. return;
  106. }
  107. ++state.position;
  108. break;
  109. }
  110. } else {
  111. // handle 'xxxNEWLINE => 'xxx
  112. if (c === 10 || c === 13) {
  113. state.tokenEnd = state.position;
  114. return;
  115. }
  116. ++state.position;
  117. }
  118. }
  119. state.tokenEnd = state.position;
  120. }
  121. /**
  122. * Eats an escaped value "triple quote" (''') value.
  123. */
  124. function eatTripleQuote(state: TokenizerState) {
  125. // skip the '''
  126. state.position += 3;
  127. while (state.position < state.length) {
  128. if (state.data.charCodeAt(state.position) === 39 /* ' */ && isTripleQuoteAtPosition(state)) {
  129. // get rid of the quotes.
  130. state.tokenStart += 3;
  131. state.tokenEnd = state.position;
  132. state.isEscaped = true;
  133. state.position += 3;
  134. return;
  135. }
  136. ++state.position;
  137. }
  138. state.tokenEnd = state.position;
  139. }
/**
 * Eats a multiline token of the form NL;....NL;
 * The token excludes the leading ';' and any trailing newlines.
 * NOTE(review): when the input ends without a closing ';', tokenEnd is not
 * updated here, and the trailing `return prev` value is ignored by the
 * caller (moveNextInternal) — confirm both are intended.
 */
function eatMultiline(state: TokenizerState) {
    let prev = 59, pos = state.position + 1, c: number;
    while (pos < state.length) {
        c = state.data.charCodeAt(pos);
        // a ';' at the start of a line closes the multiline value
        if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
            state.position = pos + 1;
            // get rid of the ;
            state.tokenStart++;
            // remove trailing newlines
            pos--;
            c = state.data.charCodeAt(pos);
            while (c === 10 || c === 13) {
                pos--;
                c = state.data.charCodeAt(pos);
            }
            state.tokenEnd = pos + 1;
            state.isEscaped = true;
            return;
        } else {
            // handle line numbers
            if (c === 13) { // \r
                state.lineNumber++;
            } else if (c === 10 && prev !== 13) { // count \r\n only once
                state.lineNumber++;
            }
            prev = c;
            ++pos;
        }
    }
    state.position = pos;
    return prev;
}
  175. function eatImportGet(state: TokenizerState) {
  176. // _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
  177. // skipWhitespace(state)
  178. while (state.position < state.length) {
  179. switch (state.data.charCodeAt(state.position)) {
  180. case 93: // ]
  181. ++state.position;
  182. state.tokenEnd = state.position;
  183. state.isImportGet = false;
  184. return;
  185. default:
  186. ++state.position;
  187. break;
  188. }
  189. }
  190. }
  191. /**
  192. * Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
  193. */
  194. function skipCommentLine(state: TokenizerState) {
  195. while (state.position < state.length) {
  196. const c = state.data.charCodeAt(state.position);
  197. if (c === 10 || c === 13) {
  198. return;
  199. }
  200. ++state.position;
  201. }
  202. }
/**
 * Skips all the whitespace - space, tab, newline, CR.
 * Handles incrementing line count: \r, \n and \r\n each count as one line.
 *
 * @returns the char code of the last whitespace consumed (10 initially),
 *          used by the caller to detect "start of line" for ';' multiline values.
 */
function skipWhitespace(state: TokenizerState): number {
    // starts as \n so a token at the very beginning of the input
    // counts as being at the start of a line
    let prev = 10;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        switch (c) {
            case 9: // '\t'
            case 32: // ' '
                prev = c;
                ++state.position;
                break;
            case 10: // \n
                // handle \r\n: the \r already incremented the line count
                if (prev !== 13) {
                    ++state.lineNumber;
                }
                prev = c;
                ++state.position;
                break;
            case 13: // \r
                prev = c;
                ++state.position;
                ++state.lineNumber;
                break;
            default:
                // non-whitespace: stop here
                return prev;
        }
    }
    return prev;
}
  236. /**
  237. * Returns true if there are two consecutive ' in +1 and +2 positions.
  238. */
  239. function isTripleQuoteAtPosition(state: TokenizerState): boolean {
  240. if (state.length - state.position < 2) return false;
  241. if (state.data.charCodeAt(state.position + 1) !== 39) return false; // '
  242. if (state.data.charCodeAt(state.position + 2) !== 39) return false; // '
  243. return true;
  244. }
  245. function isData(state: TokenizerState): boolean {
  246. // here we already assume the 5th char is _ and that the length >= 5
  247. // d/D
  248. let c = state.data.charCodeAt(state.tokenStart);
  249. if (c !== 68 && c !== 100) return false;
  250. // a/A
  251. c = state.data.charCodeAt(state.tokenStart + 1);
  252. if (c !== 65 && c !== 97) return false;
  253. // t/t
  254. c = state.data.charCodeAt(state.tokenStart + 2);
  255. if (c !== 84 && c !== 116) return false;
  256. // a/A
  257. c = state.data.charCodeAt(state.tokenStart + 3);
  258. if (c !== 65 && c !== 97) return false;
  259. return true;
  260. }
  261. function isSave(state: TokenizerState): boolean {
  262. // here we already assume the 5th char is _ and that the length >= 5
  263. // s/S
  264. let c = state.data.charCodeAt(state.tokenStart);
  265. if (c !== 83 && c !== 115) return false;
  266. // a/A
  267. c = state.data.charCodeAt(state.tokenStart + 1);
  268. if (c !== 65 && c !== 97) return false;
  269. // v/V
  270. c = state.data.charCodeAt(state.tokenStart + 2);
  271. if (c !== 86 && c !== 118) return false;
  272. // e/E
  273. c = state.data.charCodeAt(state.tokenStart + 3);
  274. if (c !== 69 && c !== 101) return false;
  275. return true;
  276. }
  277. function isLoop(state: TokenizerState): boolean {
  278. // here we already assume the 5th char is _ and that the length >= 5
  279. if (state.tokenEnd - state.tokenStart !== 5) return false;
  280. // l/L
  281. let c = state.data.charCodeAt(state.tokenStart);
  282. if (c !== 76 && c !== 108) return false;
  283. // o/O
  284. c = state.data.charCodeAt(state.tokenStart + 1);
  285. if (c !== 79 && c !== 111) return false;
  286. // o/O
  287. c = state.data.charCodeAt(state.tokenStart + 2);
  288. if (c !== 79 && c !== 111) return false;
  289. // p/P
  290. c = state.data.charCodeAt(state.tokenStart + 3);
  291. if (c !== 80 && c !== 112) return false;
  292. return true;
  293. }
  294. function isImportGet(state: TokenizerState): boolean {
  295. // _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
  296. if (state.tokenEnd - state.tokenStart !== 11) return false;
  297. if (state.data.charCodeAt(state.tokenStart + 1) !== 105) return false; // i
  298. if (state.data.charCodeAt(state.tokenStart + 2) !== 109) return false; // m
  299. if (state.data.charCodeAt(state.tokenStart + 3) !== 112) return false; // p
  300. if (state.data.charCodeAt(state.tokenStart + 4) !== 111) return false; // o
  301. if (state.data.charCodeAt(state.tokenStart + 5) !== 114) return false; // r
  302. if (state.data.charCodeAt(state.tokenStart + 6) !== 116) return false; // t
  303. if (state.data.charCodeAt(state.tokenStart + 7) !== 46) return false; // .
  304. if (state.data.charCodeAt(state.tokenStart + 8) !== 103) return false; // g
  305. if (state.data.charCodeAt(state.tokenStart + 9) !== 101) return false; // e
  306. if (state.data.charCodeAt(state.tokenStart + 10) !== 116) return false; // t
  307. return true;
  308. }
/**
 * Checks if the current token shares the namespace with string at <start,end).
 * The token matches when it equals the namespace exactly, or continues it
 * with a '.' (i.e. is a field of that category); a mere prefix match
 * (e.g. "_atom_site" vs "_atom_sites") does not count.
 */
function isNamespace(state: TokenizerState, start: number, end: number): boolean {
    let i: number;
    const nsLen = end - start;
    // distance from the namespace span to the current token in `data`
    const offset = state.tokenStart - start;
    const tokenLen = state.tokenEnd - state.tokenStart;
    if (tokenLen < nsLen) return false;
    // compare namespace chars against the token prefix
    for (i = start; i < end; ++i) {
        if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset)) return false;
    }
    // exact match
    if (nsLen === tokenLen) return true;
    // token continues with '.' => a field of the namespace
    if (state.data.charCodeAt(i + offset) === 46) { // .
        return true;
    }
    return false;
}
  327. /**
  328. * Returns the index of '.' in the current token. If no '.' is present, returns currentTokenEnd.
  329. */
  330. function getNamespaceEnd(state: TokenizerState): number {
  331. let i: number;
  332. for (i = state.tokenStart; i < state.tokenEnd; ++i) {
  333. if (state.data.charCodeAt(i) === 46) return i;
  334. }
  335. return i;
  336. }
  337. /**
  338. * Get the namespace string. endIndex is obtained by the getNamespaceEnd() function.
  339. */
  340. function getNamespace(state: TokenizerState, endIndex: number) {
  341. return state.data.substring(state.tokenStart, endIndex);
  342. }
  343. /**
  344. * Returns true if the current token contain no '.', otherwise returns false.
  345. */
  346. function isFlatNamespace(state: TokenizerState): boolean {
  347. let i: number;
  348. for (i = state.tokenStart; i < state.tokenEnd; ++i) {
  349. if (state.data.charCodeAt(i) === 46) return false;
  350. }
  351. return true;
  352. }
  353. /**
  354. * String representation of the current token.
  355. */
  356. function getTokenString(state: TokenizerState) {
  357. return state.data.substring(state.tokenStart, state.tokenEnd);
  358. }
/**
 * Move to the next token: skips whitespace, then classifies the token at
 * the current position and sets tokenStart/tokenEnd/tokenType/isEscaped.
 */
function moveNextInternal(state: TokenizerState) {
    // prev is the last whitespace char consumed; needed to detect
    // whether a ';' sits at the beginning of a line
    const prev = skipWhitespace(state);
    if (state.position >= state.length) {
        state.tokenType = CifTokenType.End;
        return;
    }
    state.tokenStart = state.position;
    state.tokenEnd = state.position;
    state.isEscaped = false;
    const c = state.data.charCodeAt(state.position);
    switch (c) {
        case 35: // #, comment
            skipCommentLine(state);
            state.tokenType = CifTokenType.Comment;
            break;
        case 39: // ', escaped value
            if (isTripleQuoteAtPosition(state)) {
                eatTripleQuote(state);
                state.tokenType = CifTokenType.Value;
                break;
            }
        // NOTE: intentional fallthrough — a single ' is handled like "
        case 34: // ", escaped value
            eatEscaped(state, c);
            state.tokenType = CifTokenType.Value;
            break;
        case 59: // ;, possible multiline value
            // multiline value must start at the beginning of the line.
            if (prev === 10 || prev === 13) { // /n or /r
                eatMultiline(state);
            } else {
                eatValue(state);
            }
            state.tokenType = CifTokenType.Value;
            break;
        default:
            if (state.isImportGet) {
                eatImportGet(state);
            } else {
                eatValue(state);
            }
            // escaped is always Value
            if (state.isEscaped) {
                state.tokenType = CifTokenType.Value;
                // _ means column name, including _import.get
            } else if (state.data.charCodeAt(state.tokenStart) === 95) { // _
                if (state.inSaveFrame && isImportGet(state)) {
                    state.isImportGet = true;
                }
                state.tokenType = CifTokenType.ColumnName;
                // 5th char needs to be _ for data_, save_ or loop_
            } else if (state.tokenEnd - state.tokenStart >= 5 && state.data.charCodeAt(state.tokenStart + 4) === 95) {
                if (isData(state)) state.tokenType = CifTokenType.Data;
                else if (isSave(state)) state.tokenType = CifTokenType.Save;
                else if (isLoop(state)) state.tokenType = CifTokenType.Loop;
                else state.tokenType = CifTokenType.Value;
                // all other tests failed, we are at Value token.
            } else {
                state.tokenType = CifTokenType.Value;
            }
            break;
    }
}
  424. /**
  425. * Moves to the next non-comment token.
  426. */
  427. function moveNext(state: TokenizerState) {
  428. moveNextInternal(state);
  429. while (state.tokenType === CifTokenType.Comment) moveNextInternal(state);
  430. }
  431. function createTokenizer(data: string, runtimeCtx: RuntimeContext): TokenizerState {
  432. return {
  433. data,
  434. length: data.length,
  435. position: 0,
  436. tokenStart: 0,
  437. tokenEnd: 0,
  438. tokenType: CifTokenType.End,
  439. lineNumber: 1,
  440. isEscaped: false,
  441. isImportGet: false,
  442. inSaveFrame: false,
  443. runtimeCtx
  444. };
  445. }
/**
 * Helper shape of the category result.
 */
interface CifCategoryResult {
    hasError: boolean;    // true when parsing the category failed
    errorLine: number;    // line of the error; 0 when there is no error
    errorMessage: string; // empty when there is no error
}
/**
 * Raw parsed contents of a single category, materialized later into a
 * Data.CifCategory by CifCategories().
 */
interface CifCategoryData {
    name: string,     // category name without the leading '_'
    rowCount: number, // number of rows shared by all fields
    fieldNames: string[],
    fields: { [name: string]: Data.CifField }
}
/**
 * Accumulator for the categories of one frame (a data block or save frame).
 * `categoryNames` preserves insertion order; `categoryData` maps a category
 * name (without the leading '_') to its parsed contents.
 */
type FrameContext = {
    categoryNames: string[],
    categoryData: { [name: string]: CifCategoryData }
}
// Creates an empty frame context.
function FrameContext(): FrameContext {
    return { categoryNames: [], categoryData: Object.create(null) };
}
  467. function CifCategories(categoryNames: string[], categoryData: { [name: string]: CifCategoryData }): { [name: string]: Data.CifCategory } {
  468. const categories = Object.create(null);
  469. for (const name of categoryNames) {
  470. const d = categoryData[name];
  471. categories[name] = Data.CifCategory(d.name, d.rowCount, d.fieldNames, d.fields);
  472. }
  473. return categories;
  474. }
  475. function CifBlock(ctx: FrameContext, header: string, saveFrames?: Data.CifFrame[]): Data.CifBlock {
  476. return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header, saveFrames);
  477. }
  478. function CifSaveFrame(ctx: FrameContext, header: string): Data.CifBlock {
  479. return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header);
  480. }
  481. function addFields(ctx: FrameContext, name: string, rowCount: number, fieldNames: string[], fields: { [k: string]: Data.CifField }) {
  482. if (name in ctx.categoryData) {
  483. const cat = ctx.categoryData[name];
  484. cat.fieldNames.push(...fieldNames);
  485. Object.assign(cat.fields, fields);
  486. } else {
  487. ctx.categoryData[name] = { name, rowCount, fieldNames, fields };
  488. ctx.categoryNames.push(name);
  489. }
  490. }
/**
 * Reads a category containing a single row: a run of `_cat.field value`
 * pairs that share the same category namespace. The parsed fields are
 * merged into `ctx`.
 */
function handleSingle(tokenizer: TokenizerState, ctx: FrameContext): CifCategoryResult {
    const nsStart = tokenizer.tokenStart, nsEnd = getNamespaceEnd(tokenizer);
    // category name including the leading '_'
    const name = getNamespace(tokenizer, nsEnd);
    const fields = Object.create(null);
    const fieldNames: string[] = [];
    let readingNames = true;
    while (readingNames) {
        // stop as soon as the next token is not a column of this category
        if (tokenizer.tokenType !== CifTokenType.ColumnName || !isNamespace(tokenizer, nsStart, nsEnd)) {
            readingNames = false;
            break;
        }
        // field name without the '_category.' prefix
        const fieldName = getTokenString(tokenizer).substring(name.length + 1);
        moveNext(tokenizer);
        if (tokenizer.tokenType as any !== CifTokenType.Value) {
            return {
                hasError: true,
                errorLine: tokenizer.lineNumber,
                errorMessage: 'Expected value.'
            };
        }
        // single-row field backed directly by the token span
        fields[fieldName] = Data.CifField.ofTokens({ data: tokenizer.data, indices: [tokenizer.tokenStart, tokenizer.tokenEnd], count: 1 });
        fieldNames[fieldNames.length] = fieldName;
        moveNext(tokenizer);
    }
    // strip the leading '_' from the category name
    addFields(ctx, name.substr(1), 1, fieldNames, fields);
    return {
        hasError: false,
        errorLine: 0,
        errorMessage: ''
    };
}
/**
 * Mutable state threaded through the chunked reading of a loop_'s values.
 */
interface LoopReadState {
    tokenizer: TokenizerState,
    tokens: Tokens[],   // one token builder per loop column
    fieldCount: number, // number of columns in the loop
    tokenCount: number  // total number of value tokens read so far
}
  531. function readLoopChunk(chunkSize: number, state: LoopReadState) {
  532. const { tokenizer, tokens, fieldCount } = state;
  533. let tokenCount = state.tokenCount;
  534. let counter = 0;
  535. while (tokenizer.tokenType === CifTokenType.Value && counter < chunkSize) {
  536. TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.tokenStart, tokenizer.tokenEnd);
  537. moveNext(tokenizer);
  538. counter++;
  539. }
  540. state.tokenCount = tokenCount;
  541. return counter;
  542. }
  543. function updateLoopChunk(ctx: RuntimeContext, state: LoopReadState) {
  544. return ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length });
  545. }
  546. // const readLoopChunks = ChunkedSubtask(1000000,
  547. // (size, state: LoopReadState) => readLoopChunk(state, size),
  548. // (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
/**
 * Reads a loop_ construct: a list of column names followed by a stream of
 * values, consumed in chunks so the runtime context can report progress.
 * The parsed columns are merged into `ctx`.
 */
async function handleLoop(tokenizer: TokenizerState, ctx: FrameContext): Promise<CifCategoryResult> {
    // remember where the loop started for error reporting
    const loopLine = tokenizer.lineNumber;
    moveNext(tokenizer);
    const name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
    // a "flat" loop has column names without a '.'
    const isFlat = isFlatNamespace(tokenizer);
    const fieldNames: string[] = [];
    while (tokenizer.tokenType === CifTokenType.ColumnName) {
        fieldNames[fieldNames.length] = isFlat
            ? getTokenString(tokenizer)
            : getTokenString(tokenizer).substring(name.length + 1);
        moveNext(tokenizer);
    }
    // heuristic pre-sizing: _atom_site usually dominates the file
    const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
    const tokens: Tokens[] = [];
    const fieldCount = fieldNames.length;
    for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer.data, rowCountEstimate);
    const state: LoopReadState = {
        fieldCount,
        tokenCount: 0,
        tokenizer,
        tokens
    };
    await chunkedSubtask(tokenizer.runtimeCtx, 1000000, state, readLoopChunk, updateLoopChunk);
    // every row must supply a value for every column
    if (state.tokenCount % fieldCount !== 0) {
        return {
            hasError: true,
            errorLine: tokenizer.lineNumber,
            errorMessage: `The number of values for loop starting at line ${loopLine} is not a multiple of the number of columns.`
        };
    }
    const rowCount = (state.tokenCount / fieldCount) | 0;
    if (isFlat) {
        // each flat column becomes its own category with a single unnamed field
        for (let i = 0; i < fieldCount; i++) {
            const fields = { '': Data.CifField.ofTokens(tokens[i]) };
            addFields(ctx, fieldNames[i].substr(1), rowCount, [''], fields);
        }
    } else {
        const fields = Object.create(null);
        for (let i = 0; i < fieldCount; i++) {
            fields[fieldNames[i]] = Data.CifField.ofTokens(tokens[i]);
        }
        addFields(ctx, name.substr(1), rowCount, fieldNames, fields);
    }
    return {
        hasError: false,
        errorLine: 0,
        errorMessage: ''
    };
}
/**
 * Creates an error result carrying the message and the offending line.
 */
function error(line: number, message: string) {
    return Result.error<Data.CifFile>(message, line);
}
/**
 * Wraps successfully parsed CIF data into a success result.
 */
function result(data: Data.CifFile) {
    return Result.success(data);
}
  613. /**
  614. * Parses an mmCIF file.
  615. *
  616. * @returns CifParserResult wrapper of the result.
  617. */
  618. async function parseInternal(data: string, runtimeCtx: RuntimeContext) {
  619. const dataBlocks: Data.CifBlock[] = [];
  620. const tokenizer = createTokenizer(data, runtimeCtx);
  621. let blockHeader = '';
  622. let blockCtx = FrameContext();
  623. // the next three initial values are never used in valid files
  624. let saveFrames: Data.CifFrame[] = [];
  625. let saveCtx = FrameContext();
  626. const saveFrame: Data.CifFrame = Data.CifSaveFrame(
  627. saveCtx.categoryNames, CifCategories(saveCtx.categoryNames, saveCtx.categoryData), ''
  628. );
  629. let saveHeader = '';
  630. runtimeCtx.update({ message: 'Parsing...', current: 0, max: data.length });
  631. moveNext(tokenizer);
  632. while (tokenizer.tokenType !== CifTokenType.End) {
  633. const token = tokenizer.tokenType;
  634. // Data block
  635. if (token === CifTokenType.Data) {
  636. if (tokenizer.inSaveFrame) {
  637. return error(tokenizer.lineNumber, 'Unexpected data block inside a save frame.');
  638. }
  639. if (blockCtx.categoryNames.length > 0) {
  640. dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
  641. }
  642. blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
  643. blockCtx = FrameContext();
  644. saveFrames = [];
  645. moveNext(tokenizer);
  646. // Save frame
  647. } else if (token === CifTokenType.Save) {
  648. if (tokenizer.tokenEnd - tokenizer.tokenStart === 5) { // end of save frame
  649. if (saveCtx.categoryNames.length > 0) {
  650. saveFrames[saveFrames.length] = CifSaveFrame(saveCtx, saveHeader);
  651. }
  652. tokenizer.inSaveFrame = false;
  653. } else { // start of save frame
  654. if (tokenizer.inSaveFrame) {
  655. return error(tokenizer.lineNumber, 'Save frames cannot be nested.');
  656. }
  657. tokenizer.inSaveFrame = true;
  658. saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
  659. saveCtx = FrameContext();
  660. // saveFrame = CifSaveFrame(saveCtx, saveHeader);
  661. }
  662. moveNext(tokenizer);
  663. // Loop
  664. } else if (token === CifTokenType.Loop) {
  665. const cat = await handleLoop(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
  666. if (cat.hasError) {
  667. return error(cat.errorLine, cat.errorMessage);
  668. }
  669. // Single row
  670. } else if (token === CifTokenType.ColumnName) {
  671. const cat = handleSingle(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
  672. if (cat.hasError) {
  673. return error(cat.errorLine, cat.errorMessage);
  674. }
  675. // Out of options
  676. } else {
  677. console.log(tokenizer.tokenType, Tokenizer.getTokenString(tokenizer));
  678. return error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.');
  679. }
  680. }
  681. // Check if the latest save frame was closed.
  682. if (tokenizer.inSaveFrame) {
  683. return error(tokenizer.lineNumber, `Unfinished save frame (${saveFrame.header}).`);
  684. }
  685. if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
  686. dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
  687. }
  688. return result(Data.CifFile(dataBlocks));
  689. }
  690. export function parseCifText(data: string) {
  691. return Task.create<Result<Data.CifFile>>('Parse CIF', async ctx => {
  692. return await parseInternal(data, ctx);
  693. });
  694. }