parser.ts 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /**
  2. * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author David Sehnal <david.sehnal@gmail.com>
  5. * @author Panagiotis Tourlas <panagiot_tourlov@hotmail.com>
  6. */
  7. import { Column } from '../../../mol-data/db';
  8. import { Task } from '../../../mol-task';
  9. import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
  10. import { TokenBuilder, Tokenizer } from '../common/text/tokenizer';
  11. import { ReaderResult as Result } from '../result';
  /**
   * Subset of the MolFile V2000 format.
   *
   * Describes the data extracted by this parser: the three header lines, the
   * atom and bond tables, and the formal charges listed in the properties block.
   */
  export interface MolFile {
      /** First line of the file, with surrounding whitespace removed. */
      readonly title: string;
      /** Second line of the file, with surrounding whitespace removed. */
      readonly program: string;
      /** Third line of the file, with surrounding whitespace removed. */
      readonly comment: string;
      /** Data read from the fixed-width atom lines; one row per atom line. */
      readonly atoms: {
          /** Number of atom lines that were read. */
          readonly count: number;
          /** X coordinate field of each atom line. */
          readonly x: Column<number>;
          /** Y coordinate field of each atom line. */
          readonly y: Column<number>;
          /** Z coordinate field of each atom line. */
          readonly z: Column<number>;
          /** Atom symbol field of each atom line. */
          readonly type_symbol: Column<string>;
          /** Charge field (`ccc`) of each atom line, as written in the file. */
          readonly formal_charge: Column<number>;
      };
      /** Data read from the fixed-width bond lines; one row per bond line. */
      readonly bonds: {
          /** Number of bond lines that were read. */
          readonly count: number;
          /** First atom index field of each bond line, as written in the file. */
          readonly atomIdxA: Column<number>;
          /** Second atom index field of each bond line, as written in the file. */
          readonly atomIdxB: Column<number>;
          /** Bond order field of each bond line. */
          readonly order: Column<number>;
      };
      /**
       * Atom index / charge pairs read from an `M  CHG` line of the properties
       * block, or `null` when no such line was encountered.
       */
      readonly formalCharges: {
          /** Atom index of each pair, as written in the file. */
          readonly atomIdx: Column<number>;
          /** Charge of each pair. */
          readonly charge: Column<number>;
      } | null;
  }
  /*
  The atom lines in a .mol file have the following structure:
  xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
  ---------------------------------------------------------------------
  Below is a breakdown of each component and its start/end indices
  (0-based offsets from the start of the line, end exclusive):
  xxxxx.xxxx  (X COORDINATE, 0-10)
  yyyyy.yyyy  (Y COORDINATE, 10-20)
  zzzzz.zzzz  (Z COORDINATE, 20-30)
  _           (30 IS EMPTY)
  aaa         (ATOM SYMBOL, 31-34)
  dd          (MASS DIFF, 34-36)
  ccc         (FORMAL CHARGE, 36-39)
  sss         (ATOM STEREO PARITY, 39-42)
  hhh         (HYDROGEN COUNT+1, 42-45)
  bbb         (STEREO CARE BOX, 45-48)
  vvv         (VALENCE, 48-51)
  HHH         (H0 DESIGNATOR, 51-54)
  rrr         (UNUSED, 54-57)
  iii         (UNUSED, 57-60)
  mmm         (ATOM-ATOM MAPPING NUMBER, 60-63)
  nnn         (INVERSION/RETENTION FLAG, 63-66)
  eee         (EXACT CHANGE FLAG, 66-69)
  */
  /**
   * Reads `count` consecutive atom lines and exposes selected fixed-width
   * fields as typed columns.
   *
   * Only the X/Y/Z coordinates, the atom symbol and the formal charge field are
   * captured; the other fields of an atom line are not read.
   *
   * @param tokenizer Tokenizer whose next line is the first atom line.
   * @param count Number of atom lines to read.
   * @returns One row per atom line for each captured field.
   */
  export function handleAtoms(tokenizer: Tokenizer, count: number): MolFile['atoms'] {
      type Builder = ReturnType<typeof TokenBuilder.create>;

      // Two slots per atom: every token is recorded as a start/end offset pair.
      const newBuilder = (): Builder => TokenBuilder.create(tokenizer.data, count * 2);
      const x = newBuilder();
      const y = newBuilder();
      const z = newBuilder();
      const type_symbol = newBuilder();
      const formal_charge = newBuilder();

      // Trims the range from..to and appends the resulting token to `target`.
      const capture = (target: Builder, from: number, to: number): void => {
          Tokenizer.trim(tokenizer, from, to);
          TokenBuilder.addUnchecked(target, tokenizer.tokenStart, tokenizer.tokenEnd);
      };

      for (let row = 0; row < count; row++) {
          Tokenizer.markLine(tokenizer);
          // Remember the line start and the cursor as markLine left them; the
          // cursor is put back once the fields of this line have been captured.
          const lineStart = tokenizer.tokenStart;
          const resumeAt = tokenizer.position;

          capture(x, lineStart, lineStart + 10);
          capture(y, lineStart + 10, lineStart + 20);
          capture(z, lineStart + 20, lineStart + 30);
          capture(type_symbol, lineStart + 31, lineStart + 34);
          capture(formal_charge, lineStart + 36, lineStart + 39);

          tokenizer.position = resumeAt;
      }

      return {
          count,
          x: TokenColumn(x)(Column.Schema.float),
          y: TokenColumn(y)(Column.Schema.float),
          z: TokenColumn(z)(Column.Schema.float),
          type_symbol: TokenColumn(type_symbol)(Column.Schema.str),
          formal_charge: TokenColumn(formal_charge)(Column.Schema.int)
      };
  }
  /**
   * Reads `count` consecutive bond lines and exposes their first three
   * fixed-width fields as integer columns.
   *
   * Each captured field is three characters wide: the first atom index, the
   * second atom index and the bond order, in that order.
   *
   * @param tokenizer Tokenizer whose next line is the first bond line.
   * @param count Number of bond lines to read.
   * @returns One row per bond line for each captured field.
   */
  export function handleBonds(tokenizer: Tokenizer, count: number): MolFile['bonds'] {
      type Builder = ReturnType<typeof TokenBuilder.create>;
      const FIELD_WIDTH = 3;

      // Two slots per bond: every token is recorded as a start/end offset pair.
      const newBuilder = (): Builder => TokenBuilder.create(tokenizer.data, count * 2);
      const atomIdxA = newBuilder();
      const atomIdxB = newBuilder();
      const order = newBuilder();

      // Trims the FIELD_WIDTH characters starting at `from` and appends the token to `target`.
      const readField = (target: Builder, from: number): void => {
          Tokenizer.trim(tokenizer, from, from + FIELD_WIDTH);
          TokenBuilder.addUnchecked(target, tokenizer.tokenStart, tokenizer.tokenEnd);
      };

      for (let row = 0; row < count; row++) {
          Tokenizer.markLine(tokenizer);
          // Remember the line start and the cursor as markLine left them; the
          // cursor is put back once the fields of this line have been captured.
          const lineStart = tokenizer.tokenStart;
          const resumeAt = tokenizer.position;

          readField(atomIdxA, lineStart);
          readField(atomIdxB, lineStart + FIELD_WIDTH);
          readField(order, lineStart + 2 * FIELD_WIDTH);

          tokenizer.position = resumeAt;
      }

      return {
          count,
          atomIdxA: TokenColumn(atomIdxA)(Column.Schema.int),
          atomIdxB: TokenColumn(atomIdxB)(Column.Schema.int),
          order: TokenColumn(order)(Column.Schema.int)
      };
  }
  /**
   * Parses one `M  CHG` property line into columns of atom indices and charges.
   *
   * Layout of the line, as offsets from `lineStart`:
   *
   *     M  CHG  3   1  -1   2   0   2  -1
   *
   *     6 .. 9                    number of (atom index, charge) pairs that follow
   *     9 + 8*i  .. 13 + 8*i      atom index of pair `i`
   *     13 + 8*i .. 17 + 8*i      charge of pair `i`
   *
   * Once the pairs have been read, the tokenizer is moved on to the next line.
   *
   * @param tokenizer Tokenizer over the file contents.
   * @param lineStart Offset in `tokenizer.data` at which the `M  CHG` line starts.
   * @returns The atom indices and charges listed on the line, one row per pair.
   */
  export function handleFormalCharges(tokenizer: Tokenizer, lineStart: number): MolFile['formalCharges'] {
      Tokenizer.trim(tokenizer, lineStart + 6, lineStart + 9);
      const parsedCount = parseInt(Tokenizer.getTokenString(tokenizer), 10);
      // A blank or malformed count field parses to NaN; read no pairs in that
      // case instead of passing NaN on as a builder size.
      const numOfCharges = parsedCount > 0 ? parsedCount : 0;

      // Two slots per pair: every token is recorded as a start/end offset pair.
      const atomIdx = TokenBuilder.create(tokenizer.data, numOfCharges * 2);
      const charge = TokenBuilder.create(tokenizer.data, numOfCharges * 2);

      for (let i = 0; i < numOfCharges; ++i) {
          // Each pair occupies eight characters: four for the atom index
          // followed by four for the charge.
          const pairStart = lineStart + 9 + (i * 8);

          Tokenizer.trim(tokenizer, pairStart, pairStart + 4);
          TokenBuilder.addUnchecked(atomIdx, tokenizer.tokenStart, tokenizer.tokenEnd);

          Tokenizer.trim(tokenizer, pairStart + 4, pairStart + 8);
          TokenBuilder.addUnchecked(charge, tokenizer.tokenStart, tokenizer.tokenEnd);
      }

      // Once the line is read, move to the next one. Otherwise the cursor is
      // left inside this line when the caller continues.
      Tokenizer.eatLine(tokenizer);

      return {
          atomIdx: TokenColumn(atomIdx)(Column.Schema.int),
          charge: TokenColumn(charge)(Column.Schema.int),
      };
  }
  /**
   * Walks the properties block line by line and calls an appropriate handler
   * based on the property type found in columns 3-6 of each line.
   * (For now it only calls the formal charge handler, additional handlers can
   * be added for other properties.)
   *
   * Reading stops at the `END` line, or at the end of the data if no `END`
   * line is present.
   *
   * NOTE(review): when the block contains more than one `CHG` line, the result
   * of a later line replaces that of an earlier one. Confirm against the format
   * specification whether the lines should be merged instead.
   *
   * @param tokenizer Tokenizer positioned at the first line of the properties block.
   * @returns The formal charges of the last `CHG` line read, or `null` if there was none.
   */
  export function handlePropertiesBlock(tokenizer: Tokenizer): MolFile['formalCharges'] {
      let formalCharges: MolFile['formalCharges'] = null;
      const dataEnd = tokenizer.data.length;

      // Bounded by the input itself rather than by a fixed number of lines, so
      // long property blocks are read completely and a missing `END` line
      // cannot make the loop run past the data.
      while (tokenizer.position < dataEnd) {
          const { position: s } = tokenizer;

          Tokenizer.trim(tokenizer, s + 3, s + 6);
          const propertyType = Tokenizer.getTokenString(tokenizer);
          if (propertyType === 'END') break;

          Tokenizer.eatLine(tokenizer);

          switch (propertyType) {
              case 'CHG':
                  formalCharges = handleFormalCharges(tokenizer, s);
                  break;
              default:
                  break;
          }

          // The loop relies on the cursor moving forward on every pass; bail
          // out if it did not, so a misbehaving handler cannot spin forever.
          if (tokenizer.position <= s) break;
      }

      return formalCharges;
  }
  /**
   * Parses the text of a MolFile: three header lines, the counts line, the
   * atom lines, the bond lines and finally the properties block.
   *
   * The sections share one tokenizer, so they are read strictly in file order.
   *
   * @param data Full text of the file.
   * @returns A successful reader result wrapping the parsed `MolFile`.
   */
  function parseInternal(data: string): Result<MolFile> {
      const tokenizer = Tokenizer(data);
      const nextTrimmedLine = (): string => Tokenizer.readLine(tokenizer).trim();

      const title = nextTrimmedLine();
      const program = nextTrimmedLine();
      const comment = nextTrimmedLine();

      // The counts line starts with two three-character fields: the number of
      // atom lines followed by the number of bond lines.
      const countsLine = Tokenizer.readLine(tokenizer);
      const atomCount = Number(countsLine.substring(0, 3));
      const bondCount = Number(countsLine.substring(3, 6));

      const atoms = handleAtoms(tokenizer, atomCount);
      const bonds = handleBonds(tokenizer, bondCount);
      const formalCharges = handlePropertiesBlock(tokenizer);

      const molFile: MolFile = { title, program, comment, atoms, bonds, formalCharges };
      return Result.success(molFile);
  }
  /**
   * Creates a task named 'Parse Mol' that parses `data` as a MolFile.
   *
   * @param data Full text of the file.
   * @returns A task whose result is the reader result produced by the parser.
   */
  export function parseMol(data: string) {
      const run = async (): Promise<Result<MolFile>> => parseInternal(data);
      return Task.create<Result<MolFile>>('Parse Mol', run);
  }