parser.ts 7.5 KB


  1. /**
  2. * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author David Sehnal <david.sehnal@gmail.com>
  5. * @author Panagiotis Tourlas <panagiot_tourlov@hotmail.com>
  6. */
  7. import { Column } from '../../../mol-data/db';
  8. import { Task } from '../../../mol-task';
  9. import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
  10. import { TokenBuilder, Tokenizer } from '../common/text/tokenizer';
  11. import { ReaderResult as Result } from '../result';
  12. /** Subset of the MolFile V2000 format */
  13. export interface MolFile {
  14. readonly title: string,
  15. readonly program: string,
  16. readonly comment: string,
  17. readonly atoms: {
  18. readonly count: number,
  19. readonly x: Column<number>,
  20. readonly y: Column<number>,
  21. readonly z: Column<number>,
  22. readonly type_symbol: Column<string>,
  23. readonly formal_charge: Column<number>
  24. },
  25. readonly bonds: {
  26. readonly count: number
  27. readonly atomIdxA: Column<number>,
  28. readonly atomIdxB: Column<number>,
  29. readonly order: Column<number>
  30. }
  31. readonly formalCharges: {
  32. readonly atomIdx: Column<number>;
  33. readonly charge: Column<number>;
  34. }
  35. }
  36. /*
  37. The atom lines in a .mol file have the following structure:
  38. xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
  39. ---------------------------------------------------------------------
Below is a breakdown of each component and its start/end indices
(0-based, end-exclusive, relative to the start of the line):
xxxxx.xxxx (X COORDINATE, 0-10)
  42. yyyyy.yyyy (Y COORDINATE, 10-20)
  43. zzzzz.zzzz (Z COORDINATE, 20-30)
  44. _ (30 IS EMPTY)
  45. aaa (ATOM SYMBOL, 31-34)
  46. dd (MASS DIFF, 34-36)
  47. ccc (FORMAL CHARGE, 36-39)
  48. sss (ATOM STEREO PARITY, 39-42)
  49. hhh (HYDROGEN COUNT+1, 42-45)
  50. bbb (STEREO CARE BOX, 45-48)
  51. vvv (VALENCE, 48-51)
  52. HHH (H0 DESIGNATOR, 51-54)
  53. rrr (UNUSED, 54-57)
  54. iii (UNUSED, 57-60)
  55. mmm (ATOM-ATOM MAPPING NUMBER, 60-63)
  56. nnn (INVERSION/RETENTION FLAG, 63-66)
  57. eee (EXACT CHANGE FLAG, 66-69)
  58. */
  59. export function handleAtoms(tokenizer: Tokenizer, count: number): MolFile['atoms'] {
  60. const x = TokenBuilder.create(tokenizer.data, count * 2);
  61. const y = TokenBuilder.create(tokenizer.data, count * 2);
  62. const z = TokenBuilder.create(tokenizer.data, count * 2);
  63. const type_symbol = TokenBuilder.create(tokenizer.data, count * 2);
  64. const formal_charge = TokenBuilder.create(tokenizer.data, count * 2);
  65. for (let i = 0; i < count; ++i) {
  66. Tokenizer.markLine(tokenizer);
  67. const { tokenStart: s, position } = tokenizer;
  68. Tokenizer.trim(tokenizer, s, s + 10);
  69. TokenBuilder.addUnchecked(x, tokenizer.tokenStart, tokenizer.tokenEnd);
  70. Tokenizer.trim(tokenizer, s + 10, s + 20);
  71. TokenBuilder.addUnchecked(y, tokenizer.tokenStart, tokenizer.tokenEnd);
  72. Tokenizer.trim(tokenizer, s + 20, s + 30);
  73. TokenBuilder.addUnchecked(z, tokenizer.tokenStart, tokenizer.tokenEnd);
  74. Tokenizer.trim(tokenizer, s + 31, s + 34);
  75. TokenBuilder.addUnchecked(type_symbol, tokenizer.tokenStart, tokenizer.tokenEnd);
  76. Tokenizer.trim(tokenizer, s + 36, s + 39);
  77. TokenBuilder.addUnchecked(formal_charge, tokenizer.tokenStart, tokenizer.tokenEnd);
  78. tokenizer.position = position;
  79. }
  80. return {
  81. count,
  82. x: TokenColumn(x)(Column.Schema.float),
  83. y: TokenColumn(y)(Column.Schema.float),
  84. z: TokenColumn(z)(Column.Schema.float),
  85. type_symbol: TokenColumn(type_symbol)(Column.Schema.str),
  86. formal_charge: TokenColumn(formal_charge)(Column.Schema.int)
  87. };
  88. }
  89. export function handleBonds(tokenizer: Tokenizer, count: number): MolFile['bonds'] {
  90. const atomIdxA = TokenBuilder.create(tokenizer.data, count * 2);
  91. const atomIdxB = TokenBuilder.create(tokenizer.data, count * 2);
  92. const order = TokenBuilder.create(tokenizer.data, count * 2);
  93. for (let i = 0; i < count; ++i) {
  94. Tokenizer.markLine(tokenizer);
  95. const { tokenStart: s, position } = tokenizer;
  96. Tokenizer.trim(tokenizer, s, s + 3);
  97. TokenBuilder.addUnchecked(atomIdxA, tokenizer.tokenStart, tokenizer.tokenEnd);
  98. Tokenizer.trim(tokenizer, s + 3, s + 6);
  99. TokenBuilder.addUnchecked(atomIdxB, tokenizer.tokenStart, tokenizer.tokenEnd);
  100. Tokenizer.trim(tokenizer, s + 6, s + 9);
  101. TokenBuilder.addUnchecked(order, tokenizer.tokenStart, tokenizer.tokenEnd);
  102. tokenizer.position = position;
  103. }
  104. return {
  105. count,
  106. atomIdxA: TokenColumn(atomIdxA)(Column.Schema.int),
  107. atomIdxB: TokenColumn(atomIdxB)(Column.Schema.int),
  108. order: TokenColumn(order)(Column.Schema.int)
  109. };
  110. }
  111. export function handleFormalCharges(tokenizer: Tokenizer, count: number): MolFile['formalCharges'] {
  112. const atomIdx = TokenBuilder.create(tokenizer.data, count * 2);
  113. const charge = TokenBuilder.create(tokenizer.data, count * 2);
  114. let i = 0;
  115. while (i < 100) {
  116. /* An attempt to explain what happens.
  117. Once handleFormalCharges() is called, the atom and bond sections have
  118. been parsed. We are now inside the properties block of the file.
  119. Therefore, the "pointer" of the reader is at position 0:
  120. M CHG 1 2 -1
  121. ^
  122. Read the property type (positions 3 to 5):
  123. M CHG 1 2 -1
  124. ___^^^
  125. If it's a charge property (CHG) we'll add it to the list of
  126. formal charges.
  127. We read the characters at positions 12 to 14 (2__),
  128. cleanup the spaces/tabs (2) and assign it to atomIdx property of
  129. the "MolFile" object.
  130. Same for the next triplet at positions 15 to 17.
  131. (-1_) becomes (-1) and is assigned to
  132. charge property of the "MolFile" object.
  133. */
  134. Tokenizer.markLine(tokenizer);
  135. const { tokenStart: s } = tokenizer;
  136. Tokenizer.trim(tokenizer, s + 3, s + 6);
  137. const propertyType = Tokenizer.getTokenString(tokenizer);
  138. if (propertyType === 'CHG') {
  139. Tokenizer.trim(tokenizer, s + 12, s + 15);
  140. TokenBuilder.addUnchecked(atomIdx, tokenizer.tokenStart, tokenizer.tokenEnd);
  141. Tokenizer.trim(tokenizer, s + 15, s + 18);
  142. TokenBuilder.addUnchecked(charge, tokenizer.tokenStart, tokenizer.tokenEnd);
  143. }
  144. if (propertyType === 'END') break;
  145. i++;
  146. }
  147. return {
  148. atomIdx: TokenColumn(atomIdx)(Column.Schema.int),
  149. charge: TokenColumn(charge)(Column.Schema.int),
  150. };
  151. }
  152. function parseInternal(data: string): Result<MolFile> {
  153. const tokenizer = Tokenizer(data);
  154. const title = Tokenizer.readLine(tokenizer).trim();
  155. const program = Tokenizer.readLine(tokenizer).trim();
  156. const comment = Tokenizer.readLine(tokenizer).trim();
  157. const counts = Tokenizer.readLine(tokenizer);
  158. const atomCount = +counts.substr(0, 3), bondCount = +counts.substr(3, 3);
  159. const atoms = handleAtoms(tokenizer, atomCount);
  160. const bonds = handleBonds(tokenizer, bondCount);
  161. const formalCharges = handleFormalCharges(tokenizer, atomCount);
  162. const result: MolFile = {
  163. title,
  164. program,
  165. comment,
  166. atoms,
  167. bonds,
  168. formalCharges,
  169. };
  170. return Result.success(result);
  171. }
  172. export function parseMol(data: string) {
  173. return Task.create<Result<MolFile>>('Parse Mol', async () => {
  174. return parseInternal(data);
  175. });
  176. }