parser.ts 14 KB


  1. /**
  2. * Copyright (c) 2017-2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
  3. *
  4. * @author Zepei Xu <xuzepei19950617@gmail.com>
  5. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  6. */
// NOTES
// When you want to create an undefined string column, you must use
// undefStr = UndefinedColumn(molecule.num_atoms, ColumnType.str)
// and not
// const undefPooledStr = UndefinedColumn(molecule.num_atoms, ColumnType.pooledStr);
// because the latter actually returns a column of zeros.
  13. import { Column } from 'mol-data/db'
  14. import { TokenBuilder, Tokenizer } from '../common/text/tokenizer'
  15. import TokenColumn from '../common/text/column/token'
  16. import * as Schema from './schema'
  17. import { ReaderResult as Result } from '../result'
  18. import { Task, RuntimeContext, chunkedSubtask } from 'mol-task'
  19. const { skipWhitespace, eatValue, markLine, getTokenString, readLine } = Tokenizer;
  20. interface State {
  21. tokenizer: Tokenizer,
  22. molecule: Schema.Mol2Molecule,
  23. runtimeCtx: RuntimeContext
  24. }
  25. function createEmptyMolecule(): Schema.Mol2Molecule {
  26. return {
  27. mol_name: '',
  28. num_atoms: 0,
  29. num_bonds: 0,
  30. num_subst: 0,
  31. num_feat: 0,
  32. num_sets: 0,
  33. mol_type: '',
  34. charge_type: '',
  35. status_bits: '',
  36. mol_comment: ''
  37. };
  38. }
  39. function State(tokenizer: Tokenizer, runtimeCtx: RuntimeContext): State {
  40. return {
  41. tokenizer,
  42. molecule: createEmptyMolecule(),
  43. runtimeCtx
  44. };
  45. }
  46. const reWhitespace = /\s+/g;
  47. function handleMolecule(state: State) {
  48. const { tokenizer, molecule } = state;
  49. while (getTokenString(tokenizer) !== '@<TRIPOS>MOLECULE') {
  50. markLine(tokenizer);
  51. }
  52. markLine(tokenizer);
  53. molecule.mol_name = getTokenString(tokenizer);
  54. markLine(tokenizer);
  55. const values = getTokenString(tokenizer).trim().split(reWhitespace);
  56. molecule.num_atoms = parseInt(values[0]);
  57. molecule.num_bonds = parseInt(values[1]);
  58. molecule.num_subst = parseInt(values[2]);
  59. molecule.num_feat = parseInt(values[3]);
  60. molecule.num_sets = parseInt(values[4]);
  61. markLine(tokenizer);
  62. molecule.mol_type = getTokenString(tokenizer);
  63. markLine(tokenizer);
  64. molecule.charge_type = getTokenString(tokenizer);
  65. markLine(tokenizer);
  66. if (getTokenString(tokenizer) === '') return
  67. molecule.status_bits = getTokenString(tokenizer)
  68. markLine(tokenizer);
  69. if (getTokenString(tokenizer) === '') return
  70. molecule.mol_comment = getTokenString(tokenizer)
  71. }
  72. function isStatus_bit(aString: String): Boolean {
  73. if (aString.includes('DSPMOD') || aString.includes('TYPECOL') || aString.includes('CAP')
  74. || aString.includes('BACKBONE') || aString.includes('DICT') || aString.includes('ESSENTIAL')
  75. || aString.includes('WATER') || aString.includes('DIRECT')) {
  76. return true;
  77. }
  78. return false;
  79. }
  80. async function handleAtoms(state: State): Promise<Schema.Mol2Atoms> {
  81. const { tokenizer, molecule } = state;
  82. let hasSubst_id = false;
  83. let hasSubst_name = false;
  84. let hasCharge = false;
  85. let hasStatus_bit = false;
  86. // skip empty lines and '@<TRIPOS>ATOM'
  87. while (getTokenString(tokenizer) !== '@<TRIPOS>ATOM') {
  88. markLine(tokenizer);
  89. }
  90. const initialTokenizerPosition = tokenizer.position;
  91. const initialTokenizerLineNumber = tokenizer.lineNumber;
  92. const firstLine = readLine(tokenizer);
  93. const firstLineArray = firstLine.trim().split(/\s+/g)
  94. const firstLineLength = firstLineArray.length;
  95. // optional columns are in order "integer string float string".
  96. // Use this to find out which column is missing or empty
  97. for (let i = 6; i < firstLineLength; i++) {
  98. if (!isNaN(Number(firstLineArray[i]))) {
  99. if (firstLineArray[i].indexOf('.') === -1) {
  100. hasSubst_id = true;
  101. } else {
  102. hasCharge = true;
  103. }
  104. } else if (isNaN(Number(firstLineArray[i]))) {
  105. if (!isStatus_bit(firstLineArray[i])) {
  106. hasSubst_name = true;
  107. } else {
  108. hasStatus_bit = true;
  109. }
  110. }
  111. }
  112. // required columns
  113. const atom_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  114. const atom_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  115. const xTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  116. const yTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  117. const zTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  118. const atom_typeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  119. const atom_idTokenColumn = TokenColumn(atom_idTokens);
  120. const atom_nameTokenColumn = TokenColumn(atom_nameTokens);
  121. const xTokenColumn = TokenColumn(xTokens);
  122. const yTokenColumn = TokenColumn(yTokens);
  123. const zTokenColumn = TokenColumn(zTokens);
  124. const atom_typeColumn = TokenColumn(atom_typeTokens);
  125. // optional columns
  126. const subst_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  127. const subst_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  128. const chargeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  129. const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
  130. const subst_idTokenColumn = TokenColumn(subst_idTokens);
  131. const subst_nameTokenColumn = TokenColumn(subst_nameTokens);
  132. const chargeTokenColumn = TokenColumn(chargeTokens);
  133. const status_bitTokenColumn = TokenColumn(status_bitTokens);
  134. const undefFloat = Column.Undefined(molecule.num_atoms, Column.Schema.float);
  135. const undefInt = Column.Undefined(molecule.num_atoms, Column.Schema.int);
  136. const undefStr = Column.Undefined(molecule.num_atoms, Column.Schema.str);
  137. let numOfColumn = 6;
  138. if (hasSubst_id) { numOfColumn++ }
  139. if (hasSubst_name) { numOfColumn++ }
  140. if (hasCharge) { numOfColumn++ }
  141. if (hasStatus_bit) { numOfColumn++ }
  142. tokenizer.position = initialTokenizerPosition;
  143. tokenizer.lineNumber = initialTokenizerLineNumber;
  144. const { length } = tokenizer;
  145. let linesAlreadyRead = 0;
  146. await chunkedSubtask(state.runtimeCtx, 100000, void 0, chunkSize => {
  147. const linesToRead = Math.min(molecule.num_atoms - linesAlreadyRead, chunkSize);
  148. for (let i = 0; i < linesToRead; i++) {
  149. let subst_idWritten = false;
  150. let subst_nameWritten = false;
  151. let chargeWritten = false;
  152. let status_bitWritten = false;
  153. for (let j = 0; j < numOfColumn; j++) {
  154. skipWhitespace(tokenizer);
  155. tokenizer.tokenStart = tokenizer.position;
  156. eatValue(tokenizer);
  157. switch (j) {
  158. case 0:
  159. TokenBuilder.addUnchecked(atom_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  160. break;
  161. case 1:
  162. TokenBuilder.addUnchecked(atom_nameTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  163. break;
  164. case 2:
  165. TokenBuilder.addUnchecked(xTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  166. break;
  167. case 3:
  168. TokenBuilder.addUnchecked(yTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  169. break;
  170. case 4:
  171. TokenBuilder.addUnchecked(zTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  172. break;
  173. case 5:
  174. TokenBuilder.addUnchecked(atom_typeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  175. break;
  176. default:
  177. if (hasSubst_id === true && subst_idWritten === false) {
  178. TokenBuilder.addUnchecked(subst_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  179. subst_idWritten = true;
  180. } else if (hasSubst_name === true && subst_nameWritten === false) {
  181. TokenBuilder.addUnchecked(subst_nameTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  182. subst_nameWritten = true;
  183. } else if (hasCharge === true && chargeWritten === false) {
  184. TokenBuilder.addUnchecked(chargeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  185. chargeWritten = true;
  186. } else if (hasStatus_bit === true && status_bitWritten === false) {
  187. TokenBuilder.addUnchecked(status_bitTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  188. status_bitWritten = true;
  189. }
  190. }
  191. }
  192. }
  193. linesAlreadyRead += linesToRead;
  194. return linesToRead;
  195. }, ctx => ctx.update({ message: 'Parsing...', current: tokenizer.position, max: length }));
  196. const ret = {
  197. count: molecule.num_atoms,
  198. atom_id: atom_idTokenColumn(Column.Schema.int),
  199. atom_name: atom_nameTokenColumn(Column.Schema.str),
  200. x: xTokenColumn(Column.Schema.float),
  201. y: yTokenColumn(Column.Schema.float),
  202. z: zTokenColumn(Column.Schema.float),
  203. atom_type: atom_typeColumn(Column.Schema.str),
  204. // optional columns
  205. subst_id: hasSubst_id ? subst_idTokenColumn(Column.Schema.int) : undefInt,
  206. subst_name: hasSubst_name ? subst_nameTokenColumn(Column.Schema.str) : undefStr,
  207. charge: hasCharge ? chargeTokenColumn(Column.Schema.float) : undefFloat,
  208. status_bit: hasStatus_bit ? status_bitTokenColumn(Column.Schema.str) : undefStr,
  209. };
  210. return ret;
  211. }
  212. async function handleBonds(state: State): Promise<Schema.Mol2Bonds> {
  213. const { tokenizer, molecule } = state;
  214. let hasStatus_bit = false;
  215. while (getTokenString(tokenizer) !== '@<TRIPOS>BOND') {
  216. markLine(tokenizer);
  217. }
  218. const initialTokenizerPosition = tokenizer.position;
  219. const initialTokenizerLineNumber = tokenizer.lineNumber;
  220. const firstLine = readLine(tokenizer);
  221. const firstLineArray = firstLine.trim().split(/\s+/g)
  222. const firstLineLength = firstLineArray.length;
  223. if (firstLineLength === 5) {
  224. hasStatus_bit = true;
  225. }
  226. // required columns
  227. const bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
  228. const origin_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
  229. const target_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
  230. const bondTypeTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
  231. const bond_idTokenColumn = TokenColumn(bond_idTokens);
  232. const origin_bond_idTokenColumn = TokenColumn(origin_bond_idTokens);
  233. const target_bond_idTokenColumn = TokenColumn(target_bond_idTokens);
  234. const bondTypeTokenColumn = TokenColumn(bondTypeTokens);
  235. // optional columns
  236. const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
  237. const status_bitTokenColumn = TokenColumn(status_bitTokens);
  238. const undefStr = Column.Undefined(molecule.num_bonds, Column.Schema.str);
  239. let numberOfColumn = 4;
  240. if (hasStatus_bit) { numberOfColumn++ }
  241. tokenizer.position = initialTokenizerPosition;
  242. tokenizer.lineNumber = initialTokenizerLineNumber;
  243. const { length } = tokenizer;
  244. let linesAlreadyRead = 0;
  245. await chunkedSubtask(state.runtimeCtx, 100000, void 0, chunkSize => {
  246. const linesToRead = Math.min(molecule.num_bonds - linesAlreadyRead, chunkSize);
  247. for (let i = 0; i < linesToRead; i++) {
  248. for (let j = 0; j < numberOfColumn; j++) {
  249. skipWhitespace(tokenizer);
  250. tokenizer.tokenStart = tokenizer.position;
  251. eatValue(tokenizer);
  252. switch (j) {
  253. case 0:
  254. TokenBuilder.addUnchecked(bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  255. break;
  256. case 1:
  257. TokenBuilder.addUnchecked(origin_bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  258. break;
  259. case 2:
  260. TokenBuilder.addUnchecked(target_bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  261. break;
  262. case 3:
  263. TokenBuilder.addUnchecked(bondTypeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  264. break;
  265. default:
  266. TokenBuilder.addUnchecked(status_bitTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  267. break;
  268. }
  269. }
  270. }
  271. linesAlreadyRead += linesToRead;
  272. return linesToRead;
  273. }, ctx => ctx.update({ message: 'Parsing...', current: tokenizer.position, max: length }));
  274. const ret = {
  275. count: molecule.num_bonds,
  276. bond_id: bond_idTokenColumn(Column.Schema.int),
  277. origin_atom_id: origin_bond_idTokenColumn(Column.Schema.int),
  278. target_atom_id: target_bond_idTokenColumn(Column.Schema.int),
  279. bond_type: bondTypeTokenColumn(Column.Schema.str),
  280. // optional columns
  281. status_bits: hasStatus_bit ? status_bitTokenColumn(Column.Schema.str) : undefStr,
  282. };
  283. return ret;
  284. }
  285. async function parseInternal(data: string, ctx: RuntimeContext): Promise<Result<Schema.Mol2File>> {
  286. const tokenizer = Tokenizer(data);
  287. ctx.update({ message: 'Parsing...', current: 0, max: data.length });
  288. const structures: Schema.Mol2Structure[] = [];
  289. while (tokenizer.position < data.length) {
  290. const state = State(tokenizer, ctx);
  291. handleMolecule(state);
  292. const atoms = await handleAtoms(state);
  293. const bonds = await handleBonds(state);
  294. structures.push({ molecule: state.molecule, atoms, bonds });
  295. }
  296. const result: Schema.Mol2File = { structures };
  297. return Result.success(result);
  298. }
  299. export function parse(data: string) {
  300. return Task.create<Result<Schema.Mol2File>>('Parse MOL2', async ctx => {
  301. return await parseInternal(data, ctx);
  302. });
  303. }
  304. export default parse;