parser.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
// NOTES
// To create an undefined string column, use
//   undefStr = UndefinedColumn(molecule.num_atoms, ColumnType.str)
// and not
//   const undefPooledStr = UndefinedColumn(molecule.num_atoms, ColumnType.pooledStr);
// because the latter actually returns a column of zeros.
  7. import { Column } from 'mol-data/db'
  8. import { TokenBuilder, Tokenizer } from '../common/text/tokenizer'
  9. import TokenColumn from '../common/text/column/token'
  10. import * as Schema from './schema'
  11. import Result from '../result'
  12. import Computation from 'mol-util/computation'
  13. interface State {
  14. tokenizer: Tokenizer,
  15. molecule: Schema.Molecule,
  16. chunker: Computation.Chunker
  17. }
  18. function createEmptyMolecule(): Schema.Molecule {
  19. return {
  20. mol_name: '',
  21. num_atoms: 0,
  22. num_bonds: 0,
  23. num_subst: 0,
  24. num_feat: 0,
  25. num_sets: 0,
  26. mol_type: '',
  27. charge_type: '',
  28. status_bits:'',
  29. mol_comment: ''
  30. };
  31. }
  32. function State(tokenizer: Tokenizer, ctx: Computation.Context): State {
  33. return {
  34. tokenizer,
  35. molecule: createEmptyMolecule(),
  36. chunker: Computation.chunker(ctx, 100000)
  37. };
  38. }
  39. function handleMolecule(state: State) {
  40. const { tokenizer, molecule } = state;
  41. Tokenizer.markLine(tokenizer);
  42. Tokenizer.markLine(tokenizer);
  43. let name = Tokenizer.getTokenString(tokenizer);
  44. molecule.mol_name = name;
  45. Tokenizer.markLine(tokenizer);
  46. const values = Tokenizer.getTokenString(tokenizer).trim().split(/\s+/g);
  47. molecule.num_atoms = parseInt(values[0]) ? parseInt(values[1]) : 0;
  48. molecule.num_bonds = parseInt(values[1]) ? parseInt(values[1]) : 0;
  49. molecule.num_subst = parseInt(values[2]) ? parseInt(values[1]) : 0;
  50. molecule.num_feat = parseInt(values[3]) ? parseInt(values[1]) : 0;
  51. molecule.num_sets = parseInt(values[4]) ? parseInt(values[1]) : 0;
  52. Tokenizer.markLine(tokenizer);
  53. molecule.mol_type = Tokenizer.getTokenString(tokenizer);
  54. Tokenizer.markLine(tokenizer);
  55. molecule.charge_type = Tokenizer.getTokenString(tokenizer);
  56. Tokenizer.markLine(tokenizer);
  57. if(Tokenizer.getTokenString(tokenizer) == ''){return}
  58. else{molecule.status_bits = Tokenizer.getTokenString(tokenizer)}
  59. Tokenizer.markLine(tokenizer);
  60. if(Tokenizer.getTokenString(tokenizer) == ''){return}
  61. else{molecule.mol_comment = Tokenizer.getTokenString(tokenizer)}
  62. }
  63. function isStatus_bit(aString: String): Boolean{
  64. if(aString.includes('DSPMOD') || aString.includes('TYPECOL') || aString.includes('CAP')
  65. || aString.includes('BACKBONE') || aString.includes('DICT') || aString.includes('ESSENTIAL')
  66. || aString.includes('WATER') || aString.includes('DIRECT')){
  67. return true;
  68. }
  69. return false;
  70. }
  71. async function handleAtoms(state: State): Promise<Schema.Atoms> {
  72. const { tokenizer, molecule } = state;
  73. let hasSubst_id = false;
  74. let hasSubst_name = false;
  75. let hasCharge = false;
  76. let hasStatus_bit = false;
  77. // skip empty lines and '@<TRIPOS>ATOM'
  78. while(Tokenizer.getTokenString(tokenizer) != '@<TRIPOS>ATOM'){
  79. Tokenizer.markLine(tokenizer);
  80. }
  81. const initialTokenizerPosition = tokenizer.position;
  82. const initialTOkenizerLineNumber = tokenizer.lineNumber;
  83. const firstLine = Tokenizer.readLine(tokenizer);
  84. const firstLineArray = firstLine.trim().split(/\s+/g)
  85. const firstLineLength = firstLineArray.length;
  86. // optionals are in order "integer string float string". Use this to find out which column is missing or empty
  87. for(let i = 6; i < firstLineLength; i++){
  88. if(!isNaN(Number(firstLineArray[i]))){
  89. if(firstLineArray[i].indexOf('.') == -1){
  90. hasSubst_id = true;
  91. }else{
  92. hasCharge = true;
  93. }
  94. }else if(isNaN(Number(firstLineArray[i]))){
  95. if(!isStatus_bit(firstLineArray[i])){
  96. hasSubst_name = true;
  97. }else{
  98. hasStatus_bit = true;
  99. }
  100. }
  101. }
  102. const atom_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  103. const atom_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);;
  104. const xTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  105. const yTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  106. const zTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  107. const atom_typeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  108. // optionals
  109. const subst_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  110. const subst_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  111. const chargeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  112. const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
  113. const atom_idTokenColumn = TokenColumn(atom_idTokens);
  114. const atom_nameTokenColumn = TokenColumn(atom_nameTokens);
  115. const xTokenColumn = TokenColumn(xTokens);
  116. const yTokenColumn = TokenColumn(yTokens);
  117. const zTokenColumn = TokenColumn(zTokens);
  118. const atom_typeColumn = TokenColumn(atom_typeTokens);
  119. // optionals
  120. const subst_idTokenColumn = TokenColumn(subst_idTokens);
  121. const subst_nameTokenColumn = TokenColumn(subst_nameTokens);
  122. const chargeTokenColumn = TokenColumn(chargeTokens);
  123. const status_bitTokenColumn = TokenColumn(status_bitTokens);
  124. const undefFloat = Column.Undefined(molecule.num_atoms, Column.Schema.float);
  125. const undefInt = Column.Undefined(molecule.num_atoms, Column.Schema.int);
  126. const undefStr = Column.Undefined(molecule.num_atoms, Column.Schema.str);
  127. let numOfColumn = 5;
  128. if(hasSubst_id){numOfColumn++}
  129. if(hasSubst_name){numOfColumn++}
  130. if(hasCharge){numOfColumn++}
  131. if(hasStatus_bit){numOfColumn++}
  132. tokenizer.position = initialTokenizerPosition;
  133. tokenizer.lineNumber = initialTOkenizerLineNumber;
  134. for(let i = 0; i < molecule.num_atoms; i++){
  135. let subst_idWritten = false;
  136. let subst_nameWritten = false;
  137. let chargeWritten = false;
  138. let status_bitWritten = false;
  139. for(let j = 0; j < numOfColumn; j++){
  140. Tokenizer.skipWhitespace(tokenizer);
  141. Tokenizer.eatValue(tokenizer);
  142. switch(j){
  143. case 0:
  144. TokenBuilder.addUnchecked(atom_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  145. case 1:
  146. TokenBuilder.addUnchecked(atom_nameTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  147. case 2:
  148. TokenBuilder.addUnchecked(xTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  149. case 3:
  150. TokenBuilder.addUnchecked(yTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  151. case 4:
  152. TokenBuilder.addUnchecked(zTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  153. case 5:
  154. TokenBuilder.addUnchecked(atom_typeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  155. default:
  156. if(hasSubst_id == true && subst_idWritten == false){
  157. TokenBuilder.addUnchecked(subst_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  158. subst_idWritten = true;
  159. }else if(hasSubst_name == true && subst_nameWritten == false){
  160. TokenBuilder.addUnchecked(subst_nameTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  161. subst_nameWritten = true;
  162. }else if(hasCharge == true && chargeWritten == false){
  163. TokenBuilder.addUnchecked(chargeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  164. chargeWritten = true;
  165. }else if(hasStatus_bit == true && status_bitWritten == false){
  166. TokenBuilder.addUnchecked(status_bitTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  167. status_bitWritten = true;
  168. }
  169. }
  170. }
  171. }
  172. const ret = {
  173. count: molecule.num_atoms,
  174. atom_id: atom_idTokenColumn(Column.Schema.int),
  175. atom_name: atom_nameTokenColumn(Column.Schema.str),
  176. x: xTokenColumn(Column.Schema.float),
  177. y: yTokenColumn(Column.Schema.float),
  178. z: zTokenColumn(Column.Schema.float),
  179. atom_type: atom_typeColumn(Column.Schema.str),
  180. // optional properties
  181. subst_id: hasSubst_id ? subst_idTokenColumn(Column.Schema.int) : undefInt,
  182. subst_name: hasSubst_name ? subst_nameTokenColumn(Column.Schema.str) : undefStr,
  183. charge: hasCharge ? chargeTokenColumn(Column.Schema.float) : undefFloat,
  184. status_bit: hasStatus_bit ? status_bitTokenColumn(Column.Schema.str) : undefStr,
  185. };
  186. return ret;
  187. }
  188. async function handleBonds(state: State): Promise<Schema.Bonds> {
  189. const { tokenizer, molecule } = state;
  190. let hasStatus_bit = false;
  191. while(Tokenizer.getTokenString(tokenizer) != '@<TRIPOS>BOND'){
  192. Tokenizer.markLine(tokenizer);
  193. }
  194. const initialTokenizerPosition = tokenizer.position;
  195. const initialTokenizerLineNumber = tokenizer.lineNumber;
  196. const firstLine = Tokenizer.readLine(tokenizer);
  197. const firstLineArray = firstLine.trim().split(/\s+/g)
  198. const firstLineLength = firstLineArray.length;
  199. if(firstLineLength == 5){
  200. hasStatus_bit = true;
  201. }
  202. const bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
  203. const origin_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
  204. const target_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
  205. const bondTypeTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
  206. // optional
  207. const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
  208. const bond_idTokenColumn = TokenColumn(bond_idTokens);
  209. const origin_bond_idTokenColumn = TokenColumn(origin_bond_idTokens);
  210. const target_bond_idTokenColumn = TokenColumn(target_bond_idTokens);
  211. const bondTypeTokenColumn = TokenColumn(bondTypeTokens);
  212. // optional
  213. const status_bitTokenColumn = TokenColumn(status_bitTokens);
  214. const undefStr = Column.Undefined(molecule.num_bonds, Column.Schema.str);
  215. let numberOfColumn = 4;
  216. if(hasStatus_bit){numberOfColumn++}
  217. tokenizer.position = initialTokenizerPosition;
  218. tokenizer.lineNumber = initialTokenizerLineNumber;
  219. for(let i = 0; i < molecule.num_bonds; i++){
  220. for(let j = 0; j < numberOfColumn; j++){
  221. Tokenizer.skipWhitespace(tokenizer);
  222. Tokenizer.eatValue(tokenizer);
  223. switch(j){
  224. case 0:
  225. TokenBuilder.addUnchecked(bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  226. case 1:
  227. TokenBuilder.addUnchecked(origin_bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  228. case 2:
  229. TokenBuilder.addUnchecked(target_bond_idTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  230. case 3:
  231. TokenBuilder.addUnchecked(bondTypeTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  232. default:
  233. TokenBuilder.addUnchecked(status_bitTokens, tokenizer.tokenStart, tokenizer.tokenEnd);
  234. }
  235. }
  236. }
  237. const ret = {
  238. count: molecule.num_bonds,
  239. bond_id: bond_idTokenColumn(Column.Schema.int),
  240. origin_atom_id: origin_bond_idTokenColumn(Column.Schema.int),
  241. target_atom_id: target_bond_idTokenColumn(Column.Schema.int),
  242. bond_type: bondTypeTokenColumn(Column.Schema.str),
  243. status_bits: hasStatus_bit ? status_bitTokenColumn(Column.Schema.str) : undefStr,
  244. };
  245. return ret;
  246. }
  247. async function parseInternal(data: string, ctx: Computation.Context): Promise<Result<Schema.File>> {
  248. const tokenizer = Tokenizer(data);
  249. ctx.update({ message: 'Parsing...', current: 0, max: data.length });
  250. const structures: Schema.Structure[] = [];
  251. while (tokenizer.position < data.length) {
  252. const state = State(tokenizer, ctx);
  253. handleMolecule(state);
  254. const atoms = await handleAtoms(state);
  255. const bonds = await handleBonds(state);
  256. structures.push({ molecule: state.molecule, atoms, bonds });
  257. }
  258. const result: Schema.File = { structures };
  259. return Result.success(result);
  260. }
  261. export function parse(data: string) {
  262. return Computation.create<Result<Schema.File>>(async ctx => {
  263. return await parseInternal(data, ctx);
  264. });
  265. }
  266. export default parse;