create-table.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. #!/usr/bin/env node
  2. /**
  3. * Copyright (c) 2018-2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
  4. *
  5. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  6. */
  7. import * as argparse from 'argparse';
  8. import * as util from 'util';
  9. import * as path from 'path';
  10. import * as fs from 'fs';
  11. require('util.promisify').shim();
  12. const writeFile = util.promisify(fs.writeFile);
  13. import { Database, Table, DatabaseCollection } from '../../mol-data/db';
  14. import { CCD_Schema } from '../../mol-io/reader/cif/schema/ccd';
  15. import { SetUtils } from '../../mol-util/set';
  16. import { DefaultMap } from '../../mol-util/map';
  17. import { mmCIF_chemCompBond_schema } from '../../mol-io/reader/cif/schema/mmcif-extras';
  18. import { ccd_chemCompAtom_schema } from '../../mol-io/reader/cif/schema/ccd-extras';
  19. import { DefaultDataOptions, ensureDataAvailable, getEncodedCif, readCCD, readPVCD } from './util';
  20. type CCB = Table<CCD_Schema['chem_comp_bond']>
  21. type CCA = Table<CCD_Schema['chem_comp_atom']>
  22. function ccbKey(compId: string, atomId1: string, atomId2: string) {
  23. return atomId1 < atomId2 ? `${compId}:${atomId1}-${atomId2}` : `${compId}:${atomId2}-${atomId1}`;
  24. }
  25. function ccaKey(compId: string, atomId: string) {
  26. return `${compId}:${atomId}`;
  27. }
  28. function addChemCompBondToSet(set: Set<string>, ccb: CCB) {
  29. for (let i = 0, il = ccb._rowCount; i < il; ++i) {
  30. set.add(ccbKey(ccb.comp_id.value(i), ccb.atom_id_1.value(i), ccb.atom_id_2.value(i)));
  31. }
  32. return set;
  33. }
  34. function addChemCompAtomToSet(set: Set<string>, cca: CCA) {
  35. for (let i = 0, il = cca._rowCount; i < il; ++i) {
  36. set.add(ccaKey(cca.comp_id.value(i), cca.atom_id.value(i)));
  37. }
  38. return set;
  39. }
  40. function checkAddingBondsFromPVCD(pvcd: DatabaseCollection<CCD_Schema>) {
  41. const ccbSetByParent = DefaultMap<string, Set<string>>(() => new Set());
  42. for (const k in pvcd) {
  43. const { chem_comp, chem_comp_bond } = pvcd[k];
  44. if (chem_comp_bond._rowCount) {
  45. const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
  46. if (parentIds.length === 0) {
  47. const set = ccbSetByParent.getDefault(chem_comp.id.value(0));
  48. addChemCompBondToSet(set, chem_comp_bond);
  49. } else {
  50. for (let i = 0, il = parentIds.length; i < il; ++i) {
  51. const parentId = parentIds[i];
  52. const set = ccbSetByParent.getDefault(parentId);
  53. addChemCompBondToSet(set, chem_comp_bond);
  54. }
  55. }
  56. }
  57. }
  58. for (const k in pvcd) {
  59. const { chem_comp, chem_comp_atom, chem_comp_bond } = pvcd[k];
  60. if (chem_comp_bond._rowCount) {
  61. const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
  62. if (parentIds.length > 0) {
  63. for (let i = 0, il = parentIds.length; i < il; ++i) {
  64. const entryBonds = addChemCompBondToSet(new Set<string>(), chem_comp_bond);
  65. const entryAtoms = addChemCompAtomToSet(new Set<string>(), chem_comp_atom);
  66. const extraBonds = SetUtils.difference(ccbSetByParent.get(parentIds[i])!, entryBonds);
  67. extraBonds.forEach(bk => {
  68. const [a1, a2] = bk.split('|');
  69. if (entryAtoms.has(a1) && entryAtoms.has(a2)) {
  70. console.error(`Adding all PVCD bonds would wrongly add bond ${bk} for ${k}`);
  71. }
  72. });
  73. }
  74. }
  75. }
  76. }
  77. }
  78. function checkAddingAtomsFromPVCD(pvcd: DatabaseCollection<CCD_Schema>) {
  79. const ccaSetByParent = DefaultMap<string, Set<string>>(() => new Set());
  80. for (const k in pvcd) {
  81. const { chem_comp, chem_comp_atom } = pvcd[k];
  82. if (chem_comp_atom._rowCount) {
  83. const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
  84. if (parentIds.length === 0) {
  85. const set = ccaSetByParent.getDefault(chem_comp.id.value(0));
  86. addChemCompAtomToSet(set, chem_comp_atom);
  87. } else {
  88. for (let i = 0, il = parentIds.length; i < il; ++i) {
  89. const parentId = parentIds[i];
  90. const set = ccaSetByParent.getDefault(parentId);
  91. addChemCompAtomToSet(set, chem_comp_atom);
  92. }
  93. }
  94. }
  95. }
  96. }
  97. async function createBonds(
  98. ccd: DatabaseCollection<CCD_Schema>,
  99. pvcd: DatabaseCollection<CCD_Schema>,
  100. atomsRequested: boolean
  101. ) {
  102. const ccbSet = new Set<string>();
  103. const comp_id: string[] = [];
  104. const atom_id_1: string[] = [];
  105. const atom_id_2: string[] = [];
  106. const value_order: typeof mmCIF_chemCompBond_schema['value_order']['T'][] = [];
  107. const pdbx_aromatic_flag: typeof mmCIF_chemCompBond_schema['pdbx_aromatic_flag']['T'][] = [];
  108. const pdbx_stereo_config: typeof mmCIF_chemCompBond_schema['pdbx_stereo_config']['T'][] = [];
  109. const molstar_protonation_variant: string[] = [];
  110. function addBonds(compId: string, ccb: CCB, protonationVariant: boolean) {
  111. for (let i = 0, il = ccb._rowCount; i < il; ++i) {
  112. const atomId1 = ccb.atom_id_1.value(i);
  113. const atomId2 = ccb.atom_id_2.value(i);
  114. const k = ccbKey(compId, atomId1, atomId2);
  115. if (!ccbSet.has(k)) {
  116. atom_id_1.push(atomId1);
  117. atom_id_2.push(atomId2);
  118. comp_id.push(compId);
  119. value_order.push(ccb.value_order.value(i));
  120. pdbx_aromatic_flag.push(ccb.pdbx_aromatic_flag.value(i));
  121. pdbx_stereo_config.push(ccb.pdbx_stereo_config.value(i));
  122. molstar_protonation_variant.push(protonationVariant ? 'Y' : 'N');
  123. ccbSet.add(k);
  124. }
  125. }
  126. }
  127. // check adding bonds from PVCD
  128. checkAddingBondsFromPVCD(pvcd);
  129. // add bonds from PVCD
  130. for (const k in pvcd) {
  131. const { chem_comp, chem_comp_bond } = pvcd[k];
  132. if (chem_comp_bond._rowCount) {
  133. const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
  134. if (parentIds.length === 0) {
  135. addBonds(chem_comp.id.value(0), chem_comp_bond, false);
  136. } else {
  137. for (let i = 0, il = parentIds.length; i < il; ++i) {
  138. addBonds(parentIds[i], chem_comp_bond, true);
  139. }
  140. }
  141. }
  142. }
  143. // add bonds from CCD
  144. for (const k in ccd) {
  145. const { chem_comp, chem_comp_bond } = ccd[k];
  146. if (chem_comp_bond._rowCount) {
  147. addBonds(chem_comp.id.value(0), chem_comp_bond, false);
  148. }
  149. }
  150. const bondTable = Table.ofArrays(mmCIF_chemCompBond_schema, {
  151. comp_id, atom_id_1, atom_id_2, value_order,
  152. pdbx_aromatic_flag, pdbx_stereo_config, molstar_protonation_variant
  153. });
  154. const bondDatabase = Database.ofTables(
  155. CCB_TABLE_NAME,
  156. { chem_comp_bond: mmCIF_chemCompBond_schema },
  157. { chem_comp_bond: bondTable }
  158. );
  159. return { bonds: bondDatabase, atoms: atomsRequested ? createAtoms(ccd, pvcd) : void 0 };
  160. }
  161. function createAtoms(ccd: DatabaseCollection<CCD_Schema>, pvcd: DatabaseCollection<CCD_Schema>) {
  162. const ccaSet = new Set<string>();
  163. const comp_id: string[] = [];
  164. const atom_id: string[] = [];
  165. const charge: number[] = [];
  166. const pdbx_stereo_config: typeof CCD_Schema.chem_comp_atom['pdbx_stereo_config']['T'][] = [];
  167. function addAtoms(compId: string, cca: CCA) {
  168. for (let i = 0, il = cca._rowCount; i < il; ++i) {
  169. const atomId = cca.atom_id.value(i);
  170. const k = ccaKey(compId, atomId);
  171. if (!ccaSet.has(k)) {
  172. atom_id.push(atomId);
  173. comp_id.push(compId);
  174. charge.push(cca.charge.value(i));
  175. pdbx_stereo_config.push(cca.pdbx_stereo_config.value(i));
  176. ccaSet.add(k);
  177. }
  178. }
  179. }
  180. // check adding atoms from PVCD
  181. checkAddingAtomsFromPVCD(pvcd);
  182. // add atoms from PVCD
  183. for (const k in pvcd) {
  184. const { chem_comp, chem_comp_atom } = pvcd[k];
  185. if (chem_comp_atom._rowCount) {
  186. const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
  187. if (parentIds.length === 0) {
  188. addAtoms(chem_comp.id.value(0), chem_comp_atom);
  189. } else {
  190. for (let i = 0, il = parentIds.length; i < il; ++i) {
  191. addAtoms(parentIds[i], chem_comp_atom);
  192. }
  193. }
  194. }
  195. }
  196. // add atoms from CCD
  197. for (const k in ccd) {
  198. const { chem_comp, chem_comp_atom } = ccd[k];
  199. if (chem_comp_atom._rowCount) {
  200. addAtoms(chem_comp.id.value(0), chem_comp_atom);
  201. }
  202. }
  203. const atomTable = Table.ofArrays(ccd_chemCompAtom_schema, {
  204. comp_id, atom_id, charge, pdbx_stereo_config
  205. });
  206. return Database.ofTables(
  207. CCA_TABLE_NAME,
  208. { chem_comp_atom: ccd_chemCompAtom_schema },
  209. { chem_comp_atom: atomTable }
  210. );
  211. }
  212. async function run(out: string, binary = false, options = DefaultDataOptions, ccaOut?: string) {
  213. await ensureDataAvailable(options);
  214. const ccd = await readCCD();
  215. const pvcd = await readPVCD();
  216. const { bonds, atoms } = await createBonds(ccd, pvcd, !!ccaOut);
  217. const ccbCif = getEncodedCif(CCB_TABLE_NAME, bonds, binary);
  218. if (!fs.existsSync(path.dirname(out))) {
  219. fs.mkdirSync(path.dirname(out));
  220. }
  221. writeFile(out, ccbCif);
  222. if (!!ccaOut) {
  223. const ccaCif = getEncodedCif(CCA_TABLE_NAME, atoms, binary);
  224. if (!fs.existsSync(path.dirname(ccaOut))) {
  225. fs.mkdirSync(path.dirname(ccaOut));
  226. }
  227. writeFile(ccaOut, ccaCif);
  228. }
  229. }
  230. const CCB_TABLE_NAME = 'CHEM_COMP_BONDS';
  231. const CCA_TABLE_NAME = 'CHEM_COMP_ATOMS';
  232. const parser = new argparse.ArgumentParser({
  233. add_help: true,
  234. description: 'Create a cif file with one big table of all chem_comp_bond entries from the CCD and PVCD.'
  235. });
  236. parser.add_argument('out', {
  237. help: 'Generated file output path.'
  238. });
  239. parser.add_argument('--forceDownload', '-f', {
  240. action: 'store_true',
  241. help: 'Force download of CCD and PVCD.'
  242. });
  243. parser.add_argument('--binary', '-b', {
  244. action: 'store_true',
  245. help: 'Output as BinaryCIF.'
  246. });
  247. parser.add_argument('--ccaOut', '-a', {
  248. help: 'Optional generated file output path for chem_comp_atom data.',
  249. required: false
  250. });
  251. parser.add_argument('--ccdUrl', '-a', {
  252. help: 'Fetch the CCD from a custom URL. This forces download of the CCD.',
  253. required: false
  254. });
  255. parser.add_argument('--pvcdUrl', '-a', {
  256. help: 'Fetch the PVCD from a custom URL. This forces download of the PVCD.',
  257. required: false
  258. });
  259. interface Args {
  260. out: string,
  261. forceDownload?: boolean,
  262. binary?: boolean,
  263. ccaOut?: string,
  264. ccdUrl?: string,
  265. pvcdUrl?: string
  266. }
  267. const args: Args = parser.parse_args();
  268. run(args.out, args.binary, { forceDownload: args.forceDownload, ccdUrl: args.ccdUrl, pvcdUrl: args.pvcdUrl }, args.ccaOut);