create-table.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. #!/usr/bin/env node
  2. /**
  3. * Copyright (c) 2018-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
  4. *
  5. * @author Alexander Rose <alexander.rose@weirdbyte.de>
  6. */
  7. import * as argparse from 'argparse';
  8. import * as util from 'util';
  9. import * as path from 'path';
  10. import * as fs from 'fs';
  11. import * as zlib from 'zlib';
  12. import fetch from 'node-fetch';
  13. require('util.promisify').shim();
  14. const readFile = util.promisify(fs.readFile);
  15. const writeFile = util.promisify(fs.writeFile);
  16. import { Progress } from '../../mol-task';
  17. import { Database, Table, DatabaseCollection } from '../../mol-data/db';
  18. import { CIF } from '../../mol-io/reader/cif';
  19. import { CifWriter } from '../../mol-io/writer/cif';
  20. import { CCD_Schema } from '../../mol-io/reader/cif/schema/ccd';
  21. import { SetUtils } from '../../mol-util/set';
  22. import { DefaultMap } from '../../mol-util/map';
  23. import { mmCIF_chemCompBond_schema } from '../../mol-io/reader/cif/schema/mmcif-extras';
  24. import { ccd_chemCompAtom_schema } from '../../mol-io/reader/cif/schema/ccd-extras';
  /**
   * Makes sure a local copy of `url` exists at `path`.
   *
   * The download is skipped when the file already exists, unless
   * `FORCE_DOWNLOAD` is set. URLs ending in `.gz` are gunzipped before being
   * written; everything else is written as text.
   *
   * Note that the directory created on demand is `DATA_DIR`, not the parent
   * directory of `path`.
   *
   * @param path Destination file on disk.
   * @param url Remote location to fetch from.
   * @throws Error when the server answers with a non-2xx status, so that an
   *         error page is never cached as if it were the requested data.
   */
  export async function ensureAvailable(path: string, url: string) {
      if (!FORCE_DOWNLOAD && fs.existsSync(path)) return;

      console.log(`downloading ${url}...`);
      const data = await fetch(url);
      // Without this check a 404/500 body would be stored at `path` and
      // silently reused by every later run because the file then "exists".
      if (!data.ok) {
          throw new Error(`failed to download ${url}: ${data.status} ${data.statusText}`);
      }
      if (!fs.existsSync(DATA_DIR)) {
          fs.mkdirSync(DATA_DIR);
      }
      if (url.endsWith('.gz')) {
          await writeFile(path, zlib.gunzipSync(await data.buffer()));
      } else {
          await writeFile(path, await data.text());
      }
      console.log(`done downloading ${url}`);
  }
  /**
   * Ensures both input files (CCD and PVCD) are present locally, fetching
   * them one after the other when needed.
   */
  export async function ensureDataAvailable() {
      const sources: [string, string][] = [
          [CCD_PATH, CCD_URL],
          [PVCD_PATH, PVCD_URL]
      ];
      for (let i = 0; i < sources.length; ++i) {
          const [target, url] = sources[i];
          await ensureAvailable(target, url);
      }
  }
  /**
   * Reads the CIF file at `path` as UTF-8 text, parses it and converts the
   * parse result into a database collection for `schema`.
   *
   * @param path File to read.
   * @param schema Schema handed to `CIF.toDatabaseCollection`.
   */
  export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) {
      const text = await readFile(path, 'utf8');
      const { result } = await parseCif(text);
      return CIF.toDatabaseCollection(schema, result);
  }
  /** Loads the file at `CCD_PATH` as a collection using `CCD_Schema`. */
  export async function readCCD() {
      const ccd = await readFileAsCollection(CCD_PATH, CCD_Schema);
      return ccd;
  }
  /** Loads the file at `PVCD_PATH` as a collection using `CCD_Schema`. */
  export async function readPVCD() {
      const pvcd = await readFileAsCollection(PVCD_PATH, CCD_Schema);
      return pvcd;
  }
  /**
   * Runs the CIF parser over `data`, logging progress and the elapsed time
   * (timer label 'parse cif') to the console.
   *
   * @param data CIF content, either text or bytes.
   * @returns The parse outcome when it is not flagged as an error.
   * @throws The parse outcome itself when its `isError` flag is set.
   */
  async function parseCif(data: string | Uint8Array) {
      const task = CIF.parse(data);

      console.time('parse cif');
      const outcome = await task.run(p => console.log(Progress.format(p)), 250);
      console.timeEnd('parse cif');

      if (outcome.isError) {
          throw outcome;
      }
      return outcome;
  }
  /**
   * Serializes `database` under the given `name` with a CIF encoder.
   *
   * @param name Name passed to the writer for the database.
   * @param database Database to serialize.
   * @param binary Forwarded as the encoder's `binary` option; defaults to `false`.
   * @returns Whatever the encoder's `getData()` yields.
   */
  export function getEncodedCif(name: string, database: Database<Database.Schema>, binary = false) {
      const cifEncoder = CifWriter.createEncoder({
          binary,
          encoderName: 'mol*'
      });
      CifWriter.Encoder.writeDatabase(cifEncoder, name, database);
      return cifEncoder.getData();
  }
  /** Table type for the `chem_comp_bond` category of `CCD_Schema`. */
  type CCB = Table<CCD_Schema['chem_comp_bond']>;
  /** Table type for the `chem_comp_atom` category of `CCD_Schema`. */
  type CCA = Table<CCD_Schema['chem_comp_atom']>;
  /**
   * Builds an order-independent key for a bond: `<compId>:<a>-<b>`, where
   * `a` and `b` are the two atom ids with the smaller string first, so
   * swapping the atom arguments yields the same key.
   */
  function ccbKey(compId: string, atomId1: string, atomId2: string) {
      let first = atomId2;
      let second = atomId1;
      if (atomId1 < atomId2) {
          first = atomId1;
          second = atomId2;
      }
      return `${compId}:${first}-${second}`;
  }
  /**
   * Adds one `ccbKey` per row of `ccb` to `set` and returns `set`.
   *
   * @param set Set that receives the keys (mutated in place).
   * @param ccb Bond table to read `atom_id_1` / `atom_id_2` (and `comp_id`) from.
   * @param compId Optional component id to use for every key instead of the
   *               row's own `comp_id`. When omitted the row value is used,
   *               which is the previous behavior.
   */
  function addChemCompBondToSet(set: Set<string>, ccb: CCB, compId?: string) {
      for (let i = 0, il = ccb._rowCount; i < il; ++i) {
          const id = compId !== undefined ? compId : ccb.comp_id.value(i);
          set.add(ccbKey(id, ccb.atom_id_1.value(i), ccb.atom_id_2.value(i)));
      }
      return set;
  }

  /** Adds the `atom_id` of every row of `cca` to `set` and returns `set`. */
  function addChemCompAtomToSet(set: Set<string>, cca: CCA) {
      for (let i = 0, il = cca._rowCount; i < il; ++i) {
          set.add(cca.atom_id.value(i));
      }
      return set;
  }

  /**
   * Diagnostic: reports (via `console.error`) bonds that would be wrongly
   * introduced for a PVCD entry if the bonds of all entries sharing a parent
   * were merged under that parent id.
   *
   * A bond is reported for an entry when it is present in the merged bond set
   * of one of the entry's parents, absent from the entry's own bonds, and both
   * of its atoms exist in the entry.
   *
   * Bonds are keyed by the parent id rather than by the row's own `comp_id`,
   * so the same atom pair coming from different entries of one parent
   * compares equal. The atom pair behind each key is kept in a side map so
   * keys never have to be parsed back (the earlier `split('|')` could not
   * match the `comp:a-b` key format, so nothing was ever reported).
   */
  function checkAddingBondsFromPVCD(pvcd: DatabaseCollection<CCD_Schema>) {
      const ccbSetByParent = DefaultMap<string, Set<string>>(() => new Set());
      const atomsByKey = new Map<string, [string, string]>();

      // pass 1: merge the bonds of every entry under each of its parents
      // (or under its own id when it has no parent)
      for (const k in pvcd) {
          const { chem_comp, chem_comp_bond } = pvcd[k];
          if (!chem_comp_bond._rowCount) continue;

          const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
          const ownerIds = parentIds.length === 0 ? [chem_comp.id.value(0)] : parentIds;
          for (let i = 0, il = ownerIds.length; i < il; ++i) {
              const ownerId = ownerIds[i];
              addChemCompBondToSet(ccbSetByParent.getDefault(ownerId), chem_comp_bond, ownerId);
              for (let j = 0, jl = chem_comp_bond._rowCount; j < jl; ++j) {
                  const a1 = chem_comp_bond.atom_id_1.value(j);
                  const a2 = chem_comp_bond.atom_id_2.value(j);
                  atomsByKey.set(ccbKey(ownerId, a1, a2), [a1, a2]);
              }
          }
      }

      // pass 2: for every entry with parents, look for merged bonds it lacks
      // although it contains both atoms
      for (const k in pvcd) {
          const { chem_comp, chem_comp_atom, chem_comp_bond } = pvcd[k];
          if (!chem_comp_bond._rowCount) continue;

          const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
          if (parentIds.length === 0) continue;

          // does not depend on the parent, so build it once per entry
          const entryAtoms = addChemCompAtomToSet(new Set<string>(), chem_comp_atom);
          for (let i = 0, il = parentIds.length; i < il; ++i) {
              const parentId = parentIds[i];
              const entryBonds = addChemCompBondToSet(new Set<string>(), chem_comp_bond, parentId);
              // pass 1 created a set for every parent of an entry with bonds
              const extraBonds = SetUtils.difference(ccbSetByParent.get(parentId)!, entryBonds);
              extraBonds.forEach(bk => {
                  const atoms = atomsByKey.get(bk);
                  if (atoms && entryAtoms.has(atoms[0]) && entryAtoms.has(atoms[1])) {
                      console.error(`Adding all PVCD bonds would wrongly add bond ${bk} for ${k}`);
                  }
              });
          }
      }
  }
  /**
   * Builds the combined `chem_comp_bond` database from the PVCD and the CCD.
   *
   * PVCD entries are processed first: an entry without parent ids contributes
   * its bonds under its own id (variant flag 'N'), an entry with parent ids
   * contributes its bonds under each parent id (variant flag 'Y'). CCD bonds
   * are appended afterwards with flag 'N'. A bond whose `ccbKey` was already
   * seen is skipped, so earlier sources win.
   *
   * @param atomsRequested When true, the result also carries the atom
   *                       database built by `createAtoms` from the CCD.
   * @returns `{ bonds, atoms }`, with `atoms` undefined unless requested.
   */
  async function createBonds(atomsRequested: boolean) {
      await ensureDataAvailable();
      const ccd = await readCCD();
      const pvcd = await readPVCD();

      type ValueOrder = typeof mmCIF_chemCompBond_schema['value_order']['T'];
      type AromaticFlag = typeof mmCIF_chemCompBond_schema['pdbx_aromatic_flag']['T'];
      type StereoConfig = typeof mmCIF_chemCompBond_schema['pdbx_stereo_config']['T'];

      // one array per output column, filled row by row
      const compIds: string[] = [];
      const firstAtoms: string[] = [];
      const secondAtoms: string[] = [];
      const orders: ValueOrder[] = [];
      const aromaticFlags: AromaticFlag[] = [];
      const stereoConfigs: StereoConfig[] = [];
      const variantFlags: string[] = [];

      // keys of the bonds that already have a row
      const seenKeys = new Set<string>();

      const appendBonds = (compId: string, ccb: CCB, protonationVariant: boolean) => {
          const flag = protonationVariant ? 'Y' : 'N';
          for (let row = 0, rowCount = ccb._rowCount; row < rowCount; ++row) {
              const a1 = ccb.atom_id_1.value(row);
              const a2 = ccb.atom_id_2.value(row);
              const key = ccbKey(compId, a1, a2);
              if (seenKeys.has(key)) continue;

              seenKeys.add(key);
              compIds.push(compId);
              firstAtoms.push(a1);
              secondAtoms.push(a2);
              orders.push(ccb.value_order.value(row));
              aromaticFlags.push(ccb.pdbx_aromatic_flag.value(row));
              stereoConfigs.push(ccb.pdbx_stereo_config.value(row));
              variantFlags.push(flag);
          }
      };

      // diagnostics only; prints to the console
      checkAddingBondsFromPVCD(pvcd);

      // bonds from the PVCD go in first
      for (const name in pvcd) {
          const { chem_comp, chem_comp_bond } = pvcd[name];
          if (!chem_comp_bond._rowCount) continue;

          const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0);
          if (parentIds.length === 0) {
              appendBonds(chem_comp.id.value(0), chem_comp_bond, false);
              continue;
          }
          for (let p = 0, pl = parentIds.length; p < pl; ++p) {
              appendBonds(parentIds[p], chem_comp_bond, true);
          }
      }

      // then bonds from the CCD
      for (const name in ccd) {
          const { chem_comp, chem_comp_bond } = ccd[name];
          if (!chem_comp_bond._rowCount) continue;
          appendBonds(chem_comp.id.value(0), chem_comp_bond, false);
      }

      const bondTable = Table.ofArrays(mmCIF_chemCompBond_schema, {
          comp_id: compIds,
          atom_id_1: firstAtoms,
          atom_id_2: secondAtoms,
          value_order: orders,
          pdbx_aromatic_flag: aromaticFlags,
          pdbx_stereo_config: stereoConfigs,
          molstar_protonation_variant: variantFlags
      });

      const bonds = Database.ofTables(
          CCB_TABLE_NAME,
          { chem_comp_bond: mmCIF_chemCompBond_schema },
          { chem_comp_bond: bondTable }
      );
      const atoms = atomsRequested ? createAtoms(ccd) : undefined;

      return { bonds, atoms };
  }
  /**
   * Builds the `chem_comp_atom` database from the CCD: one output row per
   * atom row of every entry, carrying the entry's `chem_comp.id` together
   * with the atom's id, charge and stereo configuration.
   *
   * @param ccd Parsed CCD collection.
   * @returns A database named `CCA_TABLE_NAME` with a single
   *          `chem_comp_atom` table.
   */
  function createAtoms(ccd: DatabaseCollection<CCD_Schema>) {
      type StereoConfig = typeof CCD_Schema.chem_comp_atom['pdbx_stereo_config']['T'];

      // one array per output column
      const compIds: string[] = [];
      const atomIds: string[] = [];
      const charges: number[] = [];
      const stereoConfigs: StereoConfig[] = [];

      for (const name in ccd) {
          const { chem_comp, chem_comp_atom } = ccd[name];
          const rowCount = chem_comp_atom._rowCount;
          if (!rowCount) continue;

          const compId = chem_comp.id.value(0);
          for (let row = 0; row < rowCount; ++row) {
              compIds.push(compId);
              atomIds.push(chem_comp_atom.atom_id.value(row));
              charges.push(chem_comp_atom.charge.value(row));
              stereoConfigs.push(chem_comp_atom.pdbx_stereo_config.value(row));
          }
      }

      const atomTable = Table.ofArrays(ccd_chemCompAtom_schema, {
          comp_id: compIds,
          atom_id: atomIds,
          charge: charges,
          pdbx_stereo_config: stereoConfigs
      });

      return Database.ofTables(
          CCA_TABLE_NAME,
          { chem_comp_atom: ccd_chemCompAtom_schema },
          { chem_comp_atom: atomTable }
      );
  }
  /**
   * Generates the bond table and, when `ccaOut` is given, the atom table,
   * and writes them to disk.
   *
   * The parent directory of each output path is created when missing (one
   * level only). Both writes are awaited, so the returned promise settles
   * only after the files are written and rejects when a write fails.
   *
   * @param out Output path for the chem_comp_bond file.
   * @param binary Passed to `getEncodedCif` to select binary output.
   * @param ccaOut Optional output path for the chem_comp_atom file.
   */
  async function run(out: string, binary = false, ccaOut?: string) {
      const { bonds, atoms } = await createBonds(!!ccaOut);

      const ccbCif = getEncodedCif(CCB_TABLE_NAME, bonds, binary);
      if (!fs.existsSync(path.dirname(out))) {
          fs.mkdirSync(path.dirname(out));
      }
      // awaited so that failures surface to the caller instead of becoming
      // unhandled rejections
      await writeFile(out, ccbCif);

      // `atoms` is only defined when `ccaOut` was given; checking both also
      // narrows the type for the encoder call
      if (ccaOut && atoms) {
          const ccaCif = getEncodedCif(CCA_TABLE_NAME, atoms, binary);
          if (!fs.existsSync(path.dirname(ccaOut))) {
              fs.mkdirSync(path.dirname(ccaOut));
          }
          await writeFile(ccaOut, ccaCif);
      }
  }
  /** Database names used for the generated bond and atom tables. */
  const CCB_TABLE_NAME = 'CHEM_COMP_BONDS';
  const CCA_TABLE_NAME = 'CHEM_COMP_ATOMS';

  /** Directory holding the downloaded input files, relative to this script. */
  const DATA_DIR = path.join(__dirname, '..', '..', '..', '..', 'build/data');

  /** Local cache locations of the two input files. */
  const CCD_PATH = path.join(DATA_DIR, 'components.cif');
  const PVCD_PATH = path.join(DATA_DIR, 'aa-variants-v1.cif');

  /** Remote locations the input files are downloaded from. */
  const CCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif';
  const PVCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/aa-variants-v1.cif';
  // Command line interface: one positional output path plus optional flags.
  const parser = new argparse.ArgumentParser({
      addHelp: true,
      description: 'Create a cif file with one big table of all chem_comp_bond entries from the CCD and PVCD.'
  });

  // positional: where the bond table is written
  parser.addArgument('out', { help: 'Generated file output path.' });

  // boolean switches
  parser.addArgument(['--forceDownload', '-f'], {
      action: 'storeTrue',
      help: 'Force download of CCD and PVCD.'
  });
  parser.addArgument(['--binary', '-b'], {
      action: 'storeTrue',
      help: 'Output as BinaryCIF.'
  });

  // optional: also emit the atom table to this path
  parser.addArgument(['--ccaOut', '-a'], {
      help: 'Optional generated file output path for chem_comp_atom data.',
      required: false
  });
  /** Shape of the parsed command line arguments. */
  interface Args {
      /** Output path for the bond table. */
      out: string;
      /** Set by `--forceDownload` / `-f`. */
      forceDownload?: boolean;
      /** Set by `--binary` / `-b`. */
      binary?: boolean;
      /** Set by `--ccaOut` / `-a`. */
      ccaOut?: string;
  }
  // Parse the command line and start the generation.
  const args: Args = parser.parseArgs();
  // Read by `ensureAvailable`; must be initialized before `run` is invoked.
  const FORCE_DOWNLOAD = args.forceDownload;

  // Report failures (download, parse or write errors) and exit non-zero
  // instead of leaving the rejection unhandled.
  run(args.out, args.binary, args.ccaOut).catch(e => {
      console.error(e);
      process.exit(1);
  });