Explorar el Código

Merge pull request #88 from McMenemy/extract_ion_names_from_ccd

Extract ion names from ccd
Alexander Rose hace 4 años
padre
commit
c119a1bc21

+ 5 - 0
README.md

@@ -99,6 +99,11 @@ and navigate to `build/viewer`
 
     node lib/commonjs/cli/lipid-params -o src/mol-model/structure/model/types/lipids.ts
 
+**Ion names**
+
+    node --max-old-space-size=4096 lib/commonjs/cli/chem-comp-dict/create-ions.js src/mol-model/structure/model/types/ions.ts
+
+
 **GraphQL schemas**
 
     node node_modules//@graphql-codegen/cli/bin -c src/extensions/rcsb/graphql/codegen.yml

+ 74 - 0
src/cli/chem-comp-dict/create-ions.ts

@@ -0,0 +1,74 @@
+#!/usr/bin/env node
+/**
+ * Copyright (c) 2018-2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Josh McMenemy <josh.mcmenemy@gmail.com>
+ */
+
+import * as argparse from 'argparse';
+import * as path from 'path';
+import util from 'util';
+import fs from 'fs';
+require('util.promisify').shim();
+const writeFile = util.promisify(fs.writeFile);
+
+import { DatabaseCollection } from '../../mol-data/db';
+import { CCD_Schema } from '../../mol-io/reader/cif/schema/ccd';
+import { ensureDataAvailable, readCCD } from './create-table';
+
+
+/**
+ * Collect the ids of all CCD components whose name (first row, upper-cased)
+ * contains ' ION', followed by a fixed list of extra ions that lack that
+ * marker in their name.
+ */
+function extractIonNames(ccd: DatabaseCollection<CCD_Schema>) {
+    const ids: string[] = [];
+    for (const key in ccd) {
+        const component = ccd[key].chem_comp;
+        const upperName = component.name.value(0).toUpperCase();
+        if (!upperName.includes(' ION')) {
+            continue;
+        }
+        ids.push(component.id.value(0));
+    }
+    // extra ions whose name does not contain ION
+    ids.push('NCO');
+    ids.push('OHX');
+    return ids;
+}
+
+/**
+ * Write a generated TypeScript module that exports `IonNames` as a `Set`
+ * of the given component ids.
+ *
+ * @param filePath destination of the generated module
+ * @param ionNames component ids to emit
+ * @returns the promise of the underlying write, so callers can await
+ *          completion and observe write errors
+ */
+function writeIonNamesFile(filePath: string, ionNames: string[]) {
+    // JSON gives `["A","B"]`; the replaces turn that into `['A', 'B']`
+    const output = `/**
+* Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+*
+* Code-generated ion names params file. Names extracted from CCD components.
+*
+* @author molstar/chem-comp-dict/create-ions cli
+*/
+
+export const IonNames = new Set(${JSON.stringify(ionNames).replace(/"/g, "'").replace(/,/g, ', ')});
+`;
+    return writeFile(filePath, output);
+}
+
+/**
+ * Ensure the dictionary data is available, extract the ion names from the
+ * CCD and write them to `out`.
+ *
+ * @param out path of the generated file; its parent directory is created
+ *            when missing (single level only, parents must already exist)
+ * @param forceDownload passed through to `ensureDataAvailable`
+ */
+async function run(out: string, forceDownload = false) {
+    await ensureDataAvailable(forceDownload);
+    const ccd = await readCCD();
+    const ionNames = extractIonNames(ccd);
+    if (!fs.existsSync(path.dirname(out))) {
+        fs.mkdirSync(path.dirname(out));
+    }
+    // wait for the write so a failure rejects the promise returned by `run`
+    await writeIonNamesFile(out, ionNames);
+}
+
+// Command line interface: one positional output path and an optional
+// flag that forces re-downloading the dictionary files.
+const parser = new argparse.ArgumentParser({
+    addHelp: true,
+    description: 'Extract and save IonNames from CCD.'
+});
+parser.addArgument('out', {
+    help: 'Generated file output path.'
+});
+parser.addArgument([ '--forceDownload', '-f' ], {
+    action: 'storeTrue',
+    help: 'Force download of CCD and PVCD.'
+});
+interface Args {
+    out: string,
+    forceDownload?: boolean,
+}
+const args: Args = parser.parseArgs();
+
+// Report failures and signal them through the exit code instead of
+// leaving the rejection unhandled.
+run(args.out, args.forceDownload).catch(e => {
+    console.error(e);
+    process.exitCode = 1;
+});

+ 19 - 17
src/cli/chem-comp-dict/create-table.ts

@@ -25,8 +25,8 @@ import { DefaultMap } from '../../mol-util/map';
 import { mmCIF_chemCompBond_schema } from '../../mol-io/reader/cif/schema/mmcif-extras';
 import { ccd_chemCompAtom_schema } from '../../mol-io/reader/cif/schema/ccd-extras';
 
-export async function ensureAvailable(path: string, url: string) {
-    if (FORCE_DOWNLOAD || !fs.existsSync(path)) {
+export async function ensureAvailable(path: string, url: string, forceDownload = false) {
+    if (forceDownload || !fs.existsSync(path)) {
         console.log(`downloading ${url}...`);
         const data = await fetch(url);
         if (!fs.existsSync(DATA_DIR)) {
@@ -41,9 +41,9 @@ export async function ensureAvailable(path: string, url: string) {
     }
 }
 
-export async function ensureDataAvailable() {
-    await ensureAvailable(CCD_PATH, CCD_URL);
-    await ensureAvailable(PVCD_PATH, PVCD_URL);
+export async function ensureDataAvailable(forceDownload = false) {
+    await ensureAvailable(CCD_PATH, CCD_URL, forceDownload);
+    await ensureAvailable(PVCD_PATH, PVCD_URL, forceDownload);
 }
 
 export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) {
@@ -136,11 +136,11 @@ function checkAddingBondsFromPVCD(pvcd: DatabaseCollection<CCD_Schema>) {
     }
 }
 
-async function createBonds(atomsRequested: boolean) {
-    await ensureDataAvailable();
-    const ccd = await readCCD();
-    const pvcd = await readPVCD();
-
+async function createBonds(
+    ccd: DatabaseCollection<CCD_Schema>,
+    pvcd: DatabaseCollection<CCD_Schema>,
+    atomsRequested: boolean
+) {
     const ccbSet = new Set<string>();
 
     const comp_id: string[] = [];
@@ -243,8 +243,12 @@ function createAtoms(ccd: DatabaseCollection<CCD_Schema>) {
     );
 }
 
-async function run(out: string, binary = false, ccaOut?: string) {
-    const { bonds, atoms } = await createBonds(!!ccaOut);
+async function run(out: string, binary = false, forceDownload = false, ccaOut?: string) {
+    await ensureDataAvailable(forceDownload);
+    const ccd = await readCCD();
+    const pvcd = await readPVCD();
+
+    const { bonds, atoms } = await createBonds(ccd, pvcd, !!ccaOut);
 
     const ccbCif = getEncodedCif(CCB_TABLE_NAME, bonds, binary);
     if (!fs.existsSync(path.dirname(out))) {
@@ -290,13 +294,11 @@ parser.addArgument(['--ccaOut', '-a'], {
     required: false
 });
 interface Args {
-    out: string
-    forceDownload?: boolean
+    out: string,
+    forceDownload?: boolean,
     binary?: boolean,
     ccaOut?: string
 }
 const args: Args = parser.parseArgs();
 
-const FORCE_DOWNLOAD = args.forceDownload;
-
-run(args.out, args.binary, args.ccaOut);
+run(args.out, args.binary, args.forceDownload, args.ccaOut);

+ 1 - 36
src/mol-model/structure/model/types.ts

@@ -11,6 +11,7 @@ import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
 import { SetUtils } from '../../../mol-util/set';
 import { EntitySubtype, ChemicalComponent } from './properties/common';
 import { LipidNames } from './types/lipids';
+import { IonNames } from './types/ions';
 import { mmCIF_chemComp_schema } from '../../../mol-io/reader/cif/schema/mmcif-extras';
 
 const _esCache = (function () {
@@ -432,42 +433,6 @@ export function isProtein(moleculeType: MoleculeType) {
     return moleculeType === MoleculeType.Protein;
 }
 
-/**
- * TODO write script that read CCD and outputs list of ion names
- *
- * all chemical components with the word "ion" in their name, Sep 2016
- *
- * SET SESSION group_concat_max_len = 1000000;
- * SELECT GROUP_CONCAT(id_ ORDER BY id_ ASC SEPARATOR '", "') from
- * (
- *     SELECT count(obj_id) as c, id_
- *     FROM pdb.chem_comp WHERE name LIKE "% ION%"
- *     GROUP BY id_
- * ) AS t1;
- */
-export const IonNames = new Set([
-    '118', '119', '1AL', '1CU', '2FK', '2HP', '2OF', '3CO',
-    '3MT', '3NI', '3OF', '3P8', '4MO', '4PU', '543', '6MO', 'ACT', 'AG', 'AL',
-    'ALF', 'AM', 'ATH', 'AU', 'AU3', 'AUC', 'AZI', 'BA', 'BCT', 'BEF', 'BF4', 'BO4',
-    'BR', 'BS3', 'BSY', 'CA', 'CAC', 'CD', 'CD1', 'CD3', 'CD5', 'CE', 'CHT', 'CL',
-    'CO', 'CO3', 'CO5', 'CON', 'CR', 'CS', 'CSB', 'CU', 'CU1', 'CU3', 'CUA', 'CUZ',
-    'CYN', 'DME', 'DMI', 'DSC', 'DTI', 'DY', 'E4N', 'EDR', 'EMC', 'ER3', 'EU',
-    'EU3', 'F', 'FE', 'FE2', 'FPO', 'GA', 'GD3', 'GEP', 'HAI', 'HG', 'HGC', 'IN',
-    'IOD', 'IR', 'IR3', 'IRI', 'IUM', 'K', 'KO4', 'LA', 'LCO', 'LCP', 'LI', 'LU',
-    'MAC', 'MG', 'MH2', 'MH3', 'MLI', 'MLT', 'MMC', 'MN', 'MN3', 'MN5', 'MN6',
-    'MO1', 'MO2', 'MO3', 'MO4', 'MO5', 'MO6', 'MOO', 'MOS', 'MOW', 'MW1', 'MW2',
-    'MW3', 'NA', 'NA2', 'NA5', 'NA6', 'NAO', 'NAW', 'NCO', 'NET', 'NH4', 'NI',
-    'NI1', 'NI2', 'NI3', 'NO2', 'NO3', 'NRU', 'O4M', 'OAA', 'OC1', 'OC2', 'OC3',
-    'OC4', 'OC5', 'OC6', 'OC7', 'OC8', 'OCL', 'OCM', 'OCN', 'OCO', 'OF1', 'OF2',
-    'OF3', 'OH', 'OS', 'OS4', 'OXL', 'PB', 'PBM', 'PD', 'PDV', 'PER', 'PI', 'PO3',
-    'PO4', 'PR', 'PT', 'PT4', 'PTN', 'RB', 'RH3', 'RHD', 'RU', 'SB', 'SCN', 'SE4',
-    'SEK', 'SM', 'SMO', 'SO3', 'SO4', 'SR', 'T1A', 'TB', 'TBA', 'TCN', 'TEA', 'TH',
-    'THE', 'TL', 'TMA', 'TRA', 'UNX', 'V', 'VN3', 'VO4', 'W', 'WO5', 'Y1', 'YB',
-    'YB2', 'YH', 'YT3', 'ZCM', 'ZN', 'ZN2', 'ZN3', 'ZNO', 'ZO3',
-    // additional ion names
-    'OHX'
-]);
-
 export type SecondaryStructureType = BitFlags<SecondaryStructureType.Flag>
 export namespace SecondaryStructureType {
     export const is: (ss: SecondaryStructureType, f: Flag) => boolean = BitFlags.has;

+ 9 - 0
src/mol-model/structure/model/types/ions.ts

@@ -0,0 +1,9 @@
+/**
+* Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+*
+* Code-generated ion names params file. Names extracted from CCD components.
+*
+* @author molstar/chem-comp-dict/create-ions cli
+*/
+
+export const IonNames = new Set(['118', '119', '543', '1AL', '1CU', '2FK', '2HP', '2OF', '3CO', '3MT', '3NI', '3OF', '3P8', '4MO', '4PU', '4TI', '6MO', 'ACT', 'AG', 'AL', 'ALF', 'AM', 'ATH', 'AU', 'AU3', 'AUC', 'AZI', 'BA', 'BCT', 'BEF', 'BF4', 'BO4', 'BR', 'BS3', 'BSY', 'CA', 'CAC', 'CD', 'CD1', 'CD3', 'CD5', 'CE', 'CF', 'CHT', 'CL', 'CO', 'CO3', 'CO5', 'CON', 'CR', 'CS', 'CSB', 'CU', 'CU1', 'CU3', 'CUA', 'CUZ', 'CYN', 'DME', 'DMI', 'DSC', 'DTI', 'DY', 'E4N', 'EDR', 'EMC', 'ER3', 'EU', 'EU3', 'F', 'FE', 'FE2', 'FPO', 'GA', 'GD3', 'GEP', 'HAI', 'HG', 'HGC', 'IN', 'IOD', 'IR', 'IR3', 'IRI', 'IUM', 'K', 'KO4', 'LA', 'LCO', 'LCP', 'LI', 'LU', 'MAC', 'MG', 'MH2', 'MH3', 'MLI', 'MMC', 'MN', 'MN3', 'MN5', 'MN6', 'MO1', 'MO2', 'MO3', 'MO4', 'MO5', 'MO6', 'MOO', 'MOS', 'MOW', 'MW1', 'MW2', 'MW3', 'NA', 'NA2', 'NA5', 'NA6', 'NAO', 'NAW', 'NET', 'NH4', 'NI', 'NI1', 'NI2', 'NI3', 'NO2', 'NO3', 'NRU', 'O4M', 'OAA', 'OC1', 'OC2', 'OC3', 'OC4', 'OC5', 'OC6', 'OC7', 'OC8', 'OCL', 'OCM', 'OCN', 'OCO', 'OF1', 'OF2', 'OF3', 'OH', 'OS', 'OS4', 'OXL', 'PB', 'PBM', 'PD', 'PDV', 'PER', 'PI', 'PO3', 'PO4', 'PR', 'PT', 'PT4', 'PTN', 'RB', 'RH3', 'RHD', 'RU', 'SB', 'SCN', 'SE4', 'SEK', 'SM', 'SMO', 'SO3', 'SO4', 'SR', 'T1A', 'TB', 'TBA', 'TCN', 'TEA', 'TH', 'THE', 'TL', 'TMA', 'TRA', 'UNX', 'V', 'VN3', 'VO4', 'W', 'WO5', 'Y1', 'YB', 'YB2', 'YH', 'YT3', 'ZCM', 'ZN', 'ZN2', 'ZN3', 'ZNO', 'ZO3', 'ZR', 'NCO', 'OHX']);