浏览代码

improve saccharide detection

Alexander Rose 3 年之前
父节点
当前提交
8723ca38b4

+ 1 - 0
CHANGELOG.md

@@ -7,6 +7,7 @@ Note that since we don't clearly distinguish between a public and private interf
 ## [Unreleased]
 
 - Fix handling of mmcif with empty ``label_*`` fields
+- Improve saccharide detection (compare against list from CCD)
 
 ## [v3.3.1] - 2022-02-27
 

+ 3 - 0
README.md

@@ -120,6 +120,9 @@ and navigate to `build/viewer`
 
     node --max-old-space-size=4096 lib/commonjs/cli/chem-comp-dict/create-ions.js src/mol-model/structure/model/types/ions.ts
 
+**Saccharide names**
+
+    node --max-old-space-size=4096 lib/commonjs/cli/chem-comp-dict/create-saccharides.js src/mol-model/structure/model/types/saccharides.ts
 
 **GraphQL schemas**
 

+ 77 - 0
src/cli/chem-comp-dict/create-saccharides.ts

@@ -0,0 +1,77 @@
+#!/usr/bin/env node
+/**
+ * Copyright (c) 2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import * as argparse from 'argparse';
+import * as path from 'path';
+import util from 'util';
+import fs from 'fs';
+require('util.promisify').shim();
+const writeFile = util.promisify(fs.writeFile);
+
+import { DatabaseCollection } from '../../mol-data/db';
+import { CCD_Schema } from '../../mol-io/reader/cif/schema/ccd';
+import { ensureDataAvailable, readCCD } from './util';
+
+/**
+ * Collect the ids of all components in `ccd` whose `chem_comp.type`
+ * contains `SACCHARIDE` (compared in upper case), followed by a short
+ * hand-maintained list of ids whose type lacks that marker.
+ *
+ * @param ccd the component collection obtained from `readCCD`
+ * @returns matching component ids in iteration order of `ccd`, extras last
+ */
+function extractSaccharideNames(ccd: DatabaseCollection<CCD_Schema>) {
+    // these are extra saccharides that don't have SACCHARIDE in their type
+    const extraNames = [
+        'UMQ', // UNDECYL-MALTOSIDE, via GlyFinder
+        'SQD', // SULFOQUINOVOSYLDIACYLGLYCEROL, via GlyFinder
+    ];
+
+    const names: string[] = [];
+    for (const key in ccd) {
+        const chemComp = ccd[key].chem_comp;
+        const isSaccharide = chemComp.type.value(0).toUpperCase().includes('SACCHARIDE');
+        if (!isSaccharide) continue;
+        names.push(chemComp.id.value(0));
+    }
+    names.push(...extraNames);
+    return names;
+}
+
+/**
+ * Render the generated TypeScript module exporting `SaccharideNames` and
+ * write it to `filePath`.
+ *
+ * @param filePath destination of the generated module
+ * @param ionNames component ids to place into the exported set; despite the
+ *                 parameter name, `run` passes the saccharide ids here
+ * @returns the promise of the pending write, so callers can await completion
+ *          and observe write errors
+ */
+function writeSaccharideNamesFile(filePath: string, ionNames: string[]) {
+    // JSON.stringify yields a double-quoted array literal; swap to single
+    // quotes and ', ' separators for the emitted source text
+    const namesLiteral = JSON.stringify(ionNames).replace(/"/g, "'").replace(/,/g, ', ');
+    const output = `/**
+ * Copyright (c) 2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * Code-generated saccharide names params file. Names extracted from CCD components.
+ *
+ * @author molstar/cli/chem-comp-dict/create-saccharides
+ */
+
+export const SaccharideNames = new Set(${namesLiteral});
+`;
+    // hand the promise back instead of dropping it, otherwise a failed write
+    // can never be awaited or caught by the caller
+    return writeFile(filePath, output);
+}
+
+/**
+ * Generate the saccharide names module at `out`.
+ *
+ * Calls `ensureDataAvailable(forceDownload)`, reads the CCD via `readCCD`,
+ * extracts the saccharide ids, creates the directory containing `out` when
+ * it does not exist yet, and writes the generated file.
+ *
+ * @param out path of the file to generate
+ * @param forceDownload forwarded unchanged to `ensureDataAvailable`
+ * @returns a promise that settles once the file has been written
+ */
+async function run(out: string, forceDownload = false) {
+    await ensureDataAvailable(forceDownload);
+    const ccd = await readCCD();
+    const saccharideNames = extractSaccharideNames(ccd);
+    const outDir = path.dirname(out);
+    if (!fs.existsSync(outDir)) {
+        fs.mkdirSync(outDir);
+    }
+    // wait for the write so this promise reflects its success or failure
+    await writeSaccharideNamesFile(out, saccharideNames);
+}
+
+// Command line interface: one positional output path plus an optional flag
+// that is forwarded to `run` as `forceDownload`.
+const parser = new argparse.ArgumentParser({
+    add_help: true,
+    description: 'Extract and save SaccharideNames from CCD.'
+});
+parser.add_argument('out', {
+    help: 'Generated file output path.'
+});
+parser.add_argument('--forceDownload', '-f', {
+    action: 'store_true',
+    help: 'Force download of CCD and PVCD.'
+});
+/** Shape of the parsed command line arguments. */
+interface Args {
+    out: string,
+    forceDownload?: boolean,
+}
+const args: Args = parser.parse_args();
+
+// Report failures explicitly and exit non-zero instead of leaving the
+// rejection of `run` unhandled.
+run(args.out, args.forceDownload).catch(e => {
+    console.error(e);
+    process.exitCode = 1;
+});

+ 4 - 1
src/mol-model-formats/structure/common/component.ts

@@ -9,6 +9,7 @@ import { WaterNames, PolymerNames } from '../../../mol-model/structure/model/typ
 import { SetUtils } from '../../../mol-util/set';
 import { BasicSchema } from '../basic/schema';
 import { mmCIF_chemComp_schema } from '../../../mol-io/reader/cif/schema/mmcif-extras';
+import { SaccharideCompIdMap } from '../../../mol-model/structure/structure/carbohydrates/constants';
 
 type Component = Table.Row<Pick<mmCIF_chemComp_schema, 'id' | 'name' | 'type'>>
 
@@ -30,7 +31,7 @@ const DnaAtomIdsList = [
 
 /** Used to reduce false positives for atom name-based type guessing */
 const NonPolymerNames = new Set([
-    'FMN', 'NCN', 'FNS', 'FMA', 'ATP', 'ADP', 'AMP', 'GTP', 'GDP', 'GMP' // Mononucleotides
+    'FMN', 'NCN', 'FNS', 'FMA', 'ATP', 'ADP', 'AMP', 'GTP', 'GDP', 'GMP', // Mononucleotides
 ]);
 
 const StandardComponents = (function () {
@@ -158,6 +159,8 @@ export class ComponentBuilder {
                 this.set({ id: compId, name: 'WATER', type: 'non-polymer' });
             } else if (NonPolymerNames.has(compId.toUpperCase())) {
                 this.set({ id: compId, name: this.namesMap.get(compId) || compId, type: 'non-polymer' });
+            } else if (SaccharideCompIdMap.has(compId.toUpperCase())) {
+                this.set({ id: compId, name: this.namesMap.get(compId) || compId, type: 'saccharide' });
             } else {
                 const atomIds = this.getAtomIds(index);
                 if (atomIds.size === 1 && CharmmIonComponents.has(compId)) {

文件差异内容过多而无法显示
+ 8 - 0
src/mol-model/structure/model/types/saccharides.ts


+ 5 - 11
src/mol-model/structure/structure/carbohydrates/constants.ts

@@ -1,11 +1,12 @@
 /**
- * Copyright (c) 2018-2021 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ * Copyright (c) 2018-2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
  *
  * @author Alexander Rose <alexander.rose@weirdbyte.de>
  * @author David Sehnal <david.sehnal@gmail.com>
  */
 
 import { Color, ColorMap } from '../../../../mol-util/color';
+import { SaccharideNames } from '../../model/types/saccharides';
 
 // follows community standard from https://www.ncbi.nlm.nih.gov/glycans/snfg.html
 
@@ -302,13 +303,6 @@ const CommonSaccharideNames: { [k: string]: string[] } = {
     Psi: ['PSV', 'SF6', 'SF9', 'TTV'],
 };
 
-const UnknownSaccharideNames = [
-    'NGZ', // via CCD
-    'LAT', // BETA-LACTOSE, Gal-Glc di-saccharide via GlyFinder
-
-    'PUF', 'GDA', '9WJ', // via updated CCD
-];
-
 /**
  * From http://glycam.org/docs/othertoolsservice/2016/06/09/3d-snfg-list-of-residue-names/#CHARMM
  */
@@ -354,9 +348,9 @@ export const SaccharideCompIdMap = (function () {
             }
         }
     }
-    for (let i = 0, il = UnknownSaccharideNames.length; i < il; ++i) {
-        map.set(UnknownSaccharideNames[i], UnknownSaccharideComponent);
-    }
+    SaccharideNames.forEach(name => {
+        if (!map.has(name)) map.set(name, UnknownSaccharideComponent);
+    });
     return map;
 })();
 

部分文件因为文件数量过多而无法显示