Browse Source

parse pdb conect records

- heuristic to test if list of bonds is exhaustive to skip auto-bonding
Alexander Rose 4 years ago
parent
commit
98b118fd1e

+ 89 - 0
src/mol-model-formats/structure/pdb/conect.ts

@@ -0,0 +1,89 @@
+/**
+ * Copyright (c) 2021 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { CifCategory, CifField } from '../../../mol-io/reader/cif';
+import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
+import { Tokens } from '../../../mol-io/reader/common/text/tokenizer';
+
+export function parseConect(lines: Tokens, lineStart: number, lineEnd: number, sites: { [K in keyof mmCIF_Schema['atom_site']]?: CifField }): CifCategory {
+    const idMap: { [k: string]: number } = {};
+    for (let i = 0, il = sites.id!.rowCount; i < il; ++i) {
+        idMap[sites.id!.str(i)] = i;
+    }
+
+    const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1]);
+
+    const id: string[] = [];
+    const conn_type_id: string[] = [];
+
+    const ptnr1_label_asym_id: string[] = [];
+    const ptnr1_label_seq_id: number[] = [];
+    const ptnr1_auth_seq_id: number[] = [];
+    const ptnr1_label_atom_id: string[] = [];
+
+    const ptnr2_label_asym_id: string[] = [];
+    const ptnr2_label_seq_id: number[] = [];
+    const ptnr2_auth_seq_id: number[] = [];
+    const ptnr2_label_atom_id: string[] = [];
+
+    const pos = [11, 16, 21, 26];
+
+    let k = 1;
+
+    for (let i = lineStart; i < lineEnd; i++) {
+        const line = getLine(i);
+        const idxA = idMap[parseInt(line.substr(6, 5))];
+
+        const bondIndex: {[k: number]: number} = {};
+
+        if (idxA === undefined) continue;
+
+        for (let j = 0; j < 4; ++j) {
+            const idB = parseInt(line.substr(pos[j], 5));
+            if (Number.isNaN(idB)) continue;
+
+            const idxB = idMap[idB];
+            if (idxB === undefined) continue;
+            if (idxA > idxB) continue;
+
+            // TODO: interpret records where a 'idxB' atom is given multiple times
+            // as double/triple bonds, e.g. CONECT 1529 1528 1528 is a double bond
+            if (bondIndex[idxB] !== undefined) continue;
+
+            id.push(`covale${k}`);
+            conn_type_id.push('covale');
+
+            ptnr1_label_asym_id.push(sites.label_asym_id!.str(idxA));
+            ptnr1_auth_seq_id.push(sites.auth_seq_id!.int(idxA));
+            ptnr1_label_seq_id.push(sites.label_seq_id!.int(idxA));
+            ptnr1_label_atom_id.push(sites.label_atom_id!.str(idxA));
+
+            ptnr2_label_asym_id.push(sites.label_asym_id!.str(idxB));
+            ptnr2_auth_seq_id.push(sites.auth_seq_id!.int(idxB));
+            ptnr2_label_seq_id.push(sites.label_seq_id!.int(idxB));
+            ptnr2_label_atom_id.push(sites.label_atom_id!.str(idxB));
+
+            k += 1;
+        }
+    }
+
+    const struct_conn: Partial<CifCategory.Fields<mmCIF_Schema['struct_conn']>> = {
+        id: CifField.ofStrings(id),
+        conn_type_id: CifField.ofStrings(conn_type_id),
+
+        ptnr1_label_asym_id: CifField.ofStrings(ptnr1_label_asym_id),
+        ptnr1_auth_seq_id: CifField.ofNumbers(ptnr1_auth_seq_id),
+        ptnr1_label_seq_id: CifField.ofNumbers(ptnr1_label_seq_id),
+        ptnr1_label_atom_id: CifField.ofStrings(ptnr1_label_atom_id),
+
+        ptnr2_label_asym_id: CifField.ofStrings(ptnr2_label_asym_id),
+        ptnr2_label_seq_id: CifField.ofNumbers(ptnr2_label_seq_id),
+        ptnr2_auth_seq_id: CifField.ofNumbers(ptnr2_auth_seq_id),
+        ptnr2_label_atom_id: CifField.ofStrings(ptnr2_label_atom_id),
+    };
+
+    return CifCategory.ofFields('struct_conn', struct_conn);
+}

+ 23 - 3
src/mol-model-formats/structure/pdb/to-cif.ts

@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2019-2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ * Copyright (c) 2019-2021 mol* contributors, licensed under MIT, See LICENSE file for more info.
  *
  * @author David Sehnal <david.sehnal@gmail.com>
  * @author Alexander Rose <alexander.rose@weirdbyte.de>
@@ -18,6 +18,8 @@ import { Column } from '../../../mol-data/db';
 import { getMoleculeType } from '../../../mol-model/structure/model/types';
 import { getAtomSiteTemplate, addAtom, getAtomSite } from './atom-site';
 import { addAnisotropic, getAnisotropicTemplate, getAnisotropic } from './anisotropic';
+import { parseConect } from './conect';
+import { isDebugMode } from '../../../mol-util/debug';
 
 export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
     const { lines } = pdb;
@@ -48,6 +50,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
     const heteroNames: [string, string][] = [];
 
     let modelNum = 0, modelStr = '';
+    let conectRange: [number, number] | undefined = undefined;
 
     for (let i = 0, _i = lines.count; i < _i; i++) {
         let s = indices[2 * i], e = indices[2 * i + 1];
@@ -63,8 +66,21 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
             case 'C':
                 if (substringStartsWith(data, s, e, 'CRYST1')) {
                     helperCategories.push(...parseCryst1(pdb.id || '?', data.substring(s, e)));
-                } else if (substringStartsWith(data, s, e, 'CONNECT')) {
-                    // TODO: CONNECT records => struct_conn
+                } else if (substringStartsWith(data, s, e, 'CONECT')) {
+                    let j = i + 1;
+                    while (true) {
+                        s = indices[2 * j]; e = indices[2 * j + 1];
+                        if (!substringStartsWith(data, s, e, 'CONECT')) break;
+                        j++;
+                    }
+                    if (conectRange) {
+                        if (isDebugMode) {
+                            console.log('only single CONECT block allowed, ignoring others');
+                        }
+                    } else {
+                        conectRange = [i, j];
+                    }
+                    i = j - 1;
                 } else if (substringStartsWith(data, s, e, 'COMPND')) {
                     let j = i + 1;
                     while (true) {
@@ -165,6 +181,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
     const atom_site = getAtomSite(atomSite);
     if (!isPdbqt) delete atom_site.partial_charge;
 
+    if (conectRange) {
+        helperCategories.push(parseConect(lines, conectRange[0], conectRange[1], atom_site));
+    }
+
     const categories = {
         entity: CifCategory.ofTable('entity', entityBuilder.getEntityTable()),
         chem_comp: CifCategory.ofTable('chem_comp', componentBuilder.getChemCompTable()),

+ 10 - 1
src/mol-model-formats/structure/property/bonds/struct_conn.ts

@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2017-2020 Mol* contributors, licensed under MIT, See LICENSE file for more info.
+ * Copyright (c) 2017-2021 Mol* contributors, licensed under MIT, See LICENSE file for more info.
  *
  * @author David Sehnal <david.sehnal@gmail.com>
  * @author Alexander Rose <alexander.rose@weirdbyte.de>
@@ -52,6 +52,15 @@ export namespace StructConn {
 
     export const Provider = FormatPropertyProvider.create<StructConn>(Descriptor);
 
+    /**
+     * Heuristic to test if StructConn likely provides all atomic bonds by
+     * checking if the fraction of bonds and atoms is high (> 0.95).
+     */
+    export function isExhaustive(model: Model) {
+        const structConn = StructConn.Provider.get(model);
+        return structConn && (structConn.data.id.rowCount / model.atomicConformation.atomId.rowCount) > 0.95;
+    }
+
     function hasAtom({ units }: Structure, element: ElementIndex) {
         for (let i = 0, _i = units.length; i < _i; i++) {
             if (SortedArray.indexOf(units[i].elements, element) >= 0) return true;

+ 3 - 0
src/mol-model/structure/structure/unit/bonds/inter-compute.ts

@@ -50,6 +50,8 @@ function findPairBonds(unitA: Unit.Atomic, unitB: Unit.Atomic, props: BondComput
     const structConn = unitA.model === unitB.model && StructConn.Provider.get(unitA.model);
     const indexPairs = unitA.model === unitB.model && IndexPairBonds.Provider.get(unitA.model);
 
+    const structConnExhaustive = unitA.model === unitB.model && StructConn.isExhaustive(unitA.model);
+
     // the lookup queries need to happen in the "unitB space".
     // that means _imageA = inverseOperB(operA(aI))
     const imageTransform = Mat4.mul(_imageTransform, unitB.conformation.operator.inverse, unitA.conformation.operator.matrix);
@@ -103,6 +105,7 @@ function findPairBonds(unitA: Unit.Atomic, unitB: Unit.Atomic, props: BondComput
             // all are given and thus we don't need to compute any other
             if (added) continue;
         }
+        if (structConnExhaustive) continue;
 
         const occA = occupancyA.value(aI);
 

+ 3 - 0
src/mol-model/structure/structure/unit/bonds/intra-compute.ts

@@ -95,6 +95,8 @@ function findBonds(unit: Unit.Atomic, props: BondComputationProps): IntraUnitBon
     const structConn = StructConn.Provider.get(unit.model);
     const component = ComponentBond.Provider.get(unit.model);
 
+    const structConnExhaustive = StructConn.isExhaustive(unit.model);
+
     const atomA: StructureElement.UnitIndex[] = [];
     const atomB: StructureElement.UnitIndex[] = [];
     const flags: number[] = [];
@@ -135,6 +137,7 @@ function findBonds(unit: Unit.Atomic, props: BondComputationProps): IntraUnitBon
                 structConnAdded.add(_bI);
             }
         }
+        if (structConnExhaustive) continue;
 
         const raI = residueIndex[aI];
         const seqIdA = label_seq_id.value(raI);