Browse Source

improve pdb parsing

- handle non-unique atom and chain names
- fixes #156
Alexander Rose 3 years ago
parent
commit
8bd4221a85
2 changed files with 77 additions and 6 deletions
  1. 1 0
      CHANGELOG.md
  2. 76 6
      src/mol-model-formats/structure/pdb/atom-site.ts

+ 1 - 0
CHANGELOG.md

@@ -9,6 +9,7 @@ Note that since we don't clearly distinguish between a public and private interf
 - Check that model and coordinates have same element count when creating a trajectory
 - Fix aromatic rings assignment: do not mix flags and planarity test
 - Improve bonds assignment of coarse grained models: check for IndexPairBonds and exhaustive StructConn
+- Improve pdb parsing: handle non-unique atom and chain names (fixes #156)
 
 ## [v3.5.0] - 2022-03-25
 

+ 76 - 6
src/mol-model-formats/structure/pdb/atom-site.ts

@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2019-2021 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ * Copyright (c) 2019-2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
  *
  * @author David Sehnal <david.sehnal@gmail.com>
  * @author Alexander Rose <alexander.rose@weirdbyte.de>
@@ -40,26 +40,96 @@ export function getAtomSiteTemplate(data: string, count: number) {
 }
 
 export function getAtomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site'] | 'partial_charge']?: CifField } {
+    const pdbx_PDB_model_num = CifField.ofStrings(sites.pdbx_PDB_model_num);
     const auth_asym_id = CifField.ofTokens(sites.auth_asym_id);
+    const auth_seq_id = CifField.ofTokens(sites.auth_seq_id);
     const auth_atom_id = CifField.ofTokens(sites.auth_atom_id);
     const auth_comp_id = CifField.ofTokens(sites.auth_comp_id);
+    const id = CifField.ofStrings(sites.id);
+
+    //
+
+    let currModelNum = pdbx_PDB_model_num.str(0);
+    let currAsymId = auth_asym_id.str(0);
+    let currSeqId = auth_seq_id.int(0);
+    let currLabelAsymId = currAsymId;
+
+    const asymIdCounts = new Map<string, number>();
+    const atomIdCounts = new Map<string, number>();
+
+    const labelAsymIds: string[] = [];
+    const labelAtomIds: string[] = [];
+
+    // ensure unique asym ids per model and unique atom ids per seq id
+    for (let i = 0, il = id.rowCount; i < il; ++i) {
+        const modelNum = pdbx_PDB_model_num.str(i);
+        const asymId = auth_asym_id.str(i);
+        const seqId = auth_seq_id.int(i);
+        let atomId = auth_atom_id.str(i);
+
+        let asymIdChanged = false;
+
+        if (modelNum !== currModelNum) {
+            asymIdCounts.clear();
+            atomIdCounts.clear();
+            currModelNum = modelNum;
+            currAsymId = asymId;
+            currSeqId = seqId;
+            asymIdChanged = true;
+            currLabelAsymId = asymId;
+        } else if (currAsymId !== asymId) {
+            atomIdCounts.clear();
+            currAsymId = asymId;
+            currSeqId = seqId;
+            asymIdChanged = true;
+            currLabelAsymId = asymId;
+        } else if (currSeqId !== seqId) {
+            atomIdCounts.clear();
+            currSeqId = seqId;
+        }
+
+        if (asymIdCounts.has(asymId)) {
+            if (asymIdChanged) {
+                const asymIdCount = asymIdCounts.get(asymId)! + 1;
+                asymIdCounts.set(asymId, asymIdCount);
+                currLabelAsymId = `${asymId}_${asymIdCount}`;
+            }
+        } else {
+            asymIdCounts.set(asymId, 0);
+        }
+        labelAsymIds[i] = currLabelAsymId;
+
+        if (atomIdCounts.has(atomId)) {
+            const atomIdCount = atomIdCounts.get(atomId)! + 1;
+            atomIdCounts.set(atomId, atomIdCount);
+            atomId = `${atomId}_${atomIdCount}`;
+        } else {
+            atomIdCounts.set(atomId, 0);
+        }
+        labelAtomIds[i] = atomId;
+    }
+
+    const labelAsymId = Column.ofStringArray(labelAsymIds);
+    const labelAtomId = Column.ofStringArray(labelAtomIds);
+
+    //
 
     return {
         auth_asym_id,
         auth_atom_id,
         auth_comp_id,
-        auth_seq_id: CifField.ofTokens(sites.auth_seq_id),
+        auth_seq_id,
         B_iso_or_equiv: CifField.ofTokens(sites.B_iso_or_equiv),
         Cartn_x: CifField.ofTokens(sites.Cartn_x),
         Cartn_y: CifField.ofTokens(sites.Cartn_y),
         Cartn_z: CifField.ofTokens(sites.Cartn_z),
         group_PDB: CifField.ofTokens(sites.group_PDB),
-        id: CifField.ofStrings(sites.id),
+        id,
 
         label_alt_id: CifField.ofTokens(sites.label_alt_id),
 
-        label_asym_id: auth_asym_id,
-        label_atom_id: auth_atom_id,
+        label_asym_id: CifField.ofColumn(labelAsymId),
+        label_atom_id: CifField.ofColumn(labelAtomId),
         label_comp_id: auth_comp_id,
         label_seq_id: CifField.ofUndefined(sites.index, Column.Schema.int),
         label_entity_id: CifField.ofStrings(sites.label_entity_id),
@@ -68,7 +138,7 @@ export function getAtomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema
         type_symbol: CifField.ofTokens(sites.type_symbol),
 
         pdbx_PDB_ins_code: CifField.ofTokens(sites.pdbx_PDB_ins_code),
-        pdbx_PDB_model_num: CifField.ofStrings(sites.pdbx_PDB_model_num),
+        pdbx_PDB_model_num,
 
         partial_charge: CifField.ofTokens(sites.partial_charge)
     };