Browse Source

improved pdb reader, entity

Alexander Rose 5 years ago
parent
commit
e57a19857f

+ 7 - 2
src/mol-model-formats/structure/common/component.ts

@@ -73,6 +73,7 @@ const StandardComponents = (function() {
 })()
 
 export class ComponentBuilder {
+    private namesMap = new Map<string, string>()
     private comps = new Map<string, Component>()
     private ids: string[] = []
     private names: string[] = []
@@ -129,8 +130,8 @@ export class ComponentBuilder {
             } else if (WaterNames.has(compId)) {
                 this.set({ id: compId, name: 'WATER', type: 'non-polymer' })
             } else {
-                const atomIds = this.getAtomIds(index)
-                this.set({ id: compId, name: compId, type: this.getType(atomIds) })
+                const type = this.getType(this.getAtomIds(index))
+                this.set({ id: compId, name: this.namesMap.get(compId) || compId, type })
             }
         }
         return this.get(compId)!
@@ -145,6 +146,10 @@ export class ComponentBuilder {
         return CifCategory.ofFields('chem_comp', chemComp)
     }
 
+    /**
+     * Register component names, given as `[compId, name]` pairs. A later
+     * pair with the same comp id replaces the earlier name. The stored name
+     * is looked up when a component falls through to the generic (non-water)
+     * branch; without an entry the comp id itself serves as the name.
+     */
+    setNames(names: [string, string][]) {
+        names.forEach(([compId, name]) => {
+            this.namesMap.set(compId, name)
+        })
+    }
+
     constructor(private seqId: Column<number>, private atomId: Column<string>) {
 
     }

+ 80 - 0
src/mol-model-formats/structure/common/entity.ts

@@ -0,0 +1,80 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { CifCategory, CifField } from '../../../mol-io/reader/cif';
+import { MoleculeType, isPolymer } from '../../../mol-model/structure/model/types';
+import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
+
+/**
+ * Description of a polymer entity together with the ids of the chains that
+ * belong to it, as consumed by `EntityBuilder.setCompounds`.
+ */
+export type EntityCompound = {
+    chains: string[],
+    description: string
+}
+
+/**
+ * Incrementally assembles the rows of an mmCIF `entity` category.
+ *
+ * Entity ids are consecutive integers starting at 1, rendered as strings in
+ * the order the entities are created. All water shares one entity, polymers
+ * get one entity per compound (see `setCompounds`) or otherwise one per
+ * chain id, and every other component gets one entity per comp id.
+ */
+export class EntityBuilder {
+    private count = 0
+    private ids: string[] = []
+    private types: string[] = []
+    private descriptions: string[] = []
+
+    /** chain id => entity id, for chains listed in a compound */
+    private compoundsMap = new Map<string, string>()
+    /** comp id => name used as description of non-polymer entities */
+    private namesMap = new Map<string, string>()
+    /** comp id => entity id of non-polymer entities */
+    private heteroMap = new Map<string, string>()
+    /** chain id => entity id, for polymer chains not covered by a compound */
+    private chainMap = new Map<string, string>()
+    /** id of the shared water entity, unset until the first water is seen */
+    private waterId?: string
+
+    /** Append a new entity row and return its id. */
+    private addEntity(type: string, description: string): string {
+        const id = `${++this.count}`
+        this.ids.push(id)
+        this.types.push(type)
+        this.descriptions.push(description)
+        return id
+    }
+
+    /**
+     * Get the id of the entity a residue belongs to, creating the entity on
+     * first use.
+     *
+     * @param compId component id, selects the entity of non-polymers
+     * @param moleculeType decides between water, polymer and non-polymer
+     * @param chainId chain id, selects the entity of polymers
+     */
+    getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string {
+        if (moleculeType === MoleculeType.water) {
+            if (this.waterId === undefined) {
+                this.waterId = this.addEntity('water', 'Water')
+            }
+            return this.waterId
+        }
+
+        if (!isPolymer(moleculeType)) {
+            let heteroId = this.heteroMap.get(compId)
+            if (heteroId === undefined) {
+                // prefer a registered name, fall back to the comp id
+                heteroId = this.addEntity('non-polymer', this.namesMap.get(compId) || compId)
+                this.heteroMap.set(compId, heteroId)
+            }
+            return heteroId
+        }
+
+        // chains assigned via `setCompounds` win over per-chain entities
+        const compoundId = this.compoundsMap.get(chainId)
+        if (compoundId !== undefined) return compoundId
+
+        let polymerId = this.chainMap.get(chainId)
+        if (polymerId === undefined) {
+            polymerId = this.addEntity('polymer', `Polymer ${this.chainMap.size + 1}`)
+            this.chainMap.set(chainId, polymerId)
+        }
+        return polymerId
+    }
+
+    /** Create the `entity` category from all entities added so far. */
+    getEntityCategory() {
+        const fields: CifCategory.SomeFields<mmCIF_Schema['entity']> = {
+            id: CifField.ofStrings(this.ids),
+            type: CifField.ofStrings(this.types),
+            pdbx_description: CifField.ofStrings(this.descriptions),
+        }
+        return CifCategory.ofFields('entity', fields)
+    }
+
+    /**
+     * Add one polymer entity per compound and map each of the compound's
+     * chains to it. A chain listed in several compounds ends up mapped to
+     * the last one.
+     */
+    setCompounds(compounds: EntityCompound[]) {
+        for (const { chains, description } of compounds) {
+            const entityId = this.addEntity('polymer', description)
+            for (const chainId of chains) {
+                this.compoundsMap.set(chainId, entityId)
+            }
+        }
+    }
+
+    /**
+     * Register names, given as `[compId, name]` pairs, that are used as the
+     * description of non-polymer entities created afterwards.
+     */
+    setNames(names: [string, string][]) {
+        names.forEach(([compId, name]) => {
+            this.namesMap.set(compId, name)
+        })
+    }
+}

+ 2 - 50
src/mol-model-formats/structure/gro.ts

@@ -13,61 +13,13 @@ import { CifCategory, CifField } from '../../mol-io/reader/cif';
 import { Column } from '../../mol-data/db';
 import { mmCIF_Schema } from '../../mol-io/reader/cif/schema/mmcif';
 import { guessElementSymbolString } from './util';
-import { MoleculeType, getMoleculeType, isPolymer } from '../../mol-model/structure/model/types';
+import { MoleculeType, getMoleculeType } from '../../mol-model/structure/model/types';
 import { ComponentBuilder } from './common/component';
 import { getChainId } from './common/util';
+import { EntityBuilder } from './common/entity';
 
 // TODO multi model files
 
-class EntityBuilder {
-    private count = 0
-    private ids: string[] = []
-    private types: string[] = []
-    private descriptions: string[] = []
-
-    private heteroMap = new Map<string, string>()
-    private chainMap = new Map<string, string>()
-    private waterId?: string
-
-    private set(type: string, description: string) {
-        this.count += 1
-        this.ids.push(`${this.count}`)
-        this.types.push(type)
-        this.descriptions.push(description)
-    }
-
-    getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string {
-        if (moleculeType === MoleculeType.water) {
-            if (this.waterId === undefined) {
-                this.set('water', 'Water')
-                this.waterId = `${this.count}`
-            }
-            return this.waterId;
-        } else if (isPolymer(moleculeType)) {
-            if (!this.chainMap.has(chainId)) {
-                this.set('polymer', `Polymer ${this.chainMap.size + 1}`)
-                this.chainMap.set(chainId, `${this.count}`)
-            }
-            return this.chainMap.get(chainId)!
-        } else {
-            if (!this.heteroMap.has(compId)) {
-                this.set('non-polymer', compId)
-                this.heteroMap.set(compId, `${this.count}`)
-            }
-            return this.heteroMap.get(compId)!
-        }
-    }
-
-    getEntityCategory() {
-        const entity: CifCategory.SomeFields<mmCIF_Schema['entity']> = {
-            id: CifField.ofStrings(this.ids),
-            type: CifField.ofStrings(this.types),
-            pdbx_description: CifField.ofStrings(this.descriptions),
-        }
-        return CifCategory.ofFields('entity', entity)
-    }
-}
-
 function getCategories(atoms: GroAtoms) {
     const auth_atom_id = CifField.ofColumn(atoms.atomName)
     const auth_comp_id = CifField.ofColumn(atoms.residueName)

+ 26 - 66
src/mol-model-formats/structure/pdb/entity.ts

@@ -5,8 +5,7 @@
  */
 
 import { Tokens } from '../../../mol-io/reader/common/text/tokenizer';
-import { CifCategory, CifField } from '../../../mol-io/reader/cif';
-import { WaterNames } from '../../../mol-model/structure/model/types';
+import { EntityCompound } from '../common/entity';
 
 const Spec = {
     'MOL_ID': '',
@@ -21,14 +20,12 @@ const Spec = {
 }
 type Spec = keyof typeof Spec
 
-type Compound = { chains: string[], name: string }
-
 export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
     const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])
 
     let currentSpec: Spec | undefined
-    let currentCompound: Compound = { chains: [], name: '' }
-    const Compounds: Compound[] = []
+    let currentCompound: EntityCompound = { chains: [], description: '' }
+    const Compounds: EntityCompound[] = []
 
     for (let i = lineStart; i < lineEnd; i++) {
         let line = getLine(i)
@@ -56,12 +53,12 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
         if (currentSpec === 'MOL_ID') {
             currentCompound = {
                 chains: [],
-                name: ''
+                description: ''
             }
             Compounds.push(currentCompound)
         } else if (currentSpec === 'MOLECULE') {
-            if (currentCompound.name) currentCompound.name += ' '
-            currentCompound.name += value
+            if (currentCompound.description) currentCompound.description += ' '
+            currentCompound.description += value
         } else if (currentSpec === 'CHAIN') {
             Array.prototype.push.apply(currentCompound.chains, value.split(/\s*,\s*/))
         }
@@ -70,66 +67,29 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
     return Compounds
 }
 
-export class EntityBuilder {
-    private count = 0
-    private ids: string[] = []
-    private types: string[] = []
-    private descriptions: string[] = []
-
-    private compoundsMap = new Map<string, string>()
-    private heteroMap = new Map<string, string>()
-    private chainMap = new Map<string, string>()
-    private waterId?: string
-
-    private set(type: string, description: string) {
-        this.count += 1
-        this.ids.push(`${this.count}`)
-        this.types.push(type)
-        this.descriptions.push(description)
-    }
+/**
+ * Parse a run of HETNAM records into a map from het identifier to chemical
+ * name.
+ *
+ * @param lines tokenized lines of the PDB file
+ * @param lineStart index of the first HETNAM line (inclusive)
+ * @param lineEnd index after the last HETNAM line (exclusive)
+ * @returns map of het id => name; the text of several records with the same
+ * het id (continuation records) is joined with a single space
+ */
+export function parseHetnam(lines: Tokens, lineStart: number, lineEnd: number) {
+    const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])
 
-    getEntityId(residueName: string, chainId: string, isHet: boolean): string {
-        if (isHet) {
-            if (WaterNames.has(residueName)) {
-                if (this.waterId === undefined) {
-                    this.set('water', 'Water')
-                    this.waterId = `${this.count}`
-                }
-                return this.waterId;
-            } else {
-                if (!this.heteroMap.has(residueName)) {
-                    this.set('non-polymer', residueName)
-                    this.heteroMap.set(residueName, `${this.count}`)
-                }
-                return this.heteroMap.get(residueName)!
-            }
-        } else if (this.compoundsMap.has(chainId)) {
-            return this.compoundsMap.get(chainId)!
-        } else {
-            if (!this.chainMap.has(chainId)) {
-                this.set('polymer', chainId)
-                this.chainMap.set(chainId, `${this.count}`)
-            }
-            return this.chainMap.get(chainId)!
-        }
-    }
+    const hetnams = new Map<string, string>()
 
-    getEntityCategory() {
-        const entity = {
-            id: CifField.ofStrings(this.ids),
-            type: CifField.ofStrings(this.types),
-            pdbx_description: CifField.ofStrings(this.descriptions)
+    for (let i = lineStart; i < lineEnd; i++) {
+        const line = getLine(i)
+        // COLUMNS       DATA  TYPE    FIELD           DEFINITION
+        // ----------------------------------------------------------------------------
+        //  1 -  6       Record name   "HETNAM"
+        //  9 - 10       Continuation  continuation    Allows concatenation of multiple records.
+        // 12 - 14       LString(3)    hetID           Het identifier, right-justified.
+        // 16 - 70       String        text            Chemical name.
+
+        const het = line.substring(11, 14).trim()
+        // the text field ends at column 70, anything after it is not part
+        // of the name and must not leak into it
+        const name = line.substring(15, 70).trim()
+
+        const previous = hetnams.get(het)
+        if (previous !== undefined) {
+            // continuation record, append to the text collected so far
+            hetnams.set(het, `${previous} ${name}`)
+        } else {
+            hetnams.set(het, name)
         }
-        return CifCategory.ofFields('entity', entity)
     }
 
-    setCompounds(compounds: Compound[]) {
-        for (let i = 0, il = compounds.length; i < il; ++i) {
-            const { chains, name } = compounds[i]
-            this.set('polymer', name)
-            for (let j = 0, jl = chains.length; j < jl; ++j) {
-                this.compoundsMap.set(chains[j], `${this.count}`)
-            }
-        }
-    }
+    return hetnams
 }

+ 39 - 14
src/mol-model-formats/structure/pdb/to-cif.ts

@@ -13,10 +13,14 @@ import { PdbFile } from '../../../mol-io/reader/pdb/schema';
 import { parseCryst1, parseRemark350, parseMtrix } from './assembly';
 import { parseHelix, parseSheet } from './secondary-structure';
 import { guessElementSymbolTokens } from '../util';
-import { parseCmpnd, EntityBuilder } from './entity';
-
-type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never
-function atom_site_template(data: string, count: number) {
+import { parseCmpnd, parseHetnam } from './entity';
+import { ComponentBuilder } from '../common/component';
+import { EntityBuilder } from '../common/entity';
+import { Column } from '../../../mol-data/db';
+import { getMoleculeType } from '../../../mol-model/structure/model/types';
+
+type AtomSiteTemplate = typeof getAtomSiteTemplate extends (...args: any) => infer T ? T : never
+function getAtomSiteTemplate(data: string, count: number) {
     const str = () => [] as string[];
     const ts = () => TokenBuilder.create(data, 2 * count);
     return {
@@ -41,7 +45,7 @@ function atom_site_template(data: string, count: number) {
     };
 }
 
-function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
+function getAomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
     const auth_asym_id = CifField.ofTokens(sites.auth_asym_id);
     const auth_atom_id = CifField.ofTokens(sites.auth_atom_id);
     const auth_comp_id = CifField.ofTokens(sites.auth_comp_id);
@@ -75,7 +79,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s
     };
 }
 
-function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
+function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number) {
     const { data: str } = data;
     const length = e - s;
 
@@ -103,11 +107,9 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s
 
     // 18 - 20        Residue name    Residue name.
     TokenBuilder.addToken(sites.auth_comp_id, Tokenizer.trim(data, s + 17, s + 20));
-    const residueName = str.substring(data.tokenStart, data.tokenEnd);
 
     // 22             Character       Chain identifier.
     TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22);
-    const chainId = str.substring(s + 21, s + 22);
 
     // 23 - 26        Integer         Residue sequence number.
     // TODO: support HEX
@@ -155,7 +157,6 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s
         guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16)
     }
 
-    sites.label_entity_id[sites.index] = entityBuilder.getEntityId(residueName, chainId, isHet);
     sites.pdbx_PDB_model_num[sites.index] = model;
 
     sites.index++;
@@ -180,9 +181,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
         }
     }
 
-    const atom_site = atom_site_template(data, atomCount);
+    const atomSite = getAtomSiteTemplate(data, atomCount);
     const entityBuilder = new EntityBuilder();
     const helperCategories: CifCategory[] = [];
+    const heteroNames: [string, string][] = [];
 
     let modelNum = 0, modelStr = '';
 
@@ -192,7 +194,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
             case 'A':
                 if (!substringStartsWith(data, s, e, 'ATOM  ')) continue;
                 if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
-                addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, false);
+                addAtom(atomSite, modelStr, tokenizer, s, e);
                 break;
             case 'C':
                 if (substringStartsWith(data, s, e, 'CRYST1')) {
@@ -213,7 +215,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
             case 'H':
                 if (substringStartsWith(data, s, e, 'HETATM')) {
                     if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
-                    addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, true);
+                    addAtom(atomSite, modelStr, tokenizer, s, e);
                 } else if (substringStartsWith(data, s, e, 'HELIX')) {
                     let j = i + 1;
                     while (true) {
@@ -223,8 +225,16 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
                     }
                     helperCategories.push(parseHelix(lines, i, j));
                     i = j - 1;
+                } else if (substringStartsWith(data, s, e, 'HETNAM')) {
+                    let j = i + 1;
+                    while (true) {
+                        s = indices[2 * j]; e = indices[2 * j + 1];
+                        if (!substringStartsWith(data, s, e, 'HETNAM')) break;
+                        j++;
+                    }
+                    heteroNames.push(...Array.from(parseHetnam(lines, i, j).entries()))
+                    i = j - 1;
                 }
-                // TODO: HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
                 break;
             case 'M':
                 if (substringStartsWith(data, s, e, 'MODEL ')) {
@@ -274,9 +284,24 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
         }
     }
 
+    // build entity and chem_comp categories
+    const seqIds = Column.ofIntTokens(atomSite.auth_seq_id)
+    const atomIds = Column.ofStringTokens(atomSite.auth_atom_id)
+    const compIds = Column.ofStringTokens(atomSite.auth_comp_id)
+    const asymIds = Column.ofStringTokens(atomSite.auth_asym_id)
+    const componentBuilder = new ComponentBuilder(seqIds, atomIds)
+    componentBuilder.setNames(heteroNames)
+    entityBuilder.setNames(heteroNames)
+    for (let i = 0, il = compIds.rowCount; i < il; ++i) {
+        const compId = compIds.value(i)
+        const moleculeType = getMoleculeType(componentBuilder.add(compId, i).type, compId)
+        atomSite.label_entity_id[i] = entityBuilder.getEntityId(compId, moleculeType, asymIds.value(i))
+    }
+
     const categories = {
         entity: entityBuilder.getEntityCategory(),
-        atom_site: CifCategory.ofFields('atom_site', _atom_site(atom_site))
+        chem_comp: componentBuilder.getChemCompCategory(),
+        atom_site: CifCategory.ofFields('atom_site', getAomSite(atomSite))
     } as any;
 
     for (const c of helperCategories) {