Browse Source

better guess-element-symbol for pdb-to-cif

Alexander Rose 6 years ago
parent
commit
a4b1cdef16

+ 26 - 0
src/mol-model-formats/structure/_spec/pdb.spec.ts

@@ -0,0 +1,26 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { guessElementSymbol } from '../pdb/to-cif';
+import { TokenBuilder } from 'mol-io/reader/common/text/tokenizer';
+
+const records = [ // [PDB ATOM record, element symbol guessed from characters 12..15] pairs
+    ['ATOM     19 HD23 LEU A   1     151.940 143.340 155.670  0.00  0.00', 'H'], // "HD23": digits trimmed, "HD" is not a listed two-letter symbol, first letter H is used
+    ['ATOM     38  CA  SER A   3     146.430 138.150 162.270  0.00  0.00', 'C'], // " CA ": "CA" is not a listed two-letter symbol, first letter C is used
+    ['ATOM     38 NA   SER A   3     146.430 138.150 162.270  0.00  0.00', 'NA'], // "NA  ": exactly two letters matching the listed symbol NA
+    ['ATOM     38  NAA SER A   3     146.430 138.150 162.270  0.00  0.00', 'N'], // " NAA": three letters, so only the first letter N is used
+]
+
+describe('PDB to-cif', () => {
+    it('guess-element-symbol', () => { // every fixture record must yield its expected symbol
+        for (let i = 0, il = records.length; i < il; ++i) {
+            const [ data, element ] = records[i] // data: record text, element: expected symbol
+            const tokens = TokenBuilder.create(data, 2) // fresh token store per record; 2 is presumably a size hint - TODO confirm
+            guessElementSymbol(tokens, data, 12, 16) // guess from characters 12..15 (end index 16 is exclusive)
+            expect(data.substring(tokens.indices[0], tokens.indices[1])).toBe(element) // first stored (start, end) pair delimits the guess within data
+        }
+    });
+});

+ 40 - 4
src/mol-model-formats/structure/pdb/to-cif.ts

@@ -8,7 +8,7 @@
 import { substringStartsWith } from 'mol-util/string';
 import { CifField, CifCategory, CifFrame } from 'mol-io/reader/cif';
 import { mmCIF_Schema } from 'mol-io/reader/cif/schema/mmcif';
-import { TokenBuilder, Tokenizer } from 'mol-io/reader/common/text/tokenizer';
+import { TokenBuilder, Tokenizer, Tokens } from 'mol-io/reader/common/text/tokenizer';
 import { PdbFile } from 'mol-io/reader/pdb/schema';
 import { parseCryst1, parseRemark350, parseMtrix } from './assembly';
 import { WaterNames } from 'mol-model/structure/model/types';
@@ -89,6 +89,43 @@ function getEntityId(residueName: string, isHet: boolean) {
     return '1';
 }
 
+export function guessElementSymbol(tokens: Tokens, str: string, start: number, end: number) { // adds exactly one token to `tokens`: the symbol guessed from str[start, end), or an empty token
+    let s = start, e = end - 1
+    // s and e are inclusive bounds while trimming; e becomes exclusive again below
+    // trim spaces (char code 32) and digits (char codes 48-57) from both ends
+    let c = str.charCodeAt(s)
+    while ((c === 32 || (c >= 48 && c <= 57)) && s <= e) c = str.charCodeAt(++s)
+    c = str.charCodeAt(e)
+    while ((c === 32 || (c >= 48 && c <= 57)) && e >= s) c = str.charCodeAt(--e)
+    // turn e back into an exclusive end index
+    ++e
+    // zero or one character left after trimming: use the trimmed range as it is
+    if (s === e) return TokenBuilder.add(tokens, s, e) // empty
+    if (s + 1 === e) return TokenBuilder.add(tokens, s, e) // one char
+    // first remaining character; the checks below accept upper and lower case
+    c = str.charCodeAt(s)
+    // exactly two characters left: keep both only for the two-letter symbols listed here
+    if (s + 2 === e) { // two chars
+        const c2 = str.charCodeAt(s + 1)
+        if (
+            ((c === 78 || c === 110) && (c2 === 65 || c2 ===  97)) || // NA na Na nA
+            ((c === 67 || c ===  99) && (c2 === 76 || c2 === 108)) || // CL cl Cl cL
+            ((c === 70 || c === 102) && (c2 === 69 || c2 === 101))    // FE fe Fe fE
+        ) return TokenBuilder.add(tokens, s, s + 2)
+    }
+    // otherwise fall back to the first character alone when it is one of C, H, N, O, P, S
+    if (
+        c === 67 || c ===  99 || // C c
+        c === 72 || c === 104 || // H h
+        c === 78 || c === 110 || // N n
+        c === 79 || c === 111 || // O o
+        c === 80 || c === 112 || // P p
+        c === 83 || c === 115    // S s
+    ) return TokenBuilder.add(tokens, s, s + 1)
+
+    TokenBuilder.add(tokens, s, s) // no reasonable guess, add empty token
+}
+
 function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
     const { data: str } = data;
     const length = e - s;
@@ -162,11 +199,10 @@ function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: num
         if (data.tokenStart < data.tokenEnd) {
             TokenBuilder.addToken(sites.type_symbol, data);
         } else {
-            // "guess" the symbol
-            TokenBuilder.add(sites.type_symbol, s + 12, s + 13);
+            guessElementSymbol(sites.type_symbol, str, s + 12, s + 16)
         }
     } else {
-        TokenBuilder.add(sites.type_symbol, s + 12, s + 13);
+        guessElementSymbol(sites.type_symbol, str, s + 12, s + 16)
     }
 
     sites.label_entity_id[sites.index] = getEntityId(residueName, isHet);

+ 1 - 1
src/mol-model/structure/model/properties/utils/guess-element.ts

@@ -12,7 +12,7 @@ function charAtIsNumber(str: string, index: number) {
     return code >= 48 && code <= 57
 }
 
-export function guessElement (str: string) {
+export function guessElement(str: string) {
     let at = str.trim().toUpperCase()
 
     if (charAtIsNumber(at, 0)) at = at.substr(1)