Browse Source

Merge pull request #288 from jpattle/allow-v3-sdf

Added the ability to handle v3000 sd files
David Sehnal 3 years ago
parent
commit
4318c89bdb

+ 128 - 0
src/mol-io/reader/_spec/sdf.spec.ts

@@ -332,6 +332,80 @@ M  END
 $$$$
 $$$$
 `;
 `;
 
 
+/**
+ * SD file fixture made of two V3000 records.
+ * The first declares 13 atoms and 14 bonds and carries two data items
+ * (`<Comment>` and `<source>`); the second declares 6 atoms and 5 bonds
+ * and has no data items.
+ */
+const V3000SdfString = `FYI-001
+FYICenter.com
+123456789012345678901234567890123456789012345678901234567890
+ 0  0  0     0  0            999 V3000
+M  V30 BEGIN CTAB
+M  V30 COUNTS 13 14 0 0 0
+M  V30 BEGIN ATOM
+M  V30 1 N 0.84 -0.16 0 0
+M  V30 2 N 1.48 0.43 0 0
+M  V30 3 N 0.09 0.27 0 0
+M  V30 4 C 1.11 1.21 0 0
+M  V30 5 C 0.27 1.12 0 0
+M  V30 6 C 0.84 -1.03 0 0
+M  V30 7 C 1.53 1.99 0 0
+M  V30 8 Cl 1.07 2.74 0.01 0
+M  V30 9 C 1.59 -1.46 0 0
+M  V30 10 C 0.08 -1.46 0 0
+M  V30 11 C 1.59 -2.33 0 0
+M  V30 12 C 0.07 -2.32 0 0
+M  V30 13 C 0.84 -2.76 0 0
+M  V30 END ATOM
+M  V30 BEGIN BOND
+M  V30 1 1 2 1
+M  V30 2 1 3 1
+M  V30 3 1 6 1
+M  V30 4 2 4 2
+M  V30 5 2 5 3
+M  V30 6 1 7 4
+M  V30 7 1 4 5
+M  V30 8 1 9 6
+M  V30 9 2 10 6
+M  V30 10 1 8 7
+M  V30 11 2 11 9
+M  V30 12 1 12 10
+M  V30 13 1 13 11
+M  V30 14 2 13 12
+M  V30 END BOND
+M  V30 END CTAB
+M  END
+> <Comment>
+This is an SDF example.
+With a multi-line comment.
+
+> <source>
+This was retrieved from biotech.fyicenter.com
+
+$$$$
+L-Alanine
+GSMACCS-II07189510252D 1 0.00366 0.00000 0
+Figure 1, J. Chem. Inf. Comput. Sci., Vol 32, No. 3., 1992
+ 0 0 0 0 0 999 V3000
+M V30 BEGIN CTAB
+M V30 COUNTS 6 5 0 0 1
+M V30 BEGIN ATOM
+M V30 1 C -0.6622 0.5342 0 0 CFG=2
+M V30 2 C 0.6622 -0.3 0 0
+M V30 3 C -0.7207 2.0817 0 0 MASS=13
+M V30 4 N -1.8622 -0.3695 0 0 CHG=1
+M V30 5 O 0.622 -1.8037 0 0
+M V30 6 O 1.9464 0.4244 0 0 CHG=-1
+M V30 END ATOM
+M V30 BEGIN BOND
+M V30 1 1 1 2
+M V30 2 1 1 3 CFG=1
+M V30 3 1 1 4
+M V30 4 2 2 5
+M V30 5 1 2 6
+M V30 END BOND
+M V30 END CTAB
+M END
+
+$$$$
+`;
+
 describe('sdf reader', () => {
 describe('sdf reader', () => {
     it('basic', async () => {
     it('basic', async () => {
         const parsed = await parseSdf(SdfString).run();
         const parsed = await parseSdf(SdfString).run();
@@ -383,4 +457,58 @@ describe('sdf reader', () => {
         expect(compound3.dataItems.dataHeader.value(21)).toBe('<PUBCHEM_COORDINATE_TYPE>');
         expect(compound3.dataItems.dataHeader.value(21)).toBe('<PUBCHEM_COORDINATE_TYPE>');
         expect(compound3.dataItems.data.value(21)).toBe('2\n5\n10');
         expect(compound3.dataItems.data.value(21)).toBe('2\n5\n10');
     });
     });
+
+    it('v3000', async () => {
+        // A reader error is rethrown so the test fails with the reader's own message.
+        const parsed = await parseSdf(V3000SdfString).run();
+        if (parsed.isError) {
+            throw new Error(parsed.message);
+        }
+
+        const { compounds } = parsed.result;
+        expect(compounds.length).toBe(2);
+
+        // First record: 13 atoms, 14 bonds and two data items.
+        const { molFile: firstMol, dataItems: firstItems } = compounds[0];
+        const { atoms: firstAtoms, bonds: firstBonds } = firstMol;
+
+        expect(firstAtoms.count).toBe(13);
+        expect(firstAtoms.x.rowCount).toBe(13);
+        expect(firstAtoms.y.rowCount).toBe(13);
+        expect(firstAtoms.z.rowCount).toBe(13);
+        expect(firstAtoms.type_symbol.rowCount).toBe(13);
+        expect(firstBonds.count).toBe(14);
+        expect(firstBonds.atomIdxA.rowCount).toBe(14);
+        expect(firstBonds.atomIdxB.rowCount).toBe(14);
+        expect(firstBonds.order.rowCount).toBe(14);
+
+        // Row 7 is the chlorine atom (record index 8 in the fixture).
+        const chlorineRow = 7;
+        expect(firstAtoms.x.value(chlorineRow)).toBe(1.07);
+        expect(firstAtoms.y.value(chlorineRow)).toBe(2.74);
+        expect(firstAtoms.z.value(chlorineRow)).toBe(0.01);
+        expect(firstAtoms.type_symbol.value(chlorineRow)).toBe('Cl');
+
+        // Row 10 is bond record 11: order 2 between atoms 11 and 9.
+        const doubleBondRow = 10;
+        expect(firstBonds.atomIdxA.value(doubleBondRow)).toBe(11);
+        expect(firstBonds.atomIdxB.value(doubleBondRow)).toBe(9);
+        expect(firstBonds.order.value(doubleBondRow)).toBe(2);
+
+        expect(firstItems.dataHeader.rowCount).toBe(2);
+        expect(firstItems.data.rowCount).toBe(2);
+
+        expect(firstItems.dataHeader.value(0)).toBe('<Comment>');
+        expect(firstItems.data.value(0)).toBe(`This is an SDF example.\nWith a multi-line comment.`);
+
+        expect(firstItems.dataHeader.value(1)).toBe('<source>');
+        expect(firstItems.data.value(1)).toBe('This was retrieved from biotech.fyicenter.com');
+
+        // Second record: 6 atoms, 5 bonds, no data items.
+        const { molFile: secondMol, dataItems: secondItems } = compounds[1];
+        const { atoms: secondAtoms, bonds: secondBonds } = secondMol;
+
+        expect(secondAtoms.count).toBe(6);
+        expect(secondBonds.count).toBe(5);
+
+        // Row 4 is the first oxygen atom (record index 5 in the fixture).
+        const oxygenRow = 4;
+        expect(secondAtoms.x.value(oxygenRow)).toBe(0.622);
+        expect(secondAtoms.y.value(oxygenRow)).toBe(-1.8037);
+        expect(secondAtoms.z.value(oxygenRow)).toBe(0);
+        expect(secondAtoms.type_symbol.value(oxygenRow)).toBe('O');
+
+        // Row 1 is bond record 2: order 1 between atoms 1 and 3.
+        const singleBondRow = 1;
+        expect(secondBonds.atomIdxA.value(singleBondRow)).toBe(1);
+        expect(secondBonds.atomIdxB.value(singleBondRow)).toBe(3);
+        expect(secondBonds.order.value(singleBondRow)).toBe(1);
+
+        expect(secondItems.dataHeader.rowCount).toBe(0);
+        expect(secondItems.data.rowCount).toBe(0);
+    });
 });
 });

+ 109 - 0
src/mol-io/reader/sdf/parser-v3-util.ts

@@ -0,0 +1,109 @@
+import { Column } from '../../../mol-data/db';
+import { MolFile } from '../mol/parser';
+import { Tokenizer, TokenBuilder, Tokens } from '../common/text/tokenizer';
+import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
+
+/**
+ * Tells whether a mol header counts/version line marks a V3000 record.
+ *
+ * @param versionLine The raw counts/version line of the mol header.
+ * @returns `true` when the line, ignoring surrounding whitespace, ends with `V3000`.
+ */
+export function isV3(versionLine: string): boolean {
+    const VERSION_TAG = 'V3000';
+    const normalized = versionLine.trim();
+    return normalized.endsWith(VERSION_TAG);
+}
+
+/**
+ * Reads the atom and bond counts from the COUNTS line of a V3000 connection table.
+ *
+ * The line at the current position (labelled BEGIN CTAB below) is skipped, then the
+ * `M`, `V30` and `COUNTS` fields of the following line are discarded and the next
+ * two fields are read as the atom and bond counts. The rest of the COUNTS line is
+ * consumed before returning.
+ *
+ * @param tokenizer Tokenizer over the SD file text; its position is advanced past
+ *     the COUNTS line.
+ * @returns The atom and bond counts, read through the int column schema.
+ */
+export function handleCountsV3(
+    tokenizer: Tokenizer
+): { atomCount: number, bondCount: number } {
+    // Each builder records exactly one token. Capacity is sized the same way as in
+    // the atom and bond handlers (`count * 2`), presumably one start and one end
+    // index per token.
+    const atomCount = TokenBuilder.create(tokenizer.data, 2);
+    const bondCount = TokenBuilder.create(tokenizer.data, 2);
+
+    Tokenizer.eatLine(tokenizer); // BEGIN CTAB
+    skipSingleValue(tokenizer); // M
+    skipSingleValue(tokenizer); // V30
+    skipSingleValue(tokenizer); // COUNTS
+
+    addSingleValue(tokenizer, atomCount);
+    addSingleValue(tokenizer, bondCount);
+    Tokenizer.eatLine(tokenizer); // remainder of the COUNTS line
+
+    return {
+        atomCount: TokenColumn(atomCount)(Column.Schema.int).value(0),
+        bondCount: TokenColumn(bondCount)(Column.Schema.int).value(0)
+    };
+}
+
+/**
+ * Reads the atom records of a V3000 ATOM block into token-backed columns.
+ *
+ * For every record the leading `M`, `V30` and index fields are skipped and the
+ * next four fields are stored as element symbol, x, y and z. Fields after z on
+ * the same line are not read.
+ *
+ * @param tokenizer Tokenizer over the SD file text. NOTE(review): appears to expect
+ *     the position at the start of the BEGIN ATOM line — confirm against the caller.
+ * @param atomCount Number of atom records to read.
+ * @returns The atom count together with x/y/z (float) and type_symbol (string) columns.
+ */
+export function handleAtomsV3(
+    tokenizer: Tokenizer,
+    atomCount: number
+): MolFile['atoms'] {
+    // One token per atom and per column; presumably two index slots (start, end) per token.
+    const x = TokenBuilder.create(tokenizer.data, atomCount * 2);
+    const y = TokenBuilder.create(tokenizer.data, atomCount * 2);
+    const z = TokenBuilder.create(tokenizer.data, atomCount * 2);
+    const type_symbol = TokenBuilder.create(tokenizer.data, atomCount * 2);
+
+    for (let i = 0; i < atomCount; ++i) {
+        // NOTE(review): markLine is relied on to move past the rest of the current line
+        // (BEGIN ATOM on the first pass, the previous atom record afterwards) — confirm
+        // against Tokenizer.markLine.
+        Tokenizer.markLine(tokenizer);
+        skipSingleValue(tokenizer); // M
+        skipSingleValue(tokenizer); // V30
+        skipSingleValue(tokenizer); // Index
+
+        // Remember where the data fields start so the cursor can be rewound below.
+        const { position } = tokenizer;
+        addSingleValue(tokenizer, type_symbol);
+        addSingleValue(tokenizer, x);
+        addSingleValue(tokenizer, y);
+        addSingleValue(tokenizer, z);
+        // Rewind to just after the index field; the cursor therefore stays on this
+        // record's line until the next markLine (or the eatLine after the loop).
+        tokenizer.position = position;
+    }
+    Tokenizer.eatLine(tokenizer); // Previous Line
+    Tokenizer.eatLine(tokenizer); // END ATOM
+
+    return {
+        count: atomCount,
+        x: TokenColumn(x)(Column.Schema.float),
+        y: TokenColumn(y)(Column.Schema.float),
+        z: TokenColumn(z)(Column.Schema.float),
+        type_symbol: TokenColumn(type_symbol)(Column.Schema.str),
+    };
+}
+
+/**
+ * Reads the bond records of a V3000 BOND block into token-backed columns.
+ *
+ * For every record the leading `M`, `V30` and index fields are skipped and the
+ * next three fields are stored as bond order, first atom index and second atom
+ * index. Fields after the second atom index on the same line are not read.
+ *
+ * @param tokenizer Tokenizer over the SD file text. NOTE(review): appears to expect
+ *     the position at the start of the BEGIN BOND line — confirm against the caller.
+ * @param bondCount Number of bond records to read.
+ * @returns The bond count together with atomIdxA, atomIdxB and order columns,
+ *     all exposed through the int schema.
+ */
+export function handleBondsV3(
+    tokenizer: Tokenizer,
+    bondCount: number
+): MolFile['bonds'] {
+    // One token per bond and per column; presumably two index slots (start, end) per token.
+    const atomIdxA = TokenBuilder.create(tokenizer.data, bondCount * 2);
+    const atomIdxB = TokenBuilder.create(tokenizer.data, bondCount * 2);
+    const order = TokenBuilder.create(tokenizer.data, bondCount * 2);
+
+    for (let i = 0; i < bondCount; ++i) {
+        // NOTE(review): markLine is relied on to move past the rest of the current line
+        // (BEGIN BOND on the first pass, the previous bond record afterwards) — confirm
+        // against Tokenizer.markLine.
+        Tokenizer.markLine(tokenizer);
+        skipSingleValue(tokenizer); // M
+        skipSingleValue(tokenizer); // V30
+        skipSingleValue(tokenizer); // Index
+
+        // Remember where the data fields start so the cursor can be rewound below.
+        const { position } = tokenizer;
+        addSingleValue(tokenizer, order);
+        addSingleValue(tokenizer, atomIdxA);
+        addSingleValue(tokenizer, atomIdxB);
+        // Rewind to just after the index field; the cursor stays on this record's line.
+        tokenizer.position = position;
+    }
+    Tokenizer.eatLine(tokenizer); // Previous Line
+    Tokenizer.eatLine(tokenizer); // END BOND
+
+    // Atom indices and bond order are integer fields, so they are exposed with the
+    // int schema rather than as floats.
+    return {
+        count: bondCount,
+        atomIdxA: TokenColumn(atomIdxA)(Column.Schema.int),
+        atomIdxB: TokenColumn(atomIdxB)(Column.Schema.int),
+        order: TokenColumn(order)(Column.Schema.int),
+    };
+}
+
+/**
+ * Advances the tokenizer over one value without recording it: calls
+ * `Tokenizer.skipWhitespace` and then `Tokenizer.eatValue`.
+ *
+ * @param tokenizer Tokenizer whose position is advanced.
+ */
+function skipSingleValue(tokenizer: Tokenizer): void {
+    Tokenizer.skipWhitespace(tokenizer);
+    Tokenizer.eatValue(tokenizer);
+}
+
+/**
+ * Consumes the next value from the tokenizer and appends it to `tokens`.
+ *
+ * The span from the position before the call to the position after the value is
+ * passed to `Tokenizer.trim`, and the resulting `tokenStart`/`tokenEnd` pair is
+ * what gets recorded.
+ *
+ * @param tokenizer Tokenizer whose position is advanced past the value.
+ * @param tokens Token builder receiving the value; added unchecked, so the caller
+ *     must have created it with enough capacity.
+ */
+function addSingleValue(tokenizer: Tokenizer, tokens: Tokens) {
+    const spanStart = tokenizer.position;
+    // Same skip-whitespace-then-eat-value step as when a field is discarded.
+    skipSingleValue(tokenizer);
+    const spanEnd = tokenizer.position;
+    Tokenizer.trim(tokenizer, spanStart, spanEnd);
+    TokenBuilder.addUnchecked(tokens, tokenizer.tokenStart, tokenizer.tokenEnd);
+}

+ 14 - 5
src/mol-io/reader/sdf/parser.ts

@@ -11,8 +11,9 @@ import { Task } from '../../../mol-task';
 import { ReaderResult as Result } from '../result';
 import { ReaderResult as Result } from '../result';
 import { Tokenizer, TokenBuilder } from '../common/text/tokenizer';
 import { Tokenizer, TokenBuilder } from '../common/text/tokenizer';
 import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
 import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
+import { handleAtomsV3, handleBondsV3, handleCountsV3, isV3 } from './parser-v3-util';
 
 
-/** http://c4.cabrillo.edu/404/ctfile.pdf - page 41 */
+/** http://c4.cabrillo.edu/404/ctfile.pdf - page 41 & 79 */
 
 
 export interface SdfFileCompound {
 export interface SdfFileCompound {
     readonly molFile: MolFile,
     readonly molFile: MolFile,
@@ -66,14 +67,22 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
     };
     };
 }
 }
 
 
+/**
+ * Parses the atom and bond counts from a V2000 counts line.
+ *
+ * The counts are taken from two fixed-width, three-character fields at the start
+ * of the line: characters 0-2 for the atom count and characters 3-5 for the bond count.
+ *
+ * @param countsAndVersion The raw counts/version line of the mol header.
+ * @returns The two counts; a field that is not numeric yields `NaN`, and an
+ *     empty or blank field yields `0`.
+ */
+function handleCountsV2(countsAndVersion: string): { atomCount: number, bondCount: number } {
+    // `slice` replaces the deprecated `String.prototype.substr`; for these
+    // non-negative offsets both select the same characters.
+    return {
+        atomCount: +countsAndVersion.slice(0, 3),
+        bondCount: +countsAndVersion.slice(3, 6)
+    };
+}
+
 function handleMolFile(tokenizer: Tokenizer) {
 function handleMolFile(tokenizer: Tokenizer) {
     const title = Tokenizer.readLine(tokenizer).trim();
     const title = Tokenizer.readLine(tokenizer).trim();
     const program = Tokenizer.readLine(tokenizer).trim();
     const program = Tokenizer.readLine(tokenizer).trim();
     const comment = Tokenizer.readLine(tokenizer).trim();
     const comment = Tokenizer.readLine(tokenizer).trim();
 
 
-    const counts = Tokenizer.readLine(tokenizer);
+    const countsAndVersion = Tokenizer.readLine(tokenizer);
+    const molIsV3 = isV3(countsAndVersion);
 
 
-    const atomCount = +counts.substr(0, 3), bondCount = +counts.substr(3, 3);
+    const { atomCount, bondCount } = molIsV3 ? handleCountsV3(tokenizer) : handleCountsV2(countsAndVersion);
 
 
     if (Number.isNaN(atomCount) || Number.isNaN(bondCount)) {
     if (Number.isNaN(atomCount) || Number.isNaN(bondCount)) {
         // try to skip to next molecule
         // try to skip to next molecule
@@ -84,8 +93,8 @@ function handleMolFile(tokenizer: Tokenizer) {
         return;
         return;
     }
     }
 
 
-    const atoms = handleAtoms(tokenizer, atomCount);
-    const bonds = handleBonds(tokenizer, bondCount);
+    const atoms = molIsV3 ? handleAtomsV3(tokenizer, atomCount) : handleAtoms(tokenizer, atomCount);
+    const bonds = molIsV3 ? handleBondsV3(tokenizer, bondCount) : handleBonds(tokenizer, bondCount);
     const dataItems = handleDataItems(tokenizer);
     const dataItems = handleDataItems(tokenizer);
 
 
     return {
     return {