Ver Fonte

sdf def, reader, spec

JonStargaryen há 4 anos atrás
pai
commit
4f4245b895

+ 149 - 0
src/mol-io/reader/_spec/sdf.spec.ts

@@ -0,0 +1,149 @@
+
+import { parseSdf } from '../sdf/parser';
+/** Test fixture: one SDF record (counts line declares 5 atoms and 4 bonds; 32 data items follow the mol block) closed by the record delimiter. */
+const SdfString = `
+  Mrv1718007121815122D          
+
+  5  4  0  0  0  0            999 V2000
+    0.0000    0.8250    0.0000 O   0  5  0  0  0  0  0  0  0  0  0  0
+   -0.8250    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.0000   -0.8250    0.0000 O   0  5  0  0  0  0  0  0  0  0  0  0
+    0.0000    0.0000    0.0000 P   0  0  0  0  0  0  0  0  0  0  0  0
+    0.8250    0.0000    0.0000 O   0  5  0  0  0  0  0  0  0  0  0  0
+  4  1  1  0  0  0  0
+  4  2  2  0  0  0  0
+  4  3  1  0  0  0  0
+  4  5  1  0  0  0  0
+M  CHG  3   1  -1   3  -1   5  -1
+M  END
+> <DATABASE_ID>
+DB14523
+
+> <DATABASE_NAME>
+drugbank
+
+> <SMILES>
+[O-]P([O-])([O-])=O
+
+> <INCHI_IDENTIFIER>
+InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3
+
+> <INCHI_KEY>
+NBIIXXVUZAFLBC-UHFFFAOYSA-K
+
+> <FORMULA>
+O4P
+
+> <MOLECULAR_WEIGHT>
+94.9714
+
+> <EXACT_MASS>
+94.95342
+
+> <JCHEM_ACCEPTOR_COUNT>
+4
+
+> <JCHEM_ATOM_COUNT>
+5
+
+> <JCHEM_AVERAGE_POLARIZABILITY>
+4.932162910070488
+
+> <JCHEM_BIOAVAILABILITY>
+1
+
+> <JCHEM_DONOR_COUNT>
+0
+
+> <JCHEM_FORMAL_CHARGE>
+-3
+
+> <JCHEM_GHOSE_FILTER>
+0
+
+> <JCHEM_IUPAC>
+phosphate
+
+> <JCHEM_LOGP>
+-1.0201038226666665
+
+> <JCHEM_MDDR_LIKE_RULE>
+0
+
+> <JCHEM_NUMBER_OF_RINGS>
+0
+
+> <JCHEM_PHYSIOLOGICAL_CHARGE>
+-2
+
+> <JCHEM_PKA>
+6.951626889535468
+
+> <JCHEM_PKA_STRONGEST_ACIDIC>
+1.7961261340181292
+
+> <JCHEM_POLAR_SURFACE_AREA>
+86.25
+
+> <JCHEM_REFRACTIVITY>
+11.2868
+
+> <JCHEM_ROTATABLE_BOND_COUNT>
+0
+
+> <JCHEM_RULE_OF_FIVE>
+1
+
+> <JCHEM_TRADITIONAL_IUPAC>
+phosphate
+
+> <JCHEM_VEBER_RULE>
+0
+
+> <DRUGBANK_ID>
+DB14523
+
+> <DRUG_GROUPS>
+experimental
+
+> <GENERIC_NAME>
+Phosphate ion
+
+> <SYNONYMS>
+Orthophosphate; Phosphate
+
+$$$$`;
+
+/**
+ * Parses the fixture record and spot-checks the first atom, the last bond
+ * and the first, second and last data item of the first compound.
+ */
+describe('sdf reader', () => {
+    it('basic', async () => {
+        const parsed = await parseSdf(SdfString).run();
+        if (parsed.isError) {
+            throw new Error(parsed.message);
+        }
+        const compound = parsed.result.compounds[0];
+        const { molFile, dataItems } = compound;
+        const { atoms, bonds } = molFile;
+
+        // number of atoms and bonds declared in the counts line
+        expect(atoms.count).toBe(5);
+        expect(bonds.count).toBe(4);
+
+        // the second argument of toBeCloseTo is the number of decimal digits
+        // to compare, not an absolute tolerance
+        expect(atoms.x.value(0)).toBeCloseTo(0, 3);
+        expect(atoms.y.value(0)).toBeCloseTo(0.8250, 4);
+        expect(atoms.z.value(0)).toBeCloseTo(0, 4);
+        expect(atoms.type_symbol.value(0)).toBe('O');
+
+        // last bond line of the fixture: '  4  5  1  0  0  0  0'
+        expect(bonds.atomIdxA.value(3)).toBe(4);
+        expect(bonds.atomIdxB.value(3)).toBe(5);
+        expect(bonds.order.value(3)).toBe(1);
+
+        expect(dataItems.dataHeader.value(0)).toBe('DATABASE_ID');
+        expect(dataItems.data.value(0)).toBe('DB14523');
+
+        expect(dataItems.dataHeader.value(1)).toBe('DATABASE_NAME');
+        expect(dataItems.data.value(1)).toBe('drugbank');
+
+        // the fixture holds 32 data items, so index 31 is the last one
+        expect(dataItems.dataHeader.value(31)).toBe('SYNONYMS');
+        expect(dataItems.data.value(31)).toBe('Orthophosphate; Phosphate');
+    });
+});

+ 2 - 2
src/mol-io/reader/mol/parser.ts

@@ -30,7 +30,7 @@ export interface MolFile {
     }
 }
 
-function handleAtoms(tokenizer: Tokenizer, count: number): MolFile['atoms'] {
+export function handleAtoms(tokenizer: Tokenizer, count: number): MolFile['atoms'] {
     const x = TokenBuilder.create(tokenizer.data, count * 2);
     const y = TokenBuilder.create(tokenizer.data, count * 2);
     const z = TokenBuilder.create(tokenizer.data, count * 2);
@@ -59,7 +59,7 @@ function handleAtoms(tokenizer: Tokenizer, count: number): MolFile['atoms'] {
     };
 }
 
-function handleBonds(tokenizer: Tokenizer, count: number): MolFile['bonds'] {
+export function handleBonds(tokenizer: Tokenizer, count: number): MolFile['bonds'] {
     const atomIdxA = TokenBuilder.create(tokenizer.data, count * 2);
     const atomIdxB = TokenBuilder.create(tokenizer.data, count * 2);
     const order = TokenBuilder.create(tokenizer.data, count * 2);

+ 85 - 0
src/mol-io/reader/sdf/parser.ts

@@ -0,0 +1,85 @@
+/**
+ * Copyright (c) 2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Sebastian Bittrich <sebastian.bittrich@rcsb.org>
+ */
+
+import { Column } from '../../../mol-data/db';
+import { MolFile, handleAtoms, handleBonds } from '../mol/parser';
+import { Task } from '../../../mol-task';
+import { ReaderResult as Result } from '../result';
+import { Tokenizer, TokenBuilder } from '../common/text/tokenizer';
+import TokenColumn from '../common/text/column/token';
+
+/**
+ * Parsed content of an SDF file: one entry per compound record.
+ *
+ * Format reference: http://c4.cabrillo.edu/404/ctfile.pdf - page 41
+ */
+export interface SdfFile {
+    readonly compounds: Array<{
+        /** title, program, comment, atoms and bonds of the record */
+        readonly molFile: MolFile;
+        /** data items read after the bond block of the record */
+        readonly dataItems: DataItems;
+    }>;
+}
+
+/** Columns holding the data items of a single record. */
+interface DataItems {
+    /** field names taken from the `> <NAME>` header lines */
+    readonly dataHeader: Column<string>;
+    /** values read from the lines that follow the headers */
+    readonly data: Column<string>;
+}
+
+/**
+ * Reads the data items that follow the bond block of a record.
+ *
+ * A data item starts with a header line that begins with `>` and carries the
+ * field name in angle brackets (`> <NAME>`, `>  <NAME>  (1)`, ...). It is
+ * followed by zero or more value lines and ends at the first blank line, at
+ * the next header line or at the end of the input. Lines that do not belong
+ * to a data item (e.g. the `M  ...` property lines) are skipped.
+ *
+ * Every header contributes exactly one entry to `data`, so row `i` of
+ * `dataHeader` and row `i` of `data` always describe the same item: a header
+ * without a value yields an empty token and a value spanning several lines
+ * yields one token covering all of them (line breaks included).
+ *
+ * @param tokenizer tokenizer positioned behind the bond block of the record
+ * @returns the field names and values as string columns
+ */
+function handleDataItems(tokenizer: Tokenizer): DataItems {
+    const dataHeader = TokenBuilder.create(tokenizer.data, 32);
+    const data = TokenBuilder.create(tokenizer.data, 32);
+
+    // state of the data item whose header was read last
+    let itemOpen = false;
+    let hasValue = false;
+    let valueStart = 0;
+    let valueEnd = 0;
+
+    // emits the value token of the open item, which keeps both columns the same length
+    const closeItem = () => {
+        if (!itemOpen) return;
+        TokenBuilder.add(data, valueStart, valueEnd);
+        itemOpen = false;
+    };
+
+    while (tokenizer.position < tokenizer.length) {
+        const line = Tokenizer.readLine(tokenizer);
+        const { tokenStart, tokenEnd } = tokenizer;
+
+        if (!line) {
+            // a blank line terminates the current data item
+            closeItem();
+            continue;
+        }
+
+        const nameOpen = line.startsWith('>') ? line.indexOf('<') : -1;
+        if (nameOpen >= 0) {
+            // a new header also terminates an item that lacks its blank line
+            closeItem();
+            const nameClose = line.indexOf('>', nameOpen + 1);
+            TokenBuilder.add(
+                dataHeader,
+                tokenStart + nameOpen + 1,
+                nameClose < 0 ? tokenEnd : tokenStart + nameClose
+            );
+            itemOpen = true;
+            hasValue = false;
+            // empty span, used as is when no value line follows the header
+            valueStart = tokenEnd;
+            valueEnd = tokenEnd;
+        } else if (itemOpen) {
+            if (!hasValue) {
+                valueStart = tokenStart;
+                hasValue = true;
+            }
+            // extend the value over every consecutive non-blank line
+            valueEnd = tokenEnd;
+        }
+    }
+    // the last item may end at the end of the input without a blank line
+    closeItem();
+
+    return {
+        dataHeader: TokenColumn(dataHeader)(Column.Schema.str),
+        data: TokenColumn(data)(Column.Schema.str)
+    };
+}
+
+/**
+ * Parses a single record: three header lines (title, program, comment),
+ * the counts line, the atom block, the bond block and the data items.
+ *
+ * @param data text of one record
+ * @returns the mol file part of the record together with its data items
+ */
+function handleMolFile(data: string) {
+    const tokenizer = Tokenizer(data);
+
+    // the header lines are consumed strictly in this order
+    const nextHeaderLine = () => Tokenizer.readLine(tokenizer).trim();
+    const title = nextHeaderLine();
+    const program = nextHeaderLine();
+    const comment = nextHeaderLine();
+
+    // fixed-width counts line: columns 0-2 hold the atom count, columns 3-5 the bond count
+    const countsLine = Tokenizer.readLine(tokenizer);
+    const atomCount = Number(countsLine.substring(0, 3));
+    const bondCount = Number(countsLine.substring(3, 6));
+
+    const atoms = handleAtoms(tokenizer, atomCount);
+    const bonds = handleBonds(tokenizer, bondCount);
+    const dataItems = handleDataItems(tokenizer);
+
+    const molFile = { title, program, comment, atoms, bonds };
+    return { molFile, dataItems };
+}
+
+/**
+ * Matches a record delimiter: a line that starts with `$$$$`, optionally
+ * followed by blanks, including its line break. Consuming the line break
+ * matters because the next record has to start directly with its title line.
+ */
+const delimiter = /^\$\$\$\$[ \t]*(?:\r\n|\n|\r|$)/m;
+
+/**
+ * Splits the input into records at the delimiter lines and parses each one.
+ *
+ * Chunks that contain only whitespace are dropped, so the remainder after
+ * the final delimiter (and an empty input) does not produce a bogus
+ * compound.
+ *
+ * @param data complete text of the SDF file
+ * @returns a success result holding one compound per record
+ */
+function parseInternal(data: string): Result<SdfFile> {
+    const compounds = data
+        .split(delimiter)
+        .filter(record => record.trim().length > 0)
+        .map(record => handleMolFile(record));
+    const result: SdfFile = { compounds };
+    return Result.success(result);
+}
+
+/**
+ * Creates the task named 'Parse Sdf' that parses the given SDF text.
+ *
+ * @param data complete text of the SDF file
+ * @returns a task whose result is the parsed file
+ */
+export function parseSdf(data: string) {
+    const parse = async () => parseInternal(data);
+    return Task.create<Result<SdfFile>>('Parse Sdf', parse);
+}